├── .gitignore ├── .gitmodules ├── LICENSE ├── MatrixMul.pdf ├── README.md ├── README_ZH_CN.md ├── aarch64 ├── CMakeLists.txt ├── MMult0.cpp ├── MMult1.cpp ├── MMult_4x4_10.cpp ├── MMult_4x4_11.cpp ├── MMult_4x4_12.cpp ├── MMult_4x4_13.cpp ├── MMult_4x4_14.cpp ├── MMult_4x4_15.cpp ├── MMult_4x4_16.cpp ├── MMult_4x4_17.cpp ├── MMult_4x4_18.cpp ├── MMult_4x4_19.cpp ├── MMult_4x4_20.cpp ├── MMult_4x4_21.cpp ├── MMult_4x4_8.cpp ├── MMult_4x4_9.cpp ├── REF_MMult.cpp ├── compare_matrices.cpp ├── copy_matrix.cpp ├── dclock.cpp ├── figures │ ├── compare_MMult0_MMult0.png │ ├── compare_MMult0_MMult_4x4_8.png │ ├── compare_MMult_4x4_10_MMult_4x4_11.png │ ├── compare_MMult_4x4_11_MMult_4x4_12.png │ ├── compare_MMult_4x4_12_MMult_4x4_13.png │ ├── compare_MMult_4x4_12_MMult_4x4_14.png │ ├── compare_MMult_4x4_12_MMult_4x4_17.png │ ├── compare_MMult_4x4_13_MMult_4x4_14.png │ ├── compare_MMult_4x4_14_MMult_4x4_15.png │ ├── compare_MMult_4x4_8_MMult_4x4_9.png │ └── compare_MMult_4x4_9_MMult_4x4_10.png ├── gflops_benchmark │ ├── clear.sh │ ├── func1.S │ ├── func2.S │ ├── main.c │ └── make.sh ├── makefile ├── output_MMult0.m ├── output_MMult1.m ├── output_MMult_4x4_10.m ├── output_MMult_4x4_11.m ├── output_MMult_4x4_12.m ├── output_MMult_4x4_13.m ├── output_MMult_4x4_14.m ├── output_MMult_4x4_15.m ├── output_MMult_4x4_16.m ├── output_MMult_4x4_17.m ├── output_MMult_4x4_18.m ├── output_MMult_4x4_8.m ├── output_MMult_4x4_9.m ├── output_new.m ├── output_old.m ├── parameters.h ├── plot.py ├── print_matrix.cpp ├── random_matrix.cpp └── test_MMult.cpp ├── armv7 ├── MMult0.c ├── MMult1.c ├── MMult_4x4_19.c ├── MMult_4x4_20.c ├── MMult_4x4_21.c ├── MMult_4x4_8.c ├── MMult_4x4_9.c ├── PlotAll.m ├── REF_MMult.c ├── compare_matrices.c ├── copy_matrix.c ├── dclock.c ├── makefile ├── output_MMult_4x4_12.m ├── output_MMult_4x4_18.m ├── output_MMult_4x4_19.m ├── output_MMult_4x4_20.m ├── output_MMult_4x4_21.m ├── output_new.m ├── output_old.m ├── parameters.h ├── plot.py ├── print_matrix.c ├── 
proc_parameters.m ├── random_matrix.c └── test_MMult.c ├── cuda-int4 └── README.md ├── cuda ├── .gitignore ├── MMult_cuBLAS_1.cpp ├── MMult_cuBLAS_2.cpp ├── MMult_cuda_10.cu ├── MMult_cuda_11.cu ├── MMult_cuda_12.cu ├── MMult_cuda_2.cu ├── MMult_cuda_3.cu ├── MMult_cuda_4.cu ├── MMult_cuda_5.cu ├── MMult_cuda_6.cu ├── MMult_cuda_7.cu ├── MMult_cuda_8.cu ├── MMult_cuda_9.cu ├── PlotAll.m ├── REF_MMult.cpp ├── compare_matrices.cpp ├── copy_matrix.cpp ├── dclock.cpp ├── helper.h ├── makefile ├── output_MMult_cuBLAS_1.m ├── output_MMult_cuBLAS_2.m ├── output_MMult_cuda_10.m ├── output_MMult_cuda_11.m ├── output_MMult_cuda_12.m ├── output_MMult_cuda_2.m ├── output_MMult_cuda_3.m ├── output_MMult_cuda_4.m ├── output_MMult_cuda_5.m ├── output_MMult_cuda_6.m ├── output_MMult_cuda_7.m ├── output_MMult_cuda_8.m ├── output_MMult_cuda_9.m ├── output_new.m ├── output_old.m ├── parameters.h ├── plot.py ├── print_matrix.cpp ├── proc_parameters.m ├── random_matrix.cpp └── test_MMult.cpp ├── images ├── aarch64-fp32-peak-vs-int8.png └── cublas-vs-MMult_cuda_12.jpg ├── requirements.txt └── vulkan ├── .gitignore ├── MMult_vk_1.cpp ├── MMult_vk_2.comp ├── MMult_vk_2.cpp ├── MMult_vk_3.comp ├── MMult_vk_3.cpp ├── MMult_vk_naive.cpp ├── README.md ├── REF_MMult.cpp ├── Shader.hpp ├── benchmark ├── .gitignore ├── build.sh ├── gflops_fmla.cpp ├── gflops_fmla_1.comp ├── gflops_fmla_2.comp ├── gmem_bandwidth.comp ├── gmem_bandwidth.cpp ├── sampler_bandwidth.comp ├── smem_bandwidth.comp ├── smem_bandwidth.cpp ├── smem_bandwidth1.comp ├── smem_latency.cpp └── types.h ├── compare_matrices.cpp ├── copy_matrix.cpp ├── dclock.cpp ├── fmt ├── chrono.h ├── color.h ├── compile.h ├── core.h ├── format-inl.h ├── format.h ├── locale.h ├── os.h ├── ostream.h ├── posix.h ├── printf.h └── ranges.h ├── kompute └── Kompute.hpp ├── makefile ├── parameters.h ├── plot.py ├── print_matrix.cpp ├── random_matrix.cpp ├── spdlog ├── async.h ├── async_logger-inl.h ├── async_logger.h ├── cfg │ ├── argv.h │ ├── env.h │ 
├── helpers-inl.h │ └── helpers.h ├── common-inl.h ├── common.h ├── details │ ├── backtracer-inl.h │ ├── backtracer.h │ ├── circular_q.h │ ├── console_globals.h │ ├── file_helper-inl.h │ ├── file_helper.h │ ├── fmt_helper.h │ ├── log_msg-inl.h │ ├── log_msg.h │ ├── log_msg_buffer-inl.h │ ├── log_msg_buffer.h │ ├── mpmc_blocking_q.h │ ├── null_mutex.h │ ├── os-inl.h │ ├── os.h │ ├── periodic_worker-inl.h │ ├── periodic_worker.h │ ├── registry-inl.h │ ├── registry.h │ ├── synchronous_factory.h │ ├── tcp_client-windows.h │ ├── tcp_client.h │ ├── thread_pool-inl.h │ ├── thread_pool.h │ └── windows_include.h ├── fmt │ ├── bin_to_hex.h │ ├── bundled │ │ ├── LICENSE.rst │ │ ├── chrono.h │ │ ├── color.h │ │ ├── compile.h │ │ ├── core.h │ │ ├── format-inl.h │ │ ├── format.h │ │ ├── locale.h │ │ ├── os.h │ │ ├── ostream.h │ │ ├── posix.h │ │ ├── printf.h │ │ └── ranges.h │ ├── chrono.h │ ├── fmt.h │ └── ostr.h ├── formatter.h ├── fwd.h ├── logger-inl.h ├── logger.h ├── pattern_formatter-inl.h ├── pattern_formatter.h ├── sinks │ ├── android_sink.h │ ├── ansicolor_sink-inl.h │ ├── ansicolor_sink.h │ ├── base_sink-inl.h │ ├── base_sink.h │ ├── basic_file_sink-inl.h │ ├── basic_file_sink.h │ ├── daily_file_sink.h │ ├── dist_sink.h │ ├── dup_filter_sink.h │ ├── msvc_sink.h │ ├── null_sink.h │ ├── ostream_sink.h │ ├── ringbuffer_sink.h │ ├── rotating_file_sink-inl.h │ ├── rotating_file_sink.h │ ├── sink-inl.h │ ├── sink.h │ ├── stdout_color_sinks-inl.h │ ├── stdout_color_sinks.h │ ├── stdout_sinks-inl.h │ ├── stdout_sinks.h │ ├── syslog_sink.h │ ├── systemd_sink.h │ ├── tcp_sink.h │ ├── win_eventlog_sink.h │ ├── wincolor_sink-inl.h │ └── wincolor_sink.h ├── spdlog-inl.h ├── spdlog.h ├── stopwatch.h ├── tweakme.h └── version.h ├── test_MMult.cpp └── vulkan ├── vk_icd.h ├── vk_layer.h ├── vk_platform.h ├── vk_sdk_platform.h ├── vulkan.h ├── vulkan.hpp ├── vulkan_android.h ├── vulkan_beta.h ├── vulkan_core.h ├── vulkan_directfb.h ├── vulkan_fuchsia.h ├── vulkan_ggp.h ├── vulkan_ios.h 
├── vulkan_macos.h ├── vulkan_metal.h ├── vulkan_vi.h ├── vulkan_wayland.h ├── vulkan_win32.h ├── vulkan_xcb.h ├── vulkan_xlib.h └── vulkan_xlib_xrandr.h /.gitignore: -------------------------------------------------------------------------------- 1 | aarch64/build 2 | # Prerequisites 3 | *.d 4 | 5 | # Compiled Object files 6 | *.slo 7 | *.lo 8 | *.o 9 | *.obj 10 | *.x 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | 36 | # Images and Test results 37 | src/HowToOptimizeGemm/*.png 38 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "x86"] 2 | path = x86 3 | url = https://github.com/flame/how-to-optimize-gemm 4 | shallow = true 5 | [submodule "kompute"] 6 | shallow = true 7 | [submodule "OpenBLAS-0.2.20"] 8 | path = OpenBLAS-0.2.20 9 | url = https://github.com/tpoisonooo/OpenBLAS 10 | [submodule "aarch64-int8"] 11 | path = aarch64-int8 12 | url = https://github.com/tpoisonooo/chgemm 13 | [submodule "mperf"] 14 | path = mperf 15 | url = https://github.com/tpoisonooo/mperf 16 | -------------------------------------------------------------------------------- /MatrixMul.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/MatrixMul.pdf -------------------------------------------------------------------------------- /aarch64/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15.2) 2 | 3 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL 
"") 4 | 5 | project(how-to-optimize-gemm LANGUAGES C CXX ASM VERSION 0.1) 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | option(MPERF_ENABLE "build with mperf." OFF) 9 | set(CMAKE_CXX_FLAGS "-O2 -g -march=native -ftree-vectorize ${CMAKE_CXX_FLAGS}") 10 | 11 | if(MPERF_ENABLE) 12 | add_definitions(-DMPERF=1) 13 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../mperf ${CMAKE_CURRENT_BINARY_DIR}/mperf) 14 | endif() 15 | 16 | function(add_bin source_file) 17 | get_filename_component(target_name ${source_file} NAME_WE) 18 | add_executable(${target_name} ${source_file} test_MMult.cpp compare_matrices.cpp random_matrix.cpp copy_matrix.cpp dclock.cpp REF_MMult.cpp print_matrix.cpp) 19 | 20 | if(MPERF_ENABLE) 21 | target_link_libraries(${target_name} mperf) 22 | endif() 23 | endfunction() 24 | 25 | 26 | add_bin(MMult0.cpp) 27 | add_bin(MMult1.cpp) 28 | add_bin(MMult_4x4_8.cpp) 29 | add_bin(MMult_4x4_9.cpp) 30 | add_bin(MMult_4x4_10.cpp) 31 | add_bin(MMult_4x4_11.cpp) 32 | add_bin(MMult_4x4_12.cpp) 33 | add_bin(MMult_4x4_13.cpp) 34 | add_bin(MMult_4x4_14.cpp) 35 | add_bin(MMult_4x4_15.cpp) 36 | add_bin(MMult_4x4_16.cpp) 37 | add_bin(MMult_4x4_17.cpp) 38 | add_bin(MMult_4x4_18.cpp) 39 | add_bin(MMult_4x4_19.cpp) 40 | 41 | -------------------------------------------------------------------------------- /aarch64/MMult0.cpp: -------------------------------------------------------------------------------- 1 | /* Routine for computing C = A * B */ 2 | 3 | void MY_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb, 4 | float *c, int ldc) { 5 | #define A(i, j) a[(i) * k + (j)] 6 | #define B(i, j) b[(i) * n + (j)] 7 | #define C(i, j) c[(i) * n + (j)] 8 | 9 | int i, j, p; 10 | 11 | for (i = 0; i < m; i++) { /* Loop over the rows of C */ 12 | for (j = 0; j < n; j++) { /* Loop over the columns of C */ 13 | for (p = 0; p < k; p++) { /* Update C( i,j ) with the inner 14 | product of the ith row of A and 15 | the jth column of B */ 16 | C(i, j) = C(i, j) + A(i, p) * B(p, j); 17 | } 18 
| } 19 | } 20 | #undef A 21 | #undef B 22 | #undef C 23 | } 24 | -------------------------------------------------------------------------------- /aarch64/MMult1.cpp: -------------------------------------------------------------------------------- 1 | /* Routine for computing C = A * B */ 2 | 3 | void AddDot(int, float *, float *, int, float *); 4 | 5 | void MY_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb, 6 | float *c, int ldc) { 7 | int i, j; 8 | 9 | #define A(i, j) a[(i) * k + (j)] 10 | #define B(i, j) b[(i) * n + (j)] 11 | #define C(i, j) c[(i) * n + (j)] 12 | 13 | for (j = 0; j < n; ++j) { /* Loop over the columns of C */ 14 | for (i = 0; i < m; ++i) { /* Loop over the rows of C */ 15 | /* Update the C( i,j ) with the inner product of the ith row of A 16 | and the jth column of B */ 17 | 18 | AddDot(k, &A(i, 0), &B(0, j), ldb, &C(i, j)); 19 | } 20 | } 21 | #undef A 22 | #undef B 23 | #undef C 24 | } 25 | 26 | /* Create macro to let X( i ) equal the ith element of x */ 27 | void AddDot(int k, float *x, float *y, int ldb, float *gamma) { 28 | /* compute gamma := x' * y + gamma with vectors x and y of length n. 29 | 30 | Here x starts at location x with increment (stride) incx and y starts at 31 | location y and has (implicit) stride of 1. 
32 | */ 33 | for (int p = 0; p < k; p++) { 34 | *gamma += x[p] * y[p * ldb]; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /aarch64/REF_MMult.cpp: -------------------------------------------------------------------------------- 1 | /* Create macros so that the matrices are stored in row-major order */ 2 | 3 | #if 0 4 | #include 5 | /* Routine for computing C = A * B + C */ 6 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) { 7 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, k, 8 | b, n, 0.0f, c, n); 9 | } 10 | 11 | #else 12 | 13 | #define A(i, j) a[(i) * k + (j)] 14 | #define B(i, j) b[(i) * n + (j)] 15 | #define C(i, j) c[(i) * n + (j)] 16 | /* Routine for computing C = A * B + C */ 17 | 18 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) { 19 | int i, j, p; 20 | 21 | for (i = 0; i < m; i++) { 22 | for (j = 0; j < n; j++) { 23 | for (p = 0; p < k; p++) { 24 | C(i, j) += A(i, p) * B(p, j); 25 | } 26 | } 27 | } 28 | } 29 | 30 | #undef A 31 | #undef B 32 | #undef C 33 | #endif 34 | -------------------------------------------------------------------------------- /aarch64/compare_matrices.cpp: -------------------------------------------------------------------------------- 1 | #define abs(x) ((x) < 0.0 ? -(x) : (x)) 2 | 3 | #include 4 | 5 | float compare_matrices(int m, int n, float *a, float *b) { 6 | #define A(i, j) a[(i) * n + (j)] 7 | #define B(i, j) b[(i) * n + (j)] 8 | // printf("\n---result----\n"); 9 | // print_matrix(m, n, a, lda); 10 | // printf("\n-------\n"); 11 | // print_matrix(m, n, b, ldb); 12 | // printf("\n-------\n"); 13 | int i, j; 14 | float max_diff = 0.0, diff; 15 | int printed = 0; 16 | 17 | for (i = 0; i < m; i++) { 18 | for (j = 0; j < n; j++) { 19 | diff = abs(A(i, j) - B(i, j)); 20 | max_diff = (diff > max_diff ? 
diff : max_diff); 21 | if (0 == printed) 22 | if (max_diff > 0.5f || max_diff < -0.5f) { 23 | fprintf(stdout, "error: i %d j %d diff %f got %f expect %f \n", i, 24 | j, max_diff, A(i, j), B(i, j)); 25 | printed = 1; 26 | } 27 | } 28 | } 29 | 30 | return max_diff; 31 | #undef A 32 | #undef B 33 | } 34 | -------------------------------------------------------------------------------- /aarch64/copy_matrix.cpp: -------------------------------------------------------------------------------- 1 | void copy_matrix(int m, int n, float *a, float *b) { 2 | #define A(i, j) a[(i) * n + (j)] 3 | #define B(i, j) b[(i) * n + (j)] 4 | 5 | int i, j; 6 | 7 | for (j = 0; j < n; j++) { 8 | for (i = 0; i < m; i++) { 9 | B(i, j) = A(i, j); 10 | } 11 | } 12 | 13 | #undef A 14 | #undef B 15 | } 16 | -------------------------------------------------------------------------------- /aarch64/dclock.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | static double gtod_ref_time_sec = 0.0; 5 | 6 | /* Adapted from the bl2_clock() routine in the BLIS library */ 7 | 8 | double dclock() { 9 | double the_time, norm_sec; 10 | struct timeval tv; 11 | 12 | gettimeofday(&tv, NULL); 13 | 14 | if (gtod_ref_time_sec == 0.0) 15 | gtod_ref_time_sec = (double)tv.tv_sec; 16 | 17 | norm_sec = (double)tv.tv_sec - gtod_ref_time_sec; 18 | 19 | the_time = norm_sec + tv.tv_usec * 1.0e-6; 20 | 21 | return the_time; 22 | } 23 | -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult0_MMult0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult0_MMult0.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult0_MMult_4x4_8.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult0_MMult_4x4_8.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_10_MMult_4x4_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_10_MMult_4x4_11.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_11_MMult_4x4_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_11_MMult_4x4_12.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_12_MMult_4x4_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_13.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_12_MMult_4x4_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_14.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_12_MMult_4x4_17.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_17.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_13_MMult_4x4_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_13_MMult_4x4_14.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_14_MMult_4x4_15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_14_MMult_4x4_15.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_8_MMult_4x4_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_8_MMult_4x4_9.png -------------------------------------------------------------------------------- /aarch64/figures/compare_MMult_4x4_9_MMult_4x4_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_9_MMult_4x4_10.png -------------------------------------------------------------------------------- /aarch64/gflops_benchmark/clear.sh: -------------------------------------------------------------------------------- 1 | rm -rf main 2 | rm -rf *.o 3 | -------------------------------------------------------------------------------- 
/aarch64/gflops_benchmark/func1.S: -------------------------------------------------------------------------------- 1 | .text 2 | .align 5 3 | .global func1 4 | 5 | func1: 6 | .loop1: 7 | fmla v0.4s, v0.4s, v16.s[0] 8 | fmla v1.4s, v1.4s, v16.s[1] 9 | fmla v2.4s, v2.4s, v16.s[2] 10 | fmla v3.4s, v3.4s, v16.s[3] 11 | 12 | subs x0, x0, #1 13 | 14 | fmla v4.4s, v4.4s, v17.s[0] 15 | fmla v5.4s, v5.4s, v17.s[1] 16 | fmla v6.4s, v6.4s, v17.s[2] 17 | fmla v7.4s, v7.4s, v17.s[3] 18 | 19 | fmla v8.4s, v8.4s, v18.s[0] 20 | fmla v9.4s, v9.4s, v18.s[1] 21 | bne .loop1 22 | ret 23 | -------------------------------------------------------------------------------- /aarch64/gflops_benchmark/func2.S: -------------------------------------------------------------------------------- 1 | .text 2 | .align 5 3 | .global func2 4 | 5 | func2: 6 | .loop2: 7 | fmla v0.4s, v0.4s, v0.4s 8 | fmla v1.4s, v1.4s, v1.4s 9 | fmla v2.4s, v2.4s, v2.4s 10 | fmla v3.4s, v3.4s, v3.4s 11 | 12 | fmla v4.4s, v4.4s, v4.4s 13 | fmla v5.4s, v5.4s, v5.4s 14 | fmla v6.4s, v6.4s, v6.4s 15 | fmla v7.4s, v7.4s, v7.4s 16 | 17 | fmla v8.4s, v8.4s, v8.4s 18 | fmla v9.4s, v9.4s, v9.4s 19 | 20 | subs x0, x0, #1 21 | bne .loop2 22 | ret 23 | -------------------------------------------------------------------------------- /aarch64/gflops_benchmark/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define LOOP (1e9) 5 | #define OP_FLOATS (80) 6 | 7 | void func1(int); 8 | void func2(int); 9 | 10 | static double get_time(struct timespec *start, 11 | struct timespec *end) { 12 | return end->tv_sec - start->tv_sec + (end->tv_nsec - start->tv_nsec) * 1e-9; 13 | } 14 | 15 | int main() { 16 | struct timespec start, end; 17 | double time_used = 0.0; 18 | 19 | clock_gettime(CLOCK_MONOTONIC_RAW, &start); 20 | // func1(LOOP); 21 | func2(LOOP); 22 | clock_gettime(CLOCK_MONOTONIC_RAW, &end); 23 | 24 | time_used = get_time(&start, &end); 25 | printf("perf: %.6lf 
\r\n", LOOP * OP_FLOATS * 1.0 * 1e-9 / time_used); 26 | } 27 | -------------------------------------------------------------------------------- /aarch64/gflops_benchmark/make.sh: -------------------------------------------------------------------------------- 1 | as -o func1.o func1.S 2 | as -o func2.o func2.S 3 | gcc -c main.c 4 | gcc -o main main.o func2.o func1.o 5 | -------------------------------------------------------------------------------- /aarch64/makefile: -------------------------------------------------------------------------------- 1 | OLD := MMult_4x4_10 2 | NEW := MMult_4x4_21 3 | # ARCH := armv7-a 4 | # ARCH := aarch64 5 | ARCH := native 6 | 7 | # 8 | # sample makefile 9 | # 10 | 11 | CC := g++ 12 | LINKER := $(CC) 13 | #CFLAGS := -O0 -g -Wall 14 | CFLAGS := -std=c++17 -O2 -g -march=$(ARCH) -ftree-vectorize 15 | LDFLAGS := -lm 16 | 17 | UTIL := copy_matrix.o \ 18 | compare_matrices.o \ 19 | random_matrix.o \ 20 | dclock.o \ 21 | REF_MMult.o \ 22 | print_matrix.o 23 | 24 | TEST_OBJS := test_MMult.o $(NEW).o 25 | 26 | %.o: %.cpp 27 | $(CC) $(CFLAGS) -c $< -o $@ 28 | 29 | all: 30 | make clean; 31 | make test_MMult.x 32 | 33 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h 34 | $(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \ 35 | $(BLAS_LIB) -o $(TEST_BIN) $@ 36 | 37 | run: 38 | make all 39 | export OMP_NUM_THREADS=1 40 | export GOTO_NUM_THREADS=1 41 | echo "version = '$(NEW)';" > output_$(NEW).m 42 | ./test_MMult.x >> output_$(NEW).m 43 | cp output_$(OLD).m output_old.m 44 | cp output_$(NEW).m output_new.m 45 | 46 | clean: 47 | rm -f *.o *~ core *.x 48 | 49 | cleanall: 50 | rm -f *.o *~ core *.x output*.m *.eps *.png 51 | -------------------------------------------------------------------------------- /aarch64/output_MMult0.m: -------------------------------------------------------------------------------- 1 | version = 'MMult0'; 2 | MY_MMult = [ 3 | 40 1.542169e+00 0.000000e+00 4 | 80 1.404664e+00 0.000000e+00 5 | 120 1.360094e+00 0.000000e+00 6 | 160 
1.331816e+00 0.000000e+00 7 | 200 1.329787e+00 0.000000e+00 8 | 240 1.321227e+00 0.000000e+00 9 | 280 1.319191e+00 0.000000e+00 10 | 320 1.041146e+00 0.000000e+00 11 | 360 1.308063e+00 0.000000e+00 12 | 400 1.255653e+00 0.000000e+00 13 | 440 1.229162e+00 0.000000e+00 14 | 480 1.228200e+00 0.000000e+00 15 | 520 1.172813e+00 0.000000e+00 16 | 560 1.213585e+00 0.000000e+00 17 | 600 1.216463e+00 0.000000e+00 18 | 640 1.033379e+00 0.000000e+00 19 | 680 1.221786e+00 0.000000e+00 20 | 720 1.222401e+00 0.000000e+00 21 | 760 1.238574e+00 0.000000e+00 22 | 800 1.000343e+00 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult1.m: -------------------------------------------------------------------------------- 1 | version = 'MMult1'; 2 | MY_MMult = [ 3 | 40 1.560976e+00 0.000000e+00 4 | 80 1.424200e+00 0.000000e+00 5 | 120 1.364390e+00 0.000000e+00 6 | 160 1.331816e+00 0.000000e+00 7 | 200 1.327030e+00 0.000000e+00 8 | 240 1.321480e+00 0.000000e+00 9 | 280 1.290687e+00 0.000000e+00 10 | 320 1.028823e+00 0.000000e+00 11 | 360 1.243646e+00 0.000000e+00 12 | 400 1.285347e+00 0.000000e+00 13 | 440 1.267855e+00 0.000000e+00 14 | 480 1.267428e+00 0.000000e+00 15 | 520 1.224942e+00 0.000000e+00 16 | 560 1.285491e+00 0.000000e+00 17 | 600 1.269580e+00 0.000000e+00 18 | 640 9.652605e-01 0.000000e+00 19 | 680 1.270135e+00 0.000000e+00 20 | 720 1.270478e+00 0.000000e+00 21 | 760 1.250450e+00 0.000000e+00 22 | 800 9.419695e-01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_10.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_10'; 2 | MY_MMult = [ 3 | 40 1.628223e+01 0.000000e+00 4 | 80 1.628223e+01 0.000000e+00 5 | 120 1.626016e+01 0.000000e+00 6 | 160 1.628223e+01 0.000000e+00 7 | 200 1.610738e+01 0.000000e+00 8 | 240 1.621622e+01 0.000000e+00 9 | 280 1.628223e+01 
0.000000e+00 10 | 320 1.628223e+01 0.000000e+00 11 | 360 1.626016e+01 0.000000e+00 12 | 400 1.628223e+01 0.000000e+00 13 | 440 1.628223e+01 0.000000e+00 14 | 480 1.626016e+01 0.000000e+00 15 | 520 1.628223e+01 0.000000e+00 16 | 560 1.628223e+01 0.000000e+00 17 | 600 1.626016e+01 0.000000e+00 18 | 640 1.628223e+01 0.000000e+00 19 | 680 1.628223e+01 0.000000e+00 20 | 720 1.626016e+01 0.000000e+00 21 | 760 1.628223e+01 0.000000e+00 22 | 800 1.626016e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_11.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_11'; 2 | MY_MMult = [ 3 | 40 1.548387e+01 0.000000e+00 4 | 80 1.552393e+01 0.000000e+00 5 | 120 1.554404e+01 0.000000e+00 6 | 160 1.560468e+01 0.000000e+00 7 | 200 1.556420e+01 0.000000e+00 8 | 240 1.550388e+01 0.000000e+00 9 | 280 1.560468e+01 0.000000e+00 10 | 320 1.558442e+01 0.000000e+00 11 | 360 1.556420e+01 0.000000e+00 12 | 400 1.556420e+01 0.000000e+00 13 | 440 1.558442e+01 0.000000e+00 14 | 480 1.558442e+01 0.000000e+00 15 | 520 1.556420e+01 0.000000e+00 16 | 560 1.558442e+01 0.000000e+00 17 | 600 1.558442e+01 0.000000e+00 18 | 640 1.558442e+01 0.000000e+00 19 | 680 1.558442e+01 0.000000e+00 20 | 720 1.558442e+01 0.000000e+00 21 | 760 1.558442e+01 0.000000e+00 22 | 800 1.558442e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_12.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_12'; 2 | MY_MMult = [ 3 | 40 1.589404e+01 0.000000e+00 4 | 80 1.593625e+01 0.000000e+00 5 | 120 1.593625e+01 0.000000e+00 6 | 160 1.591512e+01 0.000000e+00 7 | 200 1.591512e+01 0.000000e+00 8 | 240 1.587302e+01 0.000000e+00 9 | 280 1.591512e+01 0.000000e+00 10 | 320 1.595745e+01 0.000000e+00 11 | 360 1.593625e+01 0.000000e+00 12 | 400 1.593625e+01 
0.000000e+00 13 | 440 1.593625e+01 0.000000e+00 14 | 480 1.595745e+01 0.000000e+00 15 | 520 1.593625e+01 0.000000e+00 16 | 560 1.595745e+01 0.000000e+00 17 | 600 1.593625e+01 0.000000e+00 18 | 640 1.593625e+01 0.000000e+00 19 | 680 1.595745e+01 0.000000e+00 20 | 720 1.595745e+01 0.000000e+00 21 | 760 1.593625e+01 0.000000e+00 22 | 800 1.593625e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_13.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_13'; 2 | MY_MMult = [ 3 | 40 1.280000e+01 0.000000e+00 4 | 80 1.651613e+01 0.000000e+00 5 | 120 1.868108e+01 0.000000e+00 6 | 160 1.820444e+01 0.000000e+00 7 | 200 1.884570e+01 0.000000e+00 8 | 240 1.901513e+01 0.000000e+00 9 | 280 1.875438e+01 0.000000e+00 10 | 320 1.888646e+01 0.000000e+00 11 | 360 1.906661e+01 0.000000e+00 12 | 400 1.919328e+01 0.000000e+00 13 | 440 1.932925e+01 0.000000e+00 14 | 480 1.943450e+01 0.000000e+00 15 | 520 1.929970e+01 0.000000e+00 16 | 560 1.935163e+01 0.000000e+00 17 | 600 1.935571e+01 0.000000e+00 18 | 640 1.859837e+01 0.000000e+00 19 | 680 1.943818e+01 0.000000e+00 20 | 720 1.919161e+01 0.000000e+00 21 | 760 1.924236e+01 0.000000e+00 22 | 800 1.942890e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_14.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_14'; 2 | MY_MMult = [ 3 | 40 1.163636e+01 0.000000e+00 4 | 80 1.575385e+01 0.000000e+00 5 | 120 1.868108e+01 0.000000e+00 6 | 160 1.728270e+01 0.000000e+00 7 | 200 1.893491e+01 0.000000e+00 8 | 240 1.908075e+01 0.000000e+00 9 | 280 1.913862e+01 0.000000e+00 10 | 320 1.919063e+01 0.000000e+00 11 | 360 1.961985e+01 0.000000e+00 12 | 400 1.943812e+01 0.000000e+00 13 | 440 1.947063e+01 0.000000e+00 14 | 480 1.971864e+01 0.000000e+00 15 | 520 1.968748e+01 
0.000000e+00 16 | 560 1.950639e+01 0.000000e+00 17 | 600 1.970084e+01 0.000000e+00 18 | 640 1.974868e+01 0.000000e+00 19 | 680 1.996204e+01 0.000000e+00 20 | 720 1.966948e+01 0.000000e+00 21 | 760 1.978483e+01 0.000000e+00 22 | 800 1.967301e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_15.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_15'; 2 | MY_MMult = [ 3 | 40 9.142857e+00 0.000000e+00 4 | 80 1.204706e+01 0.000000e+00 5 | 120 1.355294e+01 0.000000e+00 6 | 160 1.317042e+01 0.000000e+00 7 | 200 1.360544e+01 0.000000e+00 8 | 240 1.366683e+01 0.000000e+00 9 | 280 1.370715e+01 0.000000e+00 10 | 320 1.371333e+01 0.000000e+00 11 | 360 1.369215e+01 0.000000e+00 12 | 400 1.387083e+01 0.000000e+00 13 | 440 1.384093e+01 0.000000e+00 14 | 480 1.383957e+01 0.000000e+00 15 | 520 1.379727e+01 0.000000e+00 16 | 560 1.397438e+01 0.000000e+00 17 | 600 1.392336e+01 0.000000e+00 18 | 640 1.390574e+01 0.000000e+00 19 | 680 1.403088e+01 0.000000e+00 20 | 720 1.403768e+01 0.000000e+00 21 | 760 1.398525e+01 0.000000e+00 22 | 800 1.396560e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_16.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_16'; 2 | MY_MMult = [ 3 | 40 1.163636e+01 0.000000e+00 4 | 80 1.651613e+01 0.000000e+00 5 | 120 1.898901e+01 0.000000e+00 6 | 160 1.804405e+01 0.000000e+00 7 | 200 1.871345e+01 0.000000e+00 8 | 240 1.897598e+01 0.000000e+00 9 | 280 1.902253e+01 0.000000e+00 10 | 320 1.899594e+01 0.000000e+00 11 | 360 1.922373e+01 0.000000e+00 12 | 400 1.929164e+01 0.000000e+00 13 | 440 1.895927e+01 0.000000e+00 14 | 480 1.928202e+01 0.000000e+00 15 | 520 1.924292e+01 0.000000e+00 16 | 560 1.924453e+01 0.000000e+00 17 | 600 1.934011e+01 0.000000e+00 18 | 640 1.938935e+01 
0.000000e+00 19 | 680 1.943878e+01 0.000000e+00 20 | 720 1.954690e+01 0.000000e+00 21 | 760 1.941985e+01 0.000000e+00 22 | 800 1.953006e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_17.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_17'; 2 | MY_MMult = [ 3 | 40 1.163636e+01 0.000000e+00 4 | 80 1.600000e+01 0.000000e+00 5 | 120 1.888525e+01 0.000000e+00 6 | 160 1.840899e+01 0.000000e+00 7 | 200 1.916168e+01 0.000000e+00 8 | 240 1.901513e+01 0.000000e+00 9 | 280 1.923084e+01 0.000000e+00 10 | 320 1.911228e+01 0.000000e+00 11 | 360 1.939555e+01 0.000000e+00 12 | 400 1.943812e+01 0.000000e+00 13 | 440 1.949960e+01 0.000000e+00 14 | 480 1.958594e+01 0.000000e+00 15 | 520 1.944651e+01 0.000000e+00 16 | 560 1.950747e+01 0.000000e+00 17 | 600 1.936438e+01 0.000000e+00 18 | 640 1.946060e+01 0.000000e+00 19 | 680 1.958041e+01 0.000000e+00 20 | 720 1.955765e+01 0.000000e+00 21 | 760 1.941942e+01 0.000000e+00 22 | 800 1.963002e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_18.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_18'; 2 | MY_MMult = [ 3 | 40 3.069054e+01 0.000000e+00 4 | 80 3.061224e+01 0.000000e+00 5 | 120 3.084833e+01 0.000000e+00 6 | 160 3.076923e+01 0.000000e+00 7 | 200 3.084833e+01 0.000000e+00 8 | 240 3.076923e+01 0.000000e+00 9 | 280 3.076923e+01 0.000000e+00 10 | 320 3.084833e+01 0.000000e+00 11 | 360 3.084833e+01 0.000000e+00 12 | 400 3.076923e+01 0.000000e+00 13 | 440 3.076923e+01 0.000000e+00 14 | 480 3.084833e+01 0.000000e+00 15 | 520 3.084833e+01 0.000000e+00 16 | 560 3.084833e+01 0.000000e+00 17 | 600 3.076923e+01 0.000000e+00 18 | 640 3.076923e+01 0.000000e+00 19 | 680 3.084833e+01 0.000000e+00 20 | 720 3.084833e+01 0.000000e+00 21 | 760 3.076923e+01 
0.000000e+00 22 | 800 3.076923e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_8.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_8'; 2 | MY_MMult = [ 3 | error: i 0 j 0 diff 600.000000 got 800.000000 expect 200.000000 4 | 40 diff too big: 6.000000e+02 5 | -------------------------------------------------------------------------------- /aarch64/output_MMult_4x4_9.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_9'; 2 | MY_MMult = [ 3 | error: i 0 j 0 diff 600.000000 got 800.000000 expect 200.000000 4 | 40 diff too big: 6.000000e+02 5 | -------------------------------------------------------------------------------- /aarch64/output_new.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_18'; 2 | MY_MMult = [ 3 | 40 3.069054e+01 0.000000e+00 4 | 80 3.061224e+01 0.000000e+00 5 | 120 3.084833e+01 0.000000e+00 6 | 160 3.076923e+01 0.000000e+00 7 | 200 3.084833e+01 0.000000e+00 8 | 240 3.076923e+01 0.000000e+00 9 | 280 3.076923e+01 0.000000e+00 10 | 320 3.084833e+01 0.000000e+00 11 | 360 3.084833e+01 0.000000e+00 12 | 400 3.076923e+01 0.000000e+00 13 | 440 3.076923e+01 0.000000e+00 14 | 480 3.084833e+01 0.000000e+00 15 | 520 3.084833e+01 0.000000e+00 16 | 560 3.084833e+01 0.000000e+00 17 | 600 3.076923e+01 0.000000e+00 18 | 640 3.076923e+01 0.000000e+00 19 | 680 3.084833e+01 0.000000e+00 20 | 720 3.084833e+01 0.000000e+00 21 | 760 3.076923e+01 0.000000e+00 22 | 800 3.076923e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/output_old.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_10'; 2 | MY_MMult = [ 3 | 40 1.628223e+01 0.000000e+00 4 | 80 1.628223e+01 0.000000e+00 5 | 120 
1.626016e+01 0.000000e+00 6 | 160 1.628223e+01 0.000000e+00 7 | 200 1.610738e+01 0.000000e+00 8 | 240 1.621622e+01 0.000000e+00 9 | 280 1.628223e+01 0.000000e+00 10 | 320 1.628223e+01 0.000000e+00 11 | 360 1.626016e+01 0.000000e+00 12 | 400 1.628223e+01 0.000000e+00 13 | 440 1.628223e+01 0.000000e+00 14 | 480 1.626016e+01 0.000000e+00 15 | 520 1.628223e+01 0.000000e+00 16 | 560 1.628223e+01 0.000000e+00 17 | 600 1.626016e+01 0.000000e+00 18 | 640 1.628223e+01 0.000000e+00 19 | 680 1.628223e+01 0.000000e+00 20 | 720 1.626016e+01 0.000000e+00 21 | 760 1.628223e+01 0.000000e+00 22 | 800 1.626016e+01 0.000000e+00 23 | ]; 24 | -------------------------------------------------------------------------------- /aarch64/parameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )" 3 | The below parameters set this range of values that p takes on 4 | */ 5 | #define PFIRST 48 6 | #define PLAST 960 7 | #define PINC 48 8 | 9 | /* 10 | In the test driver, the m, n, and k dimensions are set to the below 11 | values. If the value equals "-1" then that dimension is bound to the 12 | index p, given above. 13 | */ 14 | 15 | #define M -1 16 | #define N -1 17 | #define K -1 18 | 19 | /* 20 | In the test driver, each experiment is repeated NREPEATS times and 21 | the best time from these repeats is used to compute the performance 22 | */ 23 | 24 | #define NREPEATS 10 25 | 26 | /* 27 | Matrices A, B, and C are stored in two dimensional arrays with 28 | row dimensions that are greater than or equal to the row dimension 29 | of the matrix. This row dimension of the array is known as the 30 | "leading dimension" and determines the stride (the number of 31 | double precision numbers) when one goes from one element in a row 32 | to the next. Having this number larger than the row dimension of 33 | the matrix tends to adversely affect performance. 
LDX equals the 34 | leading dimension of the array that stores matrix X. If LDX=-1 35 | then the leading dimension is set to the row dimension of matrix X. 36 | */ 37 | 38 | #if 0 39 | #define LDA 1000 40 | #define LDB 1000 41 | #define LDC 1000 42 | #else 43 | #define LDA -1 44 | #define LDB -1 45 | #define LDC -1 46 | #endif 47 | -------------------------------------------------------------------------------- /aarch64/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def readFile(filename): 5 | f = open(filename) 6 | sizes = [40] 7 | times = [0.0] 8 | title = '' 9 | try: 10 | title = f.readline() 11 | # skip 1 line 12 | f.readline() 13 | while True: 14 | line = f.readline() 15 | if line: 16 | slices = line.split(" ") 17 | if len(slices) <= 2: 18 | break; 19 | size = int(slices[0]) 20 | time = float(slices[1]) 21 | sizes.append(size) 22 | times.append(time) 23 | finally: 24 | f.close() 25 | return title, sizes, times 26 | 27 | if __name__ == '__main__': 28 | plt.xlabel('size') 29 | plt.ylabel('gflops') 30 | t1, x1, y1 = readFile('output_old.m') 31 | plt.plot(x1, y1, label=t1) 32 | t2, x2, y2 = readFile('output_new.m') 33 | plt.plot(x2, y2, label=t2) 34 | plt.legend() 35 | plt.show() 36 | 37 | -------------------------------------------------------------------------------- /aarch64/print_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define A(i, j) a[(i) * lda + (j)] 4 | 5 | void print_matrix(int m, int n, float *a, int lda) { 6 | int i, j; 7 | 8 | for (i = 0; i < m; i++) { 9 | for (j = 0; j < n; j++) { 10 | printf("%.1f\t", A(i, j)); 11 | } 12 | printf("\n"); 13 | } 14 | printf("\n"); 15 | } 16 | -------------------------------------------------------------------------------- /aarch64/random_matrix.cpp: -------------------------------------------------------------------------------- 1 | 
#include 2 | 3 | void random_matrix(int m, int n, float *a) { 4 | #define A(i, j) a[(i) * n + (j)] 5 | 6 | double drand48() __THROW; 7 | int i, j; 8 | 9 | for (i = 0; i < m; i++) { 10 | for (j = 0; j < n; j++) { 11 | // #if 0 12 | // A(i, j) = 2.0 * (float)drand48() - 1.0; 13 | // #else 14 | // A(i, j) = (j - i) % 3; 15 | // #endif 16 | A(i, j) = 1.0f; 17 | } 18 | } 19 | #undef A 20 | } 21 | -------------------------------------------------------------------------------- /armv7/MMult0.c: -------------------------------------------------------------------------------- 1 | /* Create macros so that the matrices are stored in row-major order */ 2 | 3 | #define A(i,j) a[ (i)*lda + (j) ] 4 | #define B(i,j) b[ (i)*ldb + (j) ] 5 | #define C(i,j) c[ (i)*ldc + (j) ] 6 | 7 | /* Routine for computing C = A * B + C */ 8 | 9 | void MY_MMult( int m, int n, int k, float *a, int lda, 10 | float *b, int ldb, 11 | float *c, int ldc ) 12 | { 13 | int i, j, p; 14 | 15 | for ( i=0; i 6 | 7 | float compare_matrices( int m, int n, float *a, int lda, float *b, int ldb ) 8 | { 9 | // printf("\n---result----\n"); 10 | // print_matrix(m, n, a, lda); 11 | // printf("\n-------\n"); 12 | // print_matrix(m, n, b, ldb); 13 | // printf("\n-------\n"); 14 | int i, j; 15 | float max_diff = 0.0, diff; 16 | int printed = 0; 17 | 18 | for ( i=0; i max_diff ? 
diff : max_diff ); 22 | if(0 == printed) 23 | if(max_diff > 0.5f || max_diff < -0.5f) { 24 | printf("\n error: i %d j %d diff %f", i, j, max_diff); 25 | printed = 1; 26 | } 27 | } 28 | } 29 | 30 | return max_diff; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /armv7/copy_matrix.c: -------------------------------------------------------------------------------- 1 | #define A( i, j ) a[ (i)*lda + (j) ] 2 | #define B( i, j ) b[ (i)*ldb + (j) ] 3 | 4 | void copy_matrix( int m, int n, float *a, int lda, float *b, int ldb ) 5 | { 6 | int i, j; 7 | 8 | for ( j=0; j 2 | #include 3 | 4 | static double gtod_ref_time_sec = 0.0; 5 | 6 | /* Adapted from the bl2_clock() routine in the BLIS library */ 7 | 8 | double dclock() 9 | { 10 | double the_time, norm_sec; 11 | struct timeval tv; 12 | 13 | gettimeofday( &tv, NULL ); 14 | 15 | if ( gtod_ref_time_sec == 0.0 ) 16 | gtod_ref_time_sec = ( double ) tv.tv_sec; 17 | 18 | norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec; 19 | 20 | the_time = norm_sec + tv.tv_usec * 1.0e-6; 21 | 22 | return the_time; 23 | } 24 | 25 | -------------------------------------------------------------------------------- /armv7/makefile: -------------------------------------------------------------------------------- 1 | OLD := MMult_4x4_19 2 | NEW := MMult_4x4_19 3 | 4 | # 5 | # sample makefile 6 | # 7 | 8 | CC := g++ 9 | LINKER := $(CC) 10 | #CFLAGS := -O0 -g -Wall 11 | CFLAGS := -std=c++11 -O3 -march=armv7-a -mfpu=neon -ftree-vectorize 12 | LDFLAGS := -lm 13 | 14 | UTIL := copy_matrix.o \ 15 | compare_matrices.o \ 16 | random_matrix.o \ 17 | dclock.o \ 18 | REF_MMult.o \ 19 | print_matrix.o 20 | 21 | TEST_OBJS := test_MMult.o $(NEW).o 22 | 23 | %.o: %.c 24 | $(CC) $(CFLAGS) -c $< -o $@ 25 | %.o: %.c 26 | $(CC) $(CFLAGS) -c $< -o $@ 27 | 28 | all: 29 | make clean; 30 | make test_MMult.x 31 | 32 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h 33 | $(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \ 34 | 
$(BLAS_LIB) -o $(TEST_BIN) $@ 35 | 36 | run: 37 | make all 38 | export OMP_NUM_THREADS=1 39 | export GOTO_NUM_THREADS=1 40 | echo "version = '$(NEW)';" > output_$(NEW).m 41 | ./test_MMult.x >> output_$(NEW).m 42 | cp output_$(OLD).m output_old.m 43 | cp output_$(NEW).m output_new.m 44 | 45 | clean: 46 | rm -f *.o *~ core *.x 47 | 48 | cleanall: 49 | rm -f *.o *~ core *.x output*.m *.eps *.png 50 | -------------------------------------------------------------------------------- /armv7/output_MMult_4x4_12.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_12'; 2 | MY_MMult = [ 3 | 40 1.910448e+00 0.000000e+00 4 | 80 1.939394e+00 0.000000e+00 5 | 120 2.016336e+00 0.000000e+00 6 | 160 1.960278e+00 0.000000e+00 7 | 200 1.881689e+00 0.000000e+00 8 | 240 1.879666e+00 0.000000e+00 9 | 280 1.748258e+00 7.629395e-06 10 | 320 1.787037e+00 1.144409e-05 11 | 360 1.817282e+00 1.525879e-05 12 | 400 1.827318e+00 2.288818e-05 13 | 440 1.820832e+00 2.670288e-05 14 | 480 1.854326e+00 2.861023e-05 15 | 520 1.784230e+00 3.242493e-05 16 | 560 1.830802e+00 3.814697e-05 17 | 600 1.822593e+00 4.005432e-05 18 | 640 1.807011e+00 4.005432e-05 19 | 680 1.829843e+00 4.386902e-05 20 | 720 1.864469e+00 5.340576e-05 21 | 760 1.862807e+00 4.959106e-05 22 | 800 1.830948e+00 5.912781e-05 23 | ]; 24 | -------------------------------------------------------------------------------- /armv7/output_MMult_4x4_18.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_18'; 2 | MY_MMult = [ 3 | 40 1.828571e+00 0.000000e+00 4 | 80 2.027723e+00 0.000000e+00 5 | 120 2.084439e+00 0.000000e+00 6 | 160 1.971600e+00 0.000000e+00 7 | 200 1.950268e+00 0.000000e+00 8 | 240 1.970915e+00 0.000000e+00 9 | 280 1.920896e+00 1.716614e-05 10 | 320 1.963273e+00 2.098083e-05 11 | 360 1.976154e+00 2.098083e-05 12 | 400 1.996444e+00 2.670288e-05 13 | 440 2.008725e+00 2.670288e-05 14 | 480 2.038543e+00 2.861023e-05 15 | 
520 1.977067e+00 3.242493e-05 16 | 560 2.009877e+00 3.433228e-05 17 | 600 2.009910e+00 3.623962e-05 18 | 640 2.032573e+00 4.005432e-05 19 | 680 1.975727e+00 4.386902e-05 20 | ]; 21 | -------------------------------------------------------------------------------- /armv7/output_MMult_4x4_19.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_19'; 2 | MY_MMult = [ 3 | 40 2.461538e+00 0.000000e+00 4 | 80 2.805479e+00 0.000000e+00 5 | 120 2.870432e+00 0.000000e+00 6 | 160 2.747150e+00 0.000000e+00 7 | 200 2.675585e+00 0.000000e+00 8 | 240 2.720457e+00 0.000000e+00 9 | 280 2.639889e+00 0.000000e+00 10 | 320 2.748186e+00 0.000000e+00 11 | 360 2.713741e+00 0.000000e+00 12 | 400 2.780010e+00 0.000000e+00 13 | 440 2.742077e+00 0.000000e+00 14 | 480 2.835692e+00 0.000000e+00 15 | 520 2.719270e+00 0.000000e+00 16 | 560 2.828980e+00 0.000000e+00 17 | 600 2.768823e+00 0.000000e+00 18 | 640 2.866309e+00 0.000000e+00 19 | 680 2.808218e+00 0.000000e+00 20 | ]; 21 | -------------------------------------------------------------------------------- /armv7/output_MMult_4x4_20.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_20'; 2 | MY_MMult = [ 3 | 40 3.121951e+00 0.000000e+00 4 | 80 3.555556e+00 0.000000e+00 5 | 120 3.700214e+00 0.000000e+00 6 | 160 3.499359e+00 0.000000e+00 7 | 200 3.285421e+00 0.000000e+00 8 | 240 3.454704e+00 0.000000e+00 9 | 280 3.278620e+00 0.000000e+00 10 | 320 3.498425e+00 0.000000e+00 11 | 360 3.407164e+00 0.000000e+00 12 | 400 3.527337e+00 0.000000e+00 13 | 440 3.441569e+00 0.000000e+00 14 | 480 3.619914e+00 0.000000e+00 15 | 520 3.421952e+00 0.000000e+00 16 | 560 3.617666e+00 0.000000e+00 17 | -------------------------------------------------------------------------------- /armv7/output_MMult_4x4_21.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_21'; 2 | MY_MMult = [ 3 | 40 
3.200000e+00 0.000000e+00 4 | 80 3.778598e+00 0.000000e+00 5 | 120 3.967853e+00 0.000000e+00 6 | 160 3.756075e+00 0.000000e+00 7 | 200 3.474484e+00 0.000000e+00 8 | 240 3.681491e+00 0.000000e+00 9 | 280 3.468204e+00 0.000000e+00 10 | 320 3.714561e+00 0.000000e+00 11 | 360 3.615343e+00 0.000000e+00 12 | 400 3.750256e+00 0.000000e+00 13 | 440 3.659264e+00 0.000000e+00 14 | 480 3.860374e+00 0.000000e+00 15 | 520 3.633610e+00 0.000000e+00 16 | 560 3.852707e+00 0.000000e+00 17 | 600 3.730312e+00 0.000000e+00 18 | 640 3.926516e+00 0.000000e+00 19 | 680 3.803391e+00 0.000000e+00 20 | ]; 21 | -------------------------------------------------------------------------------- /armv7/output_new.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_19'; 2 | MY_MMult = [ 3 | 40 2.461538e+00 0.000000e+00 4 | 80 2.805479e+00 0.000000e+00 5 | 120 2.870432e+00 0.000000e+00 6 | 160 2.747150e+00 0.000000e+00 7 | 200 2.675585e+00 0.000000e+00 8 | 240 2.720457e+00 0.000000e+00 9 | 280 2.639889e+00 0.000000e+00 10 | 320 2.748186e+00 0.000000e+00 11 | 360 2.713741e+00 0.000000e+00 12 | 400 2.780010e+00 0.000000e+00 13 | 440 2.742077e+00 0.000000e+00 14 | 480 2.835692e+00 0.000000e+00 15 | 520 2.719270e+00 0.000000e+00 16 | 560 2.828980e+00 0.000000e+00 17 | 600 2.768823e+00 0.000000e+00 18 | 640 2.866309e+00 0.000000e+00 19 | 680 2.808218e+00 0.000000e+00 20 | ]; 21 | -------------------------------------------------------------------------------- /armv7/output_old.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_4x4_19'; 2 | MY_MMult = [ 3 | 40 2.461538e+00 0.000000e+00 4 | 80 2.805479e+00 0.000000e+00 5 | 120 2.870432e+00 0.000000e+00 6 | 160 2.747150e+00 0.000000e+00 7 | 200 2.675585e+00 0.000000e+00 8 | 240 2.720457e+00 0.000000e+00 9 | 280 2.639889e+00 0.000000e+00 10 | 320 2.748186e+00 0.000000e+00 11 | 360 2.713741e+00 0.000000e+00 12 | 400 2.780010e+00 0.000000e+00 13 | 
440 2.742077e+00 0.000000e+00 14 | 480 2.835692e+00 0.000000e+00 15 | 520 2.719270e+00 0.000000e+00 16 | 560 2.828980e+00 0.000000e+00 17 | 600 2.768823e+00 0.000000e+00 18 | 640 2.866309e+00 0.000000e+00 19 | 680 2.808218e+00 0.000000e+00 20 | ]; 21 | -------------------------------------------------------------------------------- /armv7/parameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )" 3 | The below parameters set this range of values that p takes on 4 | */ 5 | #define PFIRST 40 6 | #define PLAST 700 7 | #define PINC 40 8 | 9 | /* 10 | In the test driver, the m, n, and k dimensions are set to the below 11 | values. If the value equals "-1" then that dimension is bound to the 12 | index p, given above. 13 | */ 14 | 15 | #define M -1 16 | #define N -1 17 | #define K -1 18 | 19 | /* 20 | In the test driver, each experiment is repeated NREPEATS times and 21 | the best time from these repeats is used to compute the performance 22 | */ 23 | 24 | #define NREPEATS 20 25 | 26 | /* 27 | Matrices A, B, and C are stored in two dimensional arrays with 28 | row dimensions that are greater than or equal to the row dimension 29 | of the matrix. This row dimension of the array is known as the 30 | "leading dimension" and determines the stride (the number of 31 | double precision numbers) when one goes from one element in a row 32 | to the next. Having this number larger than the row dimension of 33 | the matrix tends to adversely affect performance. LDX equals the 34 | leading dimension of the array that stores matrix X. If LDX=-1 35 | then the leading dimension is set to the row dimension of matrix X. 
36 | */ 37 | 38 | #if 0 39 | #define LDA 1000 40 | #define LDB 1000 41 | #define LDC 1000 42 | #else 43 | #define LDA -1 44 | #define LDB -1 45 | #define LDC -1 46 | #endif 47 | -------------------------------------------------------------------------------- /armv7/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def readFile(filename): 5 | f = open(filename) 6 | sizes = [40] 7 | times = [0.0] 8 | title = '' 9 | try: 10 | title = f.readline() 11 | # skip 1 line 12 | f.readline() 13 | while True: 14 | line = f.readline() 15 | if line: 16 | slices = line.split(" ") 17 | if len(slices) <= 2: 18 | break; 19 | size = int(slices[0]) 20 | time = float(slices[1]) 21 | sizes.append(size) 22 | times.append(time) 23 | finally: 24 | f.close() 25 | return title, sizes, times 26 | 27 | if __name__ == '__main__': 28 | plt.xlabel('size') 29 | plt.ylabel('gflops') 30 | t1, x1, y1 = readFile('output_old.m') 31 | plt.plot(x1, y1, label=t1) 32 | t2, x2, y2 = readFile('output_new.m') 33 | plt.plot(x2, y2, label=t2) 34 | plt.legend() 35 | plt.show() 36 | 37 | -------------------------------------------------------------------------------- /armv7/print_matrix.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define A( i, j ) a[ (i)*lda + (j) ] 4 | 5 | void print_matrix( int m, int n, float *a, int lda ) 6 | { 7 | int i, j; 8 | 9 | for ( i=0; i 2 | 3 | #define A( i,j ) a[ (j)*lda + (i) ] 4 | 5 | void random_matrix( int m, int n, float *a, int lda ) 6 | { 7 | double drand48(); 8 | int i,j; 9 | 10 | for ( i=0; i 2 | // #include 3 | #include 4 | #include 5 | 6 | #include "parameters.h" 7 | 8 | void REF_MMult(int, int, int, float *, int, float *, int, float *, int ); 9 | void MY_MMult(int, int, int, float *, int, float *, int, float *, int ); 10 | void copy_matrix(int, int, float *, int, float *, int ); 11 | void random_matrix(int, 
int, float *, int); 12 | float compare_matrices( int, int, float *, int, float *, int ); 13 | 14 | double dclock(); 15 | 16 | int main() 17 | { 18 | int 19 | p, 20 | m, n, k, 21 | lda, ldb, ldc, 22 | rep; 23 | 24 | double 25 | dtime, dtime_best, 26 | gflops, 27 | diff; 28 | 29 | float 30 | *a, *b, *c, *cref, *cold; 31 | 32 | printf( "MY_MMult = [\n" ); 33 | 34 | for ( p=PFIRST; p<=PLAST; p+=PINC ){ 35 | m = ( M == -1 ? p : M ); 36 | n = ( N == -1 ? p : N ); 37 | k = ( K == -1 ? p : K ); 38 | 39 | gflops = 2.0 * m * n * k * 1.0e-09; 40 | 41 | lda = ( LDA == -1 ? m : LDA ); 42 | ldb = ( LDB == -1 ? k : LDB ); 43 | ldc = ( LDC == -1 ? m : LDC ); 44 | 45 | /* Allocate space for the matrices */ 46 | /* Note: I create an extra column in A to make sure that 47 | prefetching beyond the matrix does not cause a segfault */ 48 | a = ( float * ) malloc( lda * (k+1) * sizeof( float ) ); 49 | b = ( float * ) malloc( ldb * n * sizeof( float ) ); 50 | c = ( float * ) malloc( ldc * n * sizeof( float ) ); 51 | cold = ( float * ) malloc( ldc * n * sizeof( float ) ); 52 | cref = ( float * ) malloc( ldc * n * sizeof( float ) ); 53 | 54 | /* Generate random matrices A, B, Cold */ 55 | random_matrix( m, k, a, lda ); 56 | random_matrix( k, n, b, ldb ); 57 | random_matrix( m, n, cold, ldc ); 58 | #if 1 59 | memset(cold, 0, ldc * n * sizeof(float)); 60 | #endif 61 | 62 | copy_matrix( m, n, cold, ldc, cref, ldc ); 63 | 64 | /* Run the reference implementation so the answers can be compared */ 65 | 66 | REF_MMult( m, n, k, a, lda, b, ldb, cref, ldc ); 67 | 68 | /* Time the "optimized" implementation */ 69 | for ( rep=0; rep 0.5f || diff < -0.5f){ 87 | exit(0); 88 | } 89 | 90 | printf( "%d %le %le \n", p, gflops / dtime_best, diff ); 91 | fflush( stdout ); 92 | 93 | free( a ); 94 | free( b ); 95 | free( c ); 96 | free( cold ); 97 | free( cref ); 98 | } 99 | 100 | printf( "];\n" ); 101 | 102 | exit( 0 ); 103 | } 104 | 105 | 
-------------------------------------------------------------------------------- /cuda-int4/README.md: -------------------------------------------------------------------------------- 1 | WIP 2 | -------------------------------------------------------------------------------- /cuda/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | run.sh 3 | -------------------------------------------------------------------------------- /cuda/MMult_cuBLAS_1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // CUDA and CUBLAS functions 10 | 11 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 12 | float *d_B, int ldb, float *d_C, int ldc) { 13 | 14 | const float alpha = 1.0f; 15 | const float beta = 0.0f; 16 | 17 | checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, 18 | d_B, n, d_A, k, &beta, d_C, n)); 19 | } 20 | -------------------------------------------------------------------------------- /cuda/MMult_cuBLAS_2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // CUDA and CUBLAS functions 10 | 11 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 12 | float *d_B, int ldb, float *d_C, int ldc) { 13 | 14 | const float alpha = 1.0f; 15 | const float beta = 0.0f; 16 | #if __CUDACC_VER_MAJOR__ >= 11 17 | cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; 18 | #else 19 | cudaDataType_t compute_type = CUDA_R_32F; 20 | #endif 21 | 22 | checkCudaErrors(cublasGemmEx( 23 | handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, 24 | (void*)(&alpha), d_B, CUDA_R_32F, n, d_A, CUDA_R_32F, k, 25 | (void*)(&beta), d_C, CUDA_R_32F, n, compute_type, CUBLAS_GEMM_DEFAULT)); 26 | 
} 27 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | /** 10 | * naive 实现 11 | */ 12 | template 13 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb, 14 | float *c, int ldc) { 15 | int _m = blockIdx.x * BLOCK + threadIdx.x; 16 | int _n = blockIdx.y * BLOCK + threadIdx.y; 17 | if (_m < m and _n < n) { 18 | float sum = 0.f; 19 | for (int i = 0; i < k; ++i) { 20 | sum += a[_m * k + i] * b[i * n + _n]; 21 | } 22 | c[_m * n + _n] = sum; 23 | } 24 | } 25 | 26 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 27 | float *d_B, int ldb, float *d_C, int ldc) { 28 | 29 | constexpr int BLOCK = 16; 30 | // subm, subn, subk 31 | dim3 block(BLOCK, BLOCK); 32 | dim3 grid((m + BLOCK - 1) / BLOCK, (n + BLOCK - 1) / BLOCK); 33 | 34 | sgemm<<>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc); 35 | } 36 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // a = mxk, b = kxn 10 | template 11 | __global__ void sgemm(int m, int n, int k, float *a, float *b, float *c) { 12 | // blockIdx control subpanel matrix 13 | 14 | const int tx = threadIdx.x; 15 | const int ty = threadIdx.y; 16 | const int bx = blockIdx.x; 17 | const int by = blockIdx.y; 18 | 19 | float *begin_a = a + bx * BLOCK * k; 20 | float *begin_b = b + by * BLOCK; 21 | float *end_a = begin_a + k; 22 | 23 | float sum = 0.f; 24 | 25 | for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a; 26 | a_ptr += BLOCK, b_ptr += BLOCK * n) { 27 | 28 | __shared__ float ashare[BLOCK][BLOCK]; 29 | 
__shared__ float bshare[BLOCK][BLOCK]; 30 | 31 | ashare[ty][tx] = a_ptr[ty * k + tx]; 32 | bshare[ty][tx] = b_ptr[ty * n + tx]; 33 | __syncthreads(); 34 | 35 | #pragma unroll 36 | for (int kk = 0; kk < BLOCK; ++kk) { 37 | sum += ashare[ty][kk] * bshare[kk][tx]; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | c[(BLOCK * bx + ty) * n + BLOCK * by + tx] = sum; 43 | } 44 | 45 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 46 | float *d_B, int ldb, float *d_C, int ldc) { 47 | 48 | constexpr int BLOCK = 16; 49 | dim3 block(BLOCK, BLOCK); 50 | dim3 grid((m + BLOCK - 1) / BLOCK, (n + BLOCK - 1) / BLOCK); 51 | 52 | sgemm<<>>(m, n, k, d_A, d_B, d_C); 53 | } 54 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_4.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // a = mxk, b = kxn 10 | template 11 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb, 12 | float *c, int ldc) { 13 | // blockIdx control subpanel matrix 14 | constexpr int STEP = BLOCK * STRIDE; 15 | const int tx = threadIdx.x; 16 | const int ty = threadIdx.y; 17 | const int bx = blockIdx.x; 18 | const int by = blockIdx.y; 19 | 20 | float *begin_a = a + by * STEP * k; 21 | float *begin_b = b + bx * STEP; 22 | float *end_a = begin_a + k; 23 | 24 | float sum[STRIDE][STRIDE] = {0.f}; 25 | for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a; 26 | a_ptr += STEP, b_ptr += STEP * n) { 27 | __shared__ float ashare[STEP][STEP]; 28 | __shared__ float bshare[STEP][STEP]; 29 | 30 | for (int i = 0; i < STRIDE; ++i) { 31 | for (int j = 0; j < STRIDE; ++j) { 32 | ashare[ty * STRIDE + i][tx * STRIDE + j] = 33 | a_ptr[(ty * STRIDE + i) * k + tx * STRIDE + j]; 34 | bshare[ty * STRIDE + i][tx * STRIDE + j] = 35 | b_ptr[(ty * STRIDE + i) * n + tx * STRIDE + j]; 36 | } 37 | } 
38 | __syncthreads(); 39 | 40 | #pragma unroll 41 | for (int i = 0; i < STRIDE; ++i) { 42 | for (int j = 0; j < STRIDE; ++j) { 43 | for (int kk = 0; kk < STEP; ++kk) { 44 | sum[i][j] += 45 | ashare[ty * STRIDE + i][kk] * bshare[kk][tx * STRIDE + j]; 46 | } 47 | } 48 | } 49 | 50 | __syncthreads(); 51 | } 52 | 53 | for (int i = 0; i < STRIDE; ++i) { 54 | for (int j = 0; j < STRIDE; ++j) { 55 | c[(STEP * by + ty * STRIDE + i) * n + STEP * bx + tx * STRIDE + j] = 56 | sum[i][j]; 57 | } 58 | } 59 | } 60 | 61 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 62 | float *d_B, int ldb, float *d_C, int ldc) { 63 | 64 | constexpr int BLOCK = 16; 65 | constexpr int STRIDE = 2; // every thread calc STRIDExSTRIDE result 66 | dim3 block(BLOCK, BLOCK); 67 | dim3 grid((m + BLOCK - 1) / BLOCK / STRIDE, (n + BLOCK - 1) / BLOCK / STRIDE); 68 | 69 | sgemm<<>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc); 70 | } 71 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_5.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // MY_MMult = [ 10 | // 1024 6467.51 7.247925e-05 11 | // 2048 6693.74 1.525879e-04 12 | // 3072 7096.70 2.288818e-04 13 | // 4096 6677.67 4.425049e-04 14 | // ]; 15 | /** 16 | * 和 version4 的区别: 17 | * 1. 修改了分块尺寸 18 | * 2. 
每个 block 有 8x8 个线程,每个线程计算 4x4 个结果 19 | */ 20 | template 21 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb, 22 | float *c, int ldc) { 23 | // blockIdx control subpanel matrix 24 | constexpr int STEP = BLOCK * STRIDE; 25 | const int tx = threadIdx.x * STRIDE; 26 | const int ty = threadIdx.y * STRIDE; 27 | const int bx = blockIdx.x * STEP; 28 | const int by = blockIdx.y * STEP; 29 | 30 | float *begin_a = a + by * k; 31 | float *begin_b = b + bx; 32 | float *end_a = begin_a + k; 33 | 34 | float sum[STRIDE][STRIDE] = {0.f}; 35 | for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a; 36 | a_ptr += STEP, b_ptr += STEP * n) { 37 | __shared__ __align__(16 * 1024) float ashare[STEP][STEP]; 38 | __shared__ __align__(16 * 1024) float bshare[STEP][STEP]; 39 | 40 | for (int i = 0; i < STRIDE; ++i) { 41 | for (int j = 0; j < STRIDE; ++j) { 42 | ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j]; 43 | bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j]; 44 | } 45 | } 46 | __syncthreads(); 47 | 48 | for (int i = 0; i < STRIDE; ++i) { 49 | for (int j = 0; j < STRIDE; ++j) { 50 | for (int kk = 0; kk < STEP; ++kk) { 51 | sum[i][j] += ashare[ty + i][kk] * bshare[kk][tx + j]; 52 | } 53 | } 54 | } 55 | 56 | __syncthreads(); 57 | } 58 | 59 | #pragma unroll 60 | for (int i = 0; i < STRIDE; ++i) { 61 | for (int j = 0; j < STRIDE; ++j) { 62 | c[(by + ty + i) * n + bx + tx + j] = sum[i][j]; 63 | } 64 | } 65 | } 66 | 67 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 68 | float *d_B, int ldb, float *d_C, int ldc) { 69 | 70 | constexpr int BLOCK = 8; 71 | constexpr int STRIDE = 4; // every thread calc STRIDExSTRIDE result 72 | dim3 block(BLOCK, BLOCK); 73 | dim3 grid((m + BLOCK - 1) / BLOCK / STRIDE, (n + BLOCK - 1) / BLOCK / STRIDE); 74 | 75 | sgemm<<>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc); 76 | } 77 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_6.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // a = mxk, b = kxn 10 | template 11 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb, 12 | float *c, int ldc) { 13 | // blockIdx control subpanel matrix 14 | constexpr int STEP = BLOCK * STRIDE; 15 | const int tx = threadIdx.x * STRIDE; 16 | const int ty = threadIdx.y * STRIDE; 17 | const int bx = blockIdx.x * STEP; 18 | const int by = blockIdx.y * STEP; 19 | 20 | float *begin_a = a + by * k; 21 | float *begin_b = b + bx; 22 | float *end_a = begin_a + k; 23 | 24 | float sum[STRIDE][STRIDE] = {0.f}; 25 | 26 | __shared__ float ashare[STEP][2 * STEP]; 27 | __shared__ float bshare[2 * STEP][STEP]; 28 | // bigger split 29 | for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a; 30 | a_ptr += 2 * STEP, b_ptr += 2 * STEP * n) { 31 | 32 | for (int i = 0; i < STRIDE; ++i) { 33 | for (int j = 0; j < STRIDE; ++j) { 34 | ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j]; 35 | ashare[ty + i][tx + j + STEP] = a_ptr[(ty + i) * k + tx + j + STEP]; 36 | 37 | bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j]; 38 | bshare[ty + i + STEP][tx + j] = b_ptr[(ty + i + STEP) * n + tx + j]; 39 | } 40 | } 41 | __syncthreads(); 42 | 43 | for (int i = 0; i < STRIDE; ++i) { 44 | for (int j = 0; j < STRIDE; ++j) { 45 | for (int kk = 0; kk < 2 * STEP; ++kk) { 46 | sum[i][j] += ashare[ty + i][kk] * bshare[kk][tx + j]; 47 | } 48 | } 49 | } 50 | 51 | __syncthreads(); 52 | } 53 | 54 | #pragma unroll 55 | for (int i = 0; i < STRIDE; ++i) { 56 | for (int j = 0; j < STRIDE; ++j) { 57 | c[(by + ty + i) * n + bx + tx + j] = sum[i][j]; 58 | } 59 | } 60 | } 61 | 62 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 63 | float *d_B, int ldb, float *d_C, int ldc) { 64 | 65 | constexpr int BLOCK = 16; 66 | constexpr int STRIDE = 2; // every thread 
calc STRIDExSTRIDE result 67 | dim3 block(BLOCK, BLOCK); 68 | dim3 grid((n + BLOCK - 1) / BLOCK / STRIDE, (m + BLOCK - 1) / BLOCK / STRIDE); 69 | // grid.x tiles the n columns and grid.y tiles the m rows, matching bx/by in sgemm 70 | sgemm<<>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc); 71 | } 72 | -------------------------------------------------------------------------------- /cuda/MMult_cuda_7.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA runtime 5 | #include "helper.h" 6 | #include 7 | #include 8 | 9 | // a = mxk, b = kxn 10 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb, 11 | float *c, int ldc) { 12 | const int tx = (threadIdx.x % 16) * 2; 13 | const int ty = threadIdx.x / 16 * 2; 14 | const int bx = blockIdx.x * 64; 15 | const int by = blockIdx.y * 64; 16 | 17 | float *begin_a = a + by * k; 18 | float *begin_b = b + bx; 19 | float *end_a = begin_a + k; 20 | 21 | __shared__ float ashare[64][64]; 22 | __shared__ float bshare[64][64]; 23 | float sum0[2][2] = {0}; 24 | float sum1[2][2] = {0}; 25 | float sum2[2][2] = {0}; 26 | float sum3[2][2] = {0}; 27 | 28 | // bigger split 29 | for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a; 30 | a_ptr += 64, b_ptr += 64 * n) { 31 | 32 | #pragma unroll 33 | for (int i = 0; i < 2; ++i) { 34 | for (int j = 0; j < 2; ++j) { 35 | ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j]; 36 | ashare[ty + i][tx + j + 32] = a_ptr[(ty + i) * k + tx + j + 32]; 37 | ashare[ty + i + 32][tx + j] = a_ptr[(ty + 32 + i) * k + tx + j]; 38 | ashare[ty + i + 32][tx + j + 32] = 39 | a_ptr[(ty + 32 + i) * k + tx + j + 32]; 40 | 41 | bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j]; 42 | bshare[ty + i][tx + j + 32] = b_ptr[(ty + i) * n + tx + j + 32]; 43 | bshare[ty + i + 32][tx + j] = b_ptr[(ty + i + 32) * n + tx + j]; 44 | bshare[ty + i + 32][tx + j + 32] = 45 | b_ptr[(ty + i + 32) * n + tx + j + 32]; 46 | } 47 | } 48 | __syncthreads(); 49 | 50 | #pragma unroll 51 | for (int i = 0; i < 2; ++i) { 52 | 
for (int j = 0; j < 2; ++j) { 53 | for (int subk = 0; subk < 64; ++subk) { 54 | sum0[i][j] += ashare[ty + i][subk] * bshare[subk][tx + j]; 55 | sum1[i][j] += ashare[ty + i][subk] * bshare[subk][tx + j + 32]; 56 | sum2[i][j] += ashare[ty + i + 32][subk] * bshare[subk][tx + j]; 57 | sum3[i][j] += ashare[ty + i + 32][subk] * bshare[subk][tx + j + 32]; 58 | } 59 | } 60 | } 61 | __syncthreads(); 62 | } 63 | 64 | #pragma unroll 65 | for (int i = 0; i < 2; ++i) { 66 | for (int j = 0; j < 2; ++j) { 67 | c[(by + ty + i) * n + bx + tx + j] = sum0[i][j]; 68 | c[(by + ty + i) * n + bx + tx + 32 + j] = sum1[i][j]; 69 | c[(by + ty + i + 32) * n + bx + tx + j] = sum2[i][j]; 70 | c[(by + ty + i + 32) * n + bx + tx + 32 + j] = sum3[i][j]; 71 | } 72 | } 73 | } 74 | // Launch sgemm with 256 threads per block; each block produces one 64x64 tile of C (m and n are assumed to be multiples of 64). 75 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda, 76 | float *d_B, int ldb, float *d_C, int ldc) { 77 | 78 | dim3 block(256); 79 | dim3 grid(n / 64, m / 64); 80 | // grid.x tiles the n columns and grid.y tiles the m rows, matching bx/by in sgemm 81 | sgemm<<>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc); 82 | } 83 | -------------------------------------------------------------------------------- /cuda/PlotAll.m: -------------------------------------------------------------------------------- 1 | % 2 | % Clear all variables and close all graphs 3 | % 4 | 5 | clear all 6 | close all 7 | 8 | % 9 | % Get max_gflops from /proc/cpuinfo by reading the parameters 10 | % set in file proc_parameters.m 11 | % 12 | 13 | proc_parameters 14 | 15 | max_gflops = nflops_per_cycle * nprocessors * GHz_of_processor; 16 | 17 | % 18 | % Read in the first data set and plot it. 19 | % 20 | 21 | output_old 22 | 23 | version_old = version; 24 | 25 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'bo-.;OLD;' ); 26 | last = size( MY_MMult, 1 ); 27 | 28 | hold on 29 | 30 | axis( [ 0 MY_MMult( last,1 ) 0 max_gflops ] ); 31 | 32 | xlabel( 'm = n = k' ); 33 | ylabel( 'GFLOPS/sec.' ); 34 | 35 | % 36 | % Read in second data set and plot it. 
37 | % 38 | 39 | output_new 40 | 41 | version_new = version 42 | 43 | title_string = sprintf("OLD = %s, NEW = %s", version_old, version_new); 44 | 45 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'r-*;NEW;' ); 46 | 47 | title( title_string ); 48 | 49 | filename = sprintf( "compare_%s_%s", version_old, version_new ); 50 | 51 | print( filename, '-dpng' ); 52 | -------------------------------------------------------------------------------- /cuda/REF_MMult.cpp: -------------------------------------------------------------------------------- 1 | /* Create macros so that the matrices are stored in row-major order */ 2 | #define A(i, j) a[(i)*lda + (j)] 3 | #define B(i, j) b[(i)*ldb + (j)] 4 | #define C(i, j) c[(i)*ldc + (j)] 5 | 6 | #include 7 | /* Routine for computing C = A * B + C */ 8 | 9 | void REF_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb, 10 | float *c, int ldc) { 11 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, lda, 12 | b, ldb, 0.0f, c, ldc); 13 | } 14 | -------------------------------------------------------------------------------- /cuda/compare_matrices.cpp: -------------------------------------------------------------------------------- 1 | #define A(i, j) a[(i)*lda + (j)] 2 | #define B(i, j) b[(i)*ldb + (j)] 3 | #define abs(x) ((x) < 0.0 ? -(x) : (x)) 4 | 5 | #include 6 | 7 | float compare_matrices(int m, int n, float *a, int lda, float *b, int ldb) { 8 | // printf("\n---result----\n"); 9 | // print_matrix(m, n, a, lda); 10 | // printf("\n-------\n"); 11 | // print_matrix(m, n, b, ldb); 12 | // printf("\n-------\n"); 13 | int i, j; 14 | float max_diff = 0.0, diff; 15 | int printed = 0; 16 | 17 | for (i = 0; i < m; i++) { 18 | for (j = 0; j < n; j++) { 19 | diff = abs(A(i, j) - B(i, j)); 20 | max_diff = (diff > max_diff ? 
diff : max_diff); 21 | if (0 == printed) 22 | if (max_diff > 0.5f || max_diff < -0.5f) { 23 | printf("\n error: i %d j %d diff %f got %f expect %f ", i, j, max_diff, A(i, j), B(i, j)); 24 | printed = 1; 25 | } 26 | } 27 | } 28 | 29 | return max_diff; 30 | } 31 | -------------------------------------------------------------------------------- /cuda/copy_matrix.cpp: -------------------------------------------------------------------------------- 1 | #define A(i, j) a[(i)*lda + (j)] 2 | #define B(i, j) b[(i)*ldb + (j)] 3 | 4 | void copy_matrix(int m, int n, float *a, int lda, float *b, int ldb) { 5 | int i, j; 6 | 7 | for (j = 0; j < n; j++) 8 | for (i = 0; i < m; i++) 9 | B(i, j) = A(i, j); 10 | } 11 | -------------------------------------------------------------------------------- /cuda/dclock.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | static double gtod_ref_time_sec = 0.0; 5 | 6 | /* Adapted from the bl2_clock() routine in the BLIS library */ 7 | 8 | double dclock() { 9 | double the_time, norm_sec; 10 | struct timeval tv; 11 | 12 | gettimeofday(&tv, NULL); 13 | 14 | if (gtod_ref_time_sec == 0.0) 15 | gtod_ref_time_sec = (double)tv.tv_sec; 16 | 17 | norm_sec = (double)tv.tv_sec - gtod_ref_time_sec; 18 | 19 | the_time = norm_sec + tv.tv_usec * 1.0e-6; 20 | 21 | return the_time; 22 | } 23 | -------------------------------------------------------------------------------- /cuda/helper.h: -------------------------------------------------------------------------------- 1 | #pragma _HELPER_H_ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | template 8 | void check(T result, char const *const func, const char *const file, 9 | int const line) { 10 | if (result) { 11 | fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, 12 | static_cast(result), func); 13 | exit(EXIT_FAILURE); 14 | } 15 | } 16 | 17 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 18 | 
-------------------------------------------------------------------------------- /cuda/makefile: -------------------------------------------------------------------------------- 1 | OLD := MMult_cuBLAS_1 2 | #NEW := MMult_cuBLAS_1 3 | NEW := MMult_cuda_3 4 | SMS ?= 70 75 80 86 5 | 6 | # 7 | # sample makefile 8 | # 9 | 10 | CC := nvcc 11 | LINKER := $(CC) 12 | #CFLAGS := -O0 -g -Wall 13 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 14 | # CFLAGS := -std=c++17 -O0 -g -G 15 | CFLAGS := -std=c++17 -O2 16 | LDFLAGS := -lm -lcublas -lopenblas 17 | 18 | UTIL := copy_matrix.o \ 19 | compare_matrices.o \ 20 | random_matrix.o \ 21 | dclock.o \ 22 | REF_MMult.o \ 23 | print_matrix.o 24 | 25 | TEST_OBJS := test_MMult.o $(NEW).o 26 | 27 | %.o: %.cpp 28 | $(CC) $(CFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 29 | 30 | %.o: %.cu 31 | $(CC) $(CFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 32 | 33 | all: 34 | make clean; 35 | make test_MMult.x 36 | 37 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h 38 | $(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \ 39 | $(BLAS_LIB) -o $(TEST_BIN) $@ 40 | 41 | run: 42 | make all 43 | echo "version = '$(NEW)';" > output_$(NEW).m 44 | ./test_MMult.x >> output_$(NEW).m 45 | cp output_$(OLD).m output_old.m 46 | cp output_$(NEW).m output_new.m 47 | 48 | clean: 49 | rm -f *.o *~ core *.x 50 | 51 | cleanall: 52 | rm -f *.o *~ core *.x output*.m *.eps *.png 53 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuBLAS_1.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuBLAS_1'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 10637.93 3.242493e-05 6 | 1152 16397.92 3.480911e-05 7 | 1280 16559.61 5.626678e-05 8 | 1408 13734.41 5.340576e-05 9 | 1536 14581.54 5.340576e-05 10 | 1664 14285.89 4.577637e-05 11 | 1792 13704.07 5.340576e-05 12 | 1920 13878.71 5.912781e-05 13 | 2048 
16339.17 1.564026e-04 14 | 2176 12957.95 1.716614e-04 15 | 2304 16535.38 1.716614e-04 16 | 2432 12519.21 8.392334e-05 17 | 2560 16971.11 1.945496e-04 18 | 2688 18144.32 1.907349e-04 19 | 2816 12950.36 2.193451e-04 20 | 2944 16634.41 2.593994e-04 21 | 3072 17836.77 2.441406e-04 22 | 3200 12842.99 2.746582e-04 23 | 3328 16601.17 3.280640e-04 24 | 3456 16300.24 3.280640e-04 25 | 3584 12411.39 1.029968e-04 26 | 3712 17320.47 3.280640e-04 27 | 3840 14158.14 3.738403e-04 28 | 3968 13989.40 3.738403e-04 29 | 4096 14217.98 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuBLAS_2.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuBLAS_2'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 10768.11 3.242493e-05 6 | 1152 16446.89 3.480911e-05 7 | 1280 16571.86 5.626678e-05 8 | 1408 13898.33 5.340576e-05 9 | 1536 14489.48 5.340576e-05 10 | 1664 14472.10 4.577637e-05 11 | 1792 13926.61 5.340576e-05 12 | 1920 13842.92 5.912781e-05 13 | 2048 16421.74 1.564026e-04 14 | 2176 13009.90 1.716614e-04 15 | 2304 16599.08 1.716614e-04 16 | 2432 12421.43 8.392334e-05 17 | 2560 16870.40 1.945496e-04 18 | 2688 17895.05 1.907349e-04 19 | 2816 12906.06 2.193451e-04 20 | 2944 16560.19 2.593994e-04 21 | 3072 17843.88 2.441406e-04 22 | 3200 12758.92 2.746582e-04 23 | 3328 16618.32 3.280640e-04 24 | 3456 16333.63 3.280640e-04 25 | 3584 12356.31 1.029968e-04 26 | 3712 17338.03 3.280640e-04 27 | 3840 14111.53 3.738403e-04 28 | 3968 13971.58 3.738403e-04 29 | 4096 14184.42 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_10.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_10'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 
10860.91 7.247925e-05 6 | 1152 9017.44 8.392334e-05 7 | 1280 11176.41 1.068115e-04 8 | 1408 13409.62 1.258850e-04 9 | 1536 10387.59 1.182556e-04 10 | 1664 12207.25 1.258850e-04 11 | 1792 14192.14 1.373291e-04 12 | 1920 12808.66 1.907349e-04 13 | 2048 15352.21 1.564026e-04 14 | 2176 13981.38 1.716614e-04 15 | 2304 15379.50 1.716614e-04 16 | 2432 14650.03 1.831055e-04 17 | 2560 15833.78 1.945496e-04 18 | 2688 15134.27 1.907349e-04 19 | 2816 14683.54 2.193451e-04 20 | 2944 15758.60 2.593994e-04 21 | 3072 15407.97 2.441406e-04 22 | 3200 15168.65 2.746582e-04 23 | 3328 16030.18 3.280640e-04 24 | 3456 15862.35 3.280640e-04 25 | 3584 15620.07 2.899170e-04 26 | 3712 15600.41 3.280640e-04 27 | 3840 15479.32 3.738403e-04 28 | 3968 15421.87 3.738403e-04 29 | 4096 15388.22 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_11.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_11'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 12636.12 7.247925e-05 6 | 1152 10658.86 8.392334e-05 7 | 1280 13162.70 1.068115e-04 8 | 1408 15935.04 1.258850e-04 9 | 1536 11895.09 1.182556e-04 10 | 1664 14023.52 1.258850e-04 11 | 1792 16268.22 1.373291e-04 12 | 1920 14974.50 1.907349e-04 13 | 2048 16748.66 1.564026e-04 14 | 2176 15306.64 1.716614e-04 15 | 2304 17145.08 1.716614e-04 16 | 2432 16403.14 1.831055e-04 17 | 2560 17659.38 1.945496e-04 18 | 2688 16749.79 1.907349e-04 19 | 2816 16429.85 2.193451e-04 20 | 2944 17211.17 2.593994e-04 21 | 3072 16926.06 2.441406e-04 22 | 3200 16721.53 2.746582e-04 23 | 3328 17413.48 3.280640e-04 24 | 3456 17003.49 3.280640e-04 25 | 3584 17123.20 2.899170e-04 26 | 3712 16784.96 3.280640e-04 27 | 3840 16988.37 3.738403e-04 28 | 3968 16687.82 3.738403e-04 29 | 4096 16831.67 3.509521e-04 30 | ]; 31 | 
-------------------------------------------------------------------------------- /cuda/output_MMult_cuda_12.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_12'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 16257.53 7.247925e-05 6 | 1152 20440.02 8.392334e-05 7 | 1280 14655.91 1.068115e-04 8 | 1408 17732.82 1.258850e-04 9 | 1536 20937.86 1.182556e-04 10 | 1664 16541.26 1.258850e-04 11 | 1792 19023.14 1.373291e-04 12 | 1920 21486.99 1.907349e-04 13 | 2048 18795.48 1.564026e-04 14 | 2176 20553.09 1.716614e-04 15 | 2304 22417.96 1.716614e-04 16 | 2432 20389.30 1.831055e-04 17 | 2560 21924.48 1.945496e-04 18 | 2688 20747.71 1.907349e-04 19 | 2816 22255.11 2.193451e-04 20 | 2944 20980.56 2.593994e-04 21 | 3072 20402.22 2.441406e-04 22 | 3200 21205.12 2.746582e-04 23 | 3328 20986.45 3.280640e-04 24 | 3456 21859.21 3.280640e-04 25 | 3584 21398.98 2.899170e-04 26 | 3712 20981.10 3.280640e-04 27 | 3840 21836.18 3.738403e-04 28 | 3968 21428.58 3.738403e-04 29 | 4096 21410.87 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_2.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_2'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 449.47 7.247925e-05 6 | 1152 484.94 8.392334e-05 7 | 1280 473.72 1.068115e-04 8 | 1408 483.25 1.258850e-04 9 | 1536 477.37 1.182556e-04 10 | 1664 480.06 1.258850e-04 11 | 1792 482.28 1.373291e-04 12 | 1920 474.67 1.907349e-04 13 | 2048 483.35 1.564026e-04 14 | 2176 474.56 1.716614e-04 15 | 2304 477.77 1.716614e-04 16 | 2432 476.35 1.831055e-04 17 | 2560 465.45 1.945496e-04 18 | 2688 474.31 1.907349e-04 19 | 2816 481.86 2.193451e-04 20 | 2944 482.28 2.593994e-04 21 | 3072 476.64 2.441406e-04 22 | 3200 476.94 2.746582e-04 23 | 3328 485.21 
3.280640e-04 24 | 3456 484.32 3.280640e-04 25 | 3584 484.52 2.899170e-04 26 | 3712 482.32 3.280640e-04 27 | 3840 471.74 3.738403e-04 28 | 3968 483.43 3.738403e-04 29 | 4096 481.02 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_3.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_3'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 2348.83 7.247925e-05 6 | 1152 2368.86 8.392334e-05 7 | 1280 2696.05 1.068115e-04 8 | 1408 2685.68 1.258850e-04 9 | 1536 2663.44 1.182556e-04 10 | 1664 2664.02 1.258850e-04 11 | 1792 2667.68 1.373291e-04 12 | 1920 2674.12 1.907349e-04 13 | 2048 2673.12 1.564026e-04 14 | 2176 2669.83 1.716614e-04 15 | 2304 2659.31 1.716614e-04 16 | 2432 2653.01 1.831055e-04 17 | 2560 2649.64 1.945496e-04 18 | 2688 2643.43 1.907349e-04 19 | 2816 2637.53 2.193451e-04 20 | 2944 2631.19 2.593994e-04 21 | 3072 2628.32 2.441406e-04 22 | 3200 2626.45 2.746582e-04 23 | 3328 2594.80 3.280640e-04 24 | 3456 2588.46 3.280640e-04 25 | 3584 2588.16 2.899170e-04 26 | 3712 2590.75 3.280640e-04 27 | 3840 2601.79 3.738403e-04 28 | 3968 2600.13 3.738403e-04 29 | 4096 2604.26 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_4.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_4'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 6109.22 7.247925e-05 6 | 1152 6208.11 8.392334e-05 7 | 1280 6391.93 1.068115e-04 8 | 1408 6413.10 1.258850e-04 9 | 1536 6490.45 1.182556e-04 10 | 1664 6592.70 1.258850e-04 11 | 1792 6491.07 1.373291e-04 12 | 1920 6541.92 1.907349e-04 13 | 2048 6522.68 1.564026e-04 14 | 2176 6503.46 1.716614e-04 15 | 2304 6550.76 1.716614e-04 16 | 2432 6531.24 1.831055e-04 17 | 2560 
6503.59 1.945496e-04 18 | 2688 6515.36 1.907349e-04 19 | 2816 6509.47 2.193451e-04 20 | 2944 6453.46 2.593994e-04 21 | 3072 6495.38 2.441406e-04 22 | 3200 6483.08 2.746582e-04 23 | 3328 6460.18 3.280640e-04 24 | 3456 6496.13 3.280640e-04 25 | 3584 6501.23 2.899170e-04 26 | 3712 6524.08 3.280640e-04 27 | 3840 6499.54 3.738403e-04 28 | 3968 6487.26 3.738403e-04 29 | 4096 6505.56 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_5.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_5'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 6476.59 7.247925e-05 6 | 1152 6727.65 8.392334e-05 7 | 1280 6989.59 1.068115e-04 8 | 1408 6370.10 1.258850e-04 9 | 1536 6755.63 1.182556e-04 10 | 1664 7304.07 1.258850e-04 11 | 1792 7107.59 1.373291e-04 12 | 1920 7109.28 1.907349e-04 13 | 2048 6943.20 1.564026e-04 14 | 2176 7306.77 1.716614e-04 15 | 2304 7060.76 1.716614e-04 16 | 2432 7053.52 1.831055e-04 17 | 2560 7173.36 1.945496e-04 18 | 2688 7071.53 1.907349e-04 19 | 2816 6955.60 2.193451e-04 20 | 2944 6923.04 2.593994e-04 21 | 3072 6969.85 2.441406e-04 22 | 3200 6918.04 2.746582e-04 23 | 3328 6844.63 3.280640e-04 24 | 3456 6825.10 3.280640e-04 25 | 3584 6766.01 2.899170e-04 26 | 3712 6780.47 3.280640e-04 27 | 3840 6814.50 3.738403e-04 28 | 3968 6746.66 3.738403e-04 29 | 4096 6534.27 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_6.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_6'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 3165.02 7.247925e-05 6 | 1152 3219.39 8.392334e-05 7 | 1280 3311.29 1.068115e-04 8 | 1408 3318.39 1.258850e-04 9 | 1536 3735.15 1.182556e-04 10 | 1664 3694.61 1.258850e-04 
11 | 1792 3671.32 1.373291e-04 12 | 1920 3711.67 1.907349e-04 13 | 2048 3668.11 1.564026e-04 14 | 2176 3714.72 1.716614e-04 15 | 2304 3699.94 1.716614e-04 16 | 2432 3719.28 1.831055e-04 17 | 2560 3667.40 1.945496e-04 18 | 2688 3695.71 1.907349e-04 19 | 2816 3678.39 2.193451e-04 20 | 2944 3683.63 2.593994e-04 21 | 3072 3671.62 2.441406e-04 22 | 3200 3665.02 2.746582e-04 23 | 3328 3658.52 3.280640e-04 24 | 3456 3669.57 3.280640e-04 25 | 3584 3664.37 2.899170e-04 26 | 3712 3668.71 3.280640e-04 27 | 3840 3674.00 3.738403e-04 28 | 3968 3677.78 3.738403e-04 29 | 4096 3670.64 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_7.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_7'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 4799.10 7.247925e-05 6 | 1152 6046.16 8.392334e-05 7 | 1280 6020.74 1.068115e-04 8 | 1408 6103.93 1.258850e-04 9 | 1536 5485.28 1.182556e-04 10 | 1664 6441.97 1.258850e-04 11 | 1792 6729.21 1.373291e-04 12 | 1920 6952.87 1.907349e-04 13 | 2048 6714.18 1.564026e-04 14 | 2176 6628.49 1.716614e-04 15 | 2304 6876.40 1.716614e-04 16 | 2432 6800.96 1.831055e-04 17 | 2560 6763.13 1.945496e-04 18 | 2688 6786.04 1.907349e-04 19 | 2816 6795.07 2.193451e-04 20 | 2944 6847.96 2.593994e-04 21 | 3072 6692.63 2.441406e-04 22 | 3200 6743.39 2.746582e-04 23 | 3328 6839.06 3.280640e-04 24 | 3456 6744.52 3.280640e-04 25 | 3584 6679.47 2.899170e-04 26 | 3712 6643.94 3.280640e-04 27 | 3840 6731.34 3.738403e-04 28 | 3968 6725.52 3.738403e-04 29 | 4096 6730.59 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_8.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_8'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | 
MY_MMult = [ 5 | 1024 4820.90 7.247925e-05 6 | 1152 4996.40 8.392334e-05 7 | 1280 5156.29 1.068115e-04 8 | 1408 4870.39 1.258850e-04 9 | 1536 5387.88 1.182556e-04 10 | 1664 5544.25 1.258850e-04 11 | 1792 5585.79 1.373291e-04 12 | 1920 5577.91 1.907349e-04 13 | 2048 5486.57 1.564026e-04 14 | 2176 5569.79 1.716614e-04 15 | 2304 5461.78 1.716614e-04 16 | 2432 5444.76 1.831055e-04 17 | 2560 5508.09 1.945496e-04 18 | 2688 5434.42 1.907349e-04 19 | 2816 5395.18 2.193451e-04 20 | 2944 5418.13 2.593994e-04 21 | 3072 5443.66 2.441406e-04 22 | 3200 5413.37 2.746582e-04 23 | 3328 5363.66 3.280640e-04 24 | 3456 5364.71 3.280640e-04 25 | 3584 5336.84 2.899170e-04 26 | 3712 5355.17 3.280640e-04 27 | 3840 5362.73 3.738403e-04 28 | 3968 5306.01 3.738403e-04 29 | 4096 5220.77 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_MMult_cuda_9.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_9'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 10787.91 7.247925e-05 6 | 1152 9263.87 8.392334e-05 7 | 1280 11465.92 1.068115e-04 8 | 1408 13740.90 1.258850e-04 9 | 1536 10607.32 1.182556e-04 10 | 1664 12420.91 1.258850e-04 11 | 1792 14453.16 1.373291e-04 12 | 1920 13160.41 1.907349e-04 13 | 2048 14995.85 1.564026e-04 14 | 2176 13423.66 1.716614e-04 15 | 2304 15407.11 1.716614e-04 16 | 2432 14797.16 1.831055e-04 17 | 2560 15988.39 1.945496e-04 18 | 2688 15164.42 1.907349e-04 19 | 2816 14900.56 2.193451e-04 20 | 2944 15936.94 2.593994e-04 21 | 3072 15665.25 2.441406e-04 22 | 3200 15319.37 2.746582e-04 23 | 3328 16216.18 3.280640e-04 24 | 3456 15997.54 3.280640e-04 25 | 3584 15892.93 2.899170e-04 26 | 3712 15742.18 3.280640e-04 27 | 3840 15672.13 3.738403e-04 28 | 3968 15652.81 3.738403e-04 29 | 4096 15611.22 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- 
/cuda/output_new.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuda_7'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 4799.10 7.247925e-05 6 | 1152 6046.16 8.392334e-05 7 | 1280 6020.74 1.068115e-04 8 | 1408 6103.93 1.258850e-04 9 | 1536 5485.28 1.182556e-04 10 | 1664 6441.97 1.258850e-04 11 | 1792 6729.21 1.373291e-04 12 | 1920 6952.87 1.907349e-04 13 | 2048 6714.18 1.564026e-04 14 | 2176 6628.49 1.716614e-04 15 | 2304 6876.40 1.716614e-04 16 | 2432 6800.96 1.831055e-04 17 | 2560 6763.13 1.945496e-04 18 | 2688 6786.04 1.907349e-04 19 | 2816 6795.07 2.193451e-04 20 | 2944 6847.96 2.593994e-04 21 | 3072 6692.63 2.441406e-04 22 | 3200 6743.39 2.746582e-04 23 | 3328 6839.06 3.280640e-04 24 | 3456 6744.52 3.280640e-04 25 | 3584 6679.47 2.899170e-04 26 | 3712 6643.94 3.280640e-04 27 | 3840 6731.34 3.738403e-04 28 | 3968 6725.52 3.738403e-04 29 | 4096 6730.59 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/output_old.m: -------------------------------------------------------------------------------- 1 | version = 'MMult_cuBLAS_1'; 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6 3 | 4 | MY_MMult = [ 5 | 1024 10637.93 3.242493e-05 6 | 1152 16397.92 3.480911e-05 7 | 1280 16559.61 5.626678e-05 8 | 1408 13734.41 5.340576e-05 9 | 1536 14581.54 5.340576e-05 10 | 1664 14285.89 4.577637e-05 11 | 1792 13704.07 5.340576e-05 12 | 1920 13878.71 5.912781e-05 13 | 2048 16339.17 1.564026e-04 14 | 2176 12957.95 1.716614e-04 15 | 2304 16535.38 1.716614e-04 16 | 2432 12519.21 8.392334e-05 17 | 2560 16971.11 1.945496e-04 18 | 2688 18144.32 1.907349e-04 19 | 2816 12950.36 2.193451e-04 20 | 2944 16634.41 2.593994e-04 21 | 3072 17836.77 2.441406e-04 22 | 3200 12842.99 2.746582e-04 23 | 3328 16601.17 3.280640e-04 24 | 3456 16300.24 3.280640e-04 25 | 3584 12411.39 1.029968e-04 26 | 3712 17320.47 
3.280640e-04 27 | 3840 14158.14 3.738403e-04 28 | 3968 13989.40 3.738403e-04 29 | 4096 14217.98 3.509521e-04 30 | ]; 31 | -------------------------------------------------------------------------------- /cuda/parameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )" 3 | The below parameters set this range of values that p takes on 4 | */ 5 | #define PFIRST 1024 6 | #define PLAST 4096 7 | #define PINC 128 8 | 9 | /* 10 | In the test driver, the m, n, and k dimensions are set to the below 11 | values. If the value equals "-1" then that dimension is bound to the 12 | index p, given above. 13 | */ 14 | 15 | #define M -1 16 | #define N -1 17 | #define K -1 18 | 19 | /* 20 | In the test driver, each experiment is repeated NREPEATS times and 21 | the best time from these repeats is used to compute the performance 22 | */ 23 | 24 | #define NREPEATS 20 25 | 26 | /* 27 | Matrices A, B, and C are stored in two dimensional arrays with 28 | row dimensions that are greater than or equal to the row dimension 29 | of the matrix. This row dimension of the array is known as the 30 | "leading dimension" and determines the stride (the number of 31 | double precision numbers) when one goes from one element in a row 32 | to the next. Having this number larger than the row dimension of 33 | the matrix tends to adversely affect performance. LDX equals the 34 | leading dimension of the array that stores matrix X. If LDX=-1 35 | then the leading dimension is set to the row dimension of matrix X. 
36 | */ 37 | -------------------------------------------------------------------------------- /cuda/plot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | def readFile(filename): 6 | f = open(filename) 7 | sizes = [] 8 | times = [] 9 | title = '' 10 | try: 11 | title = f.readline() 12 | # skip the next 3 lines (device banner, blank line, "MY_MMult = [") 13 | f.readline() 14 | f.readline() 15 | f.readline() 16 | for line in f: 17 | # iterating the file stops at EOF, so a table without the closing "];" row cannot loop forever 18 | if line: 19 | slices = line.split(" ") 20 | if len(slices) <= 2: 21 | break 22 | size = int(slices[0]) 23 | time = float(slices[1]) 24 | sizes.append(size) 25 | times.append(time) 26 | finally: 27 | f.close() 28 | return title, sizes, times 29 | 30 | if __name__ == '__main__': 31 | plt.xlabel('shape') 32 | plt.ylabel('gflops') 33 | l = len(sys.argv) 34 | for i,item in enumerate(sys.argv): 35 | if i == 0: 36 | continue 37 | t,x,y = readFile(item) 38 | plt.plot(x,y,label=t) 39 | plt.legend() 40 | plt.show() 41 | 42 | -------------------------------------------------------------------------------- /cuda/print_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define A(i, j) a[(i)*lda + (j)] 4 | 5 | void print_matrix(int m, int n, float *a, int lda) { 6 | int i, j; 7 | 8 | for (i = 0; i < m; i++) { 9 | for (j = 0; j < n; j++) { 10 | printf("%.1f\t", A(i, j)); 11 | } 12 | printf("\n"); 13 | } 14 | printf("\n"); 15 | } 16 | -------------------------------------------------------------------------------- /cuda/proc_parameters.m: -------------------------------------------------------------------------------- 1 | % Indicate the number of floating point operations that can be executed 2 | % per clock cycle 3 | % 4 | 5 | nflops_per_cycle = 4; 6 | 7 | % 8 | % Indicate the number of processors being used (in case you are using a 9 | % multicore or SMP) 10 | % 11 | 12 | nprocessors = 1; 13 | 14 | % 15 | 
% Indicate the clock speed of the processor. On a Linux machine this info 16 | % can be found in the file /proc/cpuinfo 17 | % 18 | % Note: some processors have a "turbo boost" mode, which increases 19 | % the peak clock rate... 20 | % 21 | 22 | GHz_of_processor = 2.6; 23 | -------------------------------------------------------------------------------- /cuda/random_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | /* Row-major element access (row stride lda), the same layout REF_MMult.cpp and print_matrix.cpp use */ 3 | #define A(i, j) a[(i)*lda + (j)] 4 | /* Fill the m x n matrix a with values drawn from drand48() and mapped to [-1, 1). */ 5 | double drand48(); 6 | void random_matrix(int m, int n, float *a, int lda) { 7 | int i, j; 8 | 9 | for (i = 0; i < m; i++) 10 | for (j = 0; j < n; j++) 11 | #if 1 12 | A(i, j) = 2.0 * (float)drand48() - 1.0; 13 | #else 14 | A(i, j) = (j - i) % 3; 15 | #endif 16 | } 17 | -------------------------------------------------------------------------------- /images/aarch64-fp32-peak-vs-int8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/images/aarch64-fp32-peak-vs-int8.png -------------------------------------------------------------------------------- /images/cublas-vs-MMult_cuda_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/images/cublas-vs-MMult_cuda_12.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | -------------------------------------------------------------------------------- /vulkan/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | run.sh 3 | tmp_kp_shader.comp 4 | tmp_kp_shader.comp.spv 5 | test1 6 | test2 7 | 
-------------------------------------------------------------------------------- /vulkan/MMult_vk_2.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_EXT_control_flow_attributes : enable 3 | 4 | layout (local_size_x = 16, local_size_y = 16) in; 5 | 6 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; }; 7 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; }; 8 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; }; 9 | 10 | layout (constant_id = 0) const float tensor_size_f = 0; 11 | 12 | shared float sub_tensor_1[16][16]; 13 | shared float sub_tensor_2[16][16]; 14 | 15 | void main() { 16 | uint block = 16; 17 | uint tensor_size = uint(tensor_size_f); 18 | uint loop = tensor_size / block; 19 | 20 | uvec3 threadID = gl_LocalInvocationID; 21 | 22 | uint globalCol = gl_WorkGroupID.y * block +threadID.y; 23 | uint globalRow = gl_WorkGroupID.x * block + threadID.x; 24 | 25 | float acc = 0.0; 26 | [[unroll]] for (uint i = 0u; i < loop; ++i) { 27 | sub_tensor_1[threadID.y][threadID.x] = in_tensor_1[tensor_size * globalCol + i * block + threadID.x]; 28 | sub_tensor_2[threadID.y][threadID.x] = in_tensor_2[tensor_size * (i * block + threadID.y) + globalRow]; 29 | 30 | // memoryBarrierShared(); 31 | barrier(); 32 | 33 | #if 1 34 | for (uint k = 0u; k < block; ++k) { 35 | acc += sub_tensor_1[threadID.y][k] * sub_tensor_2[k][threadID.x]; 36 | } 37 | #else 38 | for (uint k = 0u; k < block; k+=4) { 39 | vec4 a; 40 | a.r = sub_tensor_1[threadID.y][k]; 41 | a.g = sub_tensor_1[threadID.y][k+1]; 42 | a.b = sub_tensor_1[threadID.y][k+2]; 43 | a.a = sub_tensor_1[threadID.y][k+3]; 44 | 45 | vec4 b; 46 | b.r = sub_tensor_2[k][threadID.x]; 47 | b.g = sub_tensor_2[k+1][threadID.x]; 48 | b.b = sub_tensor_2[k+2][threadID.x]; 49 | b.a = sub_tensor_2[k+3][threadID.x]; 50 | 51 | acc += dot(a, b); 52 | } 53 | #endif 54 | 
barrier(); 55 | } 56 | 57 | out_tensor[(globalCol * tensor_size) + globalRow] = acc; 58 | } 59 | -------------------------------------------------------------------------------- /vulkan/MMult_vk_2.cpp: -------------------------------------------------------------------------------- 1 | #define SPDLOG_ACTIVE_LEVEL 5 2 | #include "Shader.hpp" 3 | #include "kompute/Kompute.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | // MY_MMult = [ 9 | // 64 36.24 0.000000e+00 10 | // 128 48.75 0.000000e+00 11 | // 192 51.88 0.000000e+00 12 | // 256 52.91 0.000000e+00 13 | // 320 53.30 0.000000e+00 14 | // 384 53.51 0.000000e+00 15 | // 448 53.64 0.000000e+00 16 | // 512 53.72 0.000000e+00 17 | // ]; 18 | float kompute(const std::string &comp, uint32_t m, uint32_t k, 19 | uint32_t n, float *a, float *b, float *c) { 20 | constexpr uint32_t local_size = 16; 21 | kp::Manager mgr; 22 | 23 | // Create and initialise Kompute Tensors through manager 24 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 25 | auto tensorInA = mgr.tensor(a, m * k, sizeof(float), dtype); 26 | auto tensorInB = mgr.tensor(b, k * n, sizeof(float), dtype); 27 | auto tensorInC = mgr.tensor(c, m * n, sizeof(float), dtype); 28 | 29 | std::vector> params = {tensorInA, tensorInB, 30 | tensorInC}; 31 | 32 | // Create algorithm based on shader (supports buffers & push/spec constants) 33 | kp::Workgroup workgroup({m / local_size, n / local_size, 1}); 34 | 35 | auto algorithm = 36 | mgr.algorithm(params, compileFile(comp), workgroup, {k * 1.f}); 37 | 38 | mgr.sequence()->record(params)->eval(); 39 | // use weired vk timestamps 40 | auto seq = mgr.sequence(0, 1); 41 | seq->record(algorithm)->eval(); 42 | 43 | mgr.sequence()->record(params)->eval(); 44 | 45 | auto timestamps = seq->getTimestamps(); 46 | auto computecost = timestamps[1] - timestamps[0]; 47 | memcpy(c, tensorInC->data(), m * n * sizeof(float)); 48 | return computecost/1e6f; 49 | } 50 | 51 | float MY_MMult(int m, int n, int k, float *a, float *b, float 
*c) { 52 | return kompute("MMult_vk_2.comp", static_cast(m), 53 | static_cast(n), static_cast(k), a, b, c); 54 | } 55 | -------------------------------------------------------------------------------- /vulkan/MMult_vk_3.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (local_size_x = 16, local_size_y = 16) in; 4 | 5 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { vec4 in_tensor_a[]; }; 6 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_b[]; }; 7 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; }; 8 | 9 | layout (constant_id = 0) const float tensor_size_f = 0; 10 | 11 | void main() 12 | { 13 | uint block = 16; 14 | uint tensor_size = uint(tensor_size_f); 15 | uint lda = tensor_size / 4; 16 | uint ldb = tensor_size; 17 | uint ldc = tensor_size; 18 | uint loop_k = tensor_size / 4; 19 | 20 | uint globalRow = gl_WorkGroupID.x * block + gl_LocalInvocationID.x; 21 | uint globalCol = gl_WorkGroupID.y * block +gl_LocalInvocationID.y; 22 | 23 | float acc = 0.0; 24 | for(uint k = 0u; k < loop_k; k++) { 25 | vec4 a = in_tensor_a[(globalCol * lda) + k]; 26 | vec4 b; 27 | b.r = in_tensor_b[(k * 4 * ldb) + globalRow]; 28 | b.g = in_tensor_b[((k * 4 + 1) * ldb) + globalRow]; 29 | b.b = in_tensor_b[((k * 4 + 2)* ldb) + globalRow]; 30 | b.a = in_tensor_b[((k * 4 + 3)* ldb) + globalRow]; 31 | 32 | acc += dot(a, b); 33 | } 34 | out_tensor[(globalCol * ldc) + globalRow] = acc; 35 | } -------------------------------------------------------------------------------- /vulkan/MMult_vk_3.cpp: -------------------------------------------------------------------------------- 1 | #define SPDLOG_ACTIVE_LEVEL 5 2 | #include "Shader.hpp" 3 | #include "kompute/Kompute.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | // MY_MMult = [ 9 | // 64 20.13 2.861023e-06 10 | // 128 22.85 5.722046e-06 11 | // 192 23.48 1.144409e-05 12 | // 256 23.24 
1.716614e-05 13 | // 320 23.26 2.098083e-05 14 | // 384 23.29 2.288818e-05 15 | // 448 23.30 2.861023e-05 16 | // 512 23.31 3.433228e-05 17 | // ]; 18 | float kompute(const std::string &comp, uint32_t m, uint32_t k, 19 | uint32_t n, float *a, float *b, float *c) { 20 | constexpr uint32_t local_size = 16; 21 | kp::Manager mgr; 22 | 23 | // Create and initialise Kompute Tensors through manager 24 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 25 | auto tensorInA = mgr.tensor(a, m * k, sizeof(float), dtype); 26 | auto tensorInB = mgr.tensor(b, k * n, sizeof(float), dtype); 27 | auto tensorInC = mgr.tensor(c, m * n, sizeof(float), dtype); 28 | 29 | std::vector> params = {tensorInA, tensorInB, 30 | tensorInC}; 31 | 32 | // Create algorithm based on shader (supports buffers & push/spec constants) 33 | kp::Workgroup workgroup({m / local_size, n / local_size, 1}); 34 | 35 | auto algorithm = 36 | mgr.algorithm(params, compileFile(comp), workgroup, {k * 1.f}); 37 | 38 | auto seq = mgr.sequence(0, 3); 39 | seq->record(params) 40 | ->record(algorithm) 41 | ->record(params)->eval(); 42 | 43 | auto timestamps = seq->getTimestamps(); 44 | auto computecost = timestamps[2] - timestamps[1]; 45 | memcpy(c, tensorInC->data(), m * n * sizeof(float)); 46 | return computecost/1e6f; 47 | } 48 | 49 | float MY_MMult(int m, int n, int k, float *a, float *b, float *c) { 50 | return kompute("MMult_vk_3.comp", static_cast(m), 51 | static_cast(n), static_cast(k), a, b, c); 52 | } 53 | -------------------------------------------------------------------------------- /vulkan/README.md: -------------------------------------------------------------------------------- 1 | # How to Build 2 | 3 | ## Fetch Vulkan SDK 4 | 5 | ```bash 6 | $ wget https://sdk.lunarg.com/sdk/download/1.3.204.1/linux/vulkansdk-linux-x86_64-1.3.204.1.tar.gz 7 | $ tar xvf vulkansdk-linux-x86_64-1.3.204.1.tar.gz 8 | $ export VULKAN_SDK=/path/to/1.3.204.1/x86_64 9 | ``` 10 | 11 | ## Build and Install `glslangValidator` 12 | 
13 | ```bash 14 | $ git clone https://github.com/KhronosGroup/glslang.git --recursive --depth=1 15 | $ cd glslang 16 | $ ./update_glslang_sources.py 17 | $ mkdir -p build && cd build && cmake -DCMAKE_INSTALL_PREFIX="/path/to/glslang/install" .. 18 | $ make && make install 19 | $ export PATH=/path/to/glslang/install/bin:$PATH 20 | ``` 21 | 22 | ## Build and Install `kompute` 23 | 24 | ```bash 25 | $ git clone https://github.com/KomputeProject/kompute --depth=1 --recursive 26 | $ cd kompute 27 | $ mkdir -p build && cd build 28 | $ cmake -DCMAKE_INSTALL_PREFIX="/path/to/kompute/install" .. 29 | $ make && make install 30 | ``` 31 | 32 | ## Build 33 | Now that we have `libkompute.a` and `glslangValidator`, edit the makefile and compile our GEMM implementation. 34 | ```bash 35 | $ vim makefile 36 | # update KOMPUTE_BUILD 37 | $ export CPLUS_INCLUDE_PATH=`pwd` 38 | $ make 39 | ... 40 | ``` 41 | 42 | ## Run 43 | On Jetson Nano, enable MAXN power mode first. 44 | 45 | ```bash 46 | $ sudo jetson_clocks 47 | ... 48 | $ ./test_MMult.x 49 | ... 
50 | ``` 51 | -------------------------------------------------------------------------------- /vulkan/REF_MMult.cpp: -------------------------------------------------------------------------------- 1 | /* Create macros so that the matrices are stored in row-major order */ 2 | 3 | #if 0 4 | #include 5 | /* Routine for computing C = A * B + C */ 6 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) { 7 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, k, 8 | b, n, 0.0f, c, n); 9 | } 10 | 11 | #else 12 | 13 | #define A(i, j) a[(i)*k + (j)] 14 | #define B(i, j) b[(i)*n + (j)] 15 | #define C(i, j) c[(i)*n + (j)] 16 | /* Routine for computing C = A * B + C */ 17 | 18 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) { 19 | int i, j, p; 20 | 21 | for (i = 0; i < m; i++) { 22 | for (j = 0; j < n; j++) { 23 | for (p = 0; p < k; p++) { 24 | C(i, j) = C(i, j) + A(i, p) * B(p, j); 25 | } 26 | } 27 | } 28 | } 29 | #endif -------------------------------------------------------------------------------- /vulkan/Shader.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | /** 10 | * Compile a single glslang source from string value. 
This is only meant 11 | * to be used for testing as it's non threadsafe, and it had to be removed 12 | * from the glslang dependency and now can only run the CLI directly due to 13 | * license issues: see https://github.com/KomputeProject/kompute/pull/235 14 | * 15 | * @param source An individual raw glsl shader in string format 16 | * @return The compiled SPIR-V binary in unsigned int32 format 17 | */ 18 | static 19 | std::vector 20 | compileSource( 21 | const std::string& source) 22 | { 23 | std::ofstream fileOut("tmp_kp_shader.comp"); 24 | fileOut << source; 25 | fileOut.close(); 26 | if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) 27 | throw std::runtime_error("Error running glslangValidator command"); 28 | std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); 29 | std::vector buffer; 30 | buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); 31 | return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; 32 | } 33 | 34 | /** 35 | * @param source An individual raw glsl shader filename 36 | * @return The compiled SPIR-V binary in unsigned int32 format 37 | */ 38 | static 39 | std::vector 40 | compileFile(const std::string& filename) 41 | { 42 | char cmd[256] = {0}; 43 | sprintf(cmd, "glslangValidator -V %s -o tmp_kp_shader.comp.spv", filename.c_str()); 44 | if (system(cmd)) 45 | throw std::runtime_error("Error running glslangValidator command"); 46 | std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); 47 | std::vector buffer; 48 | buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); 49 | return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; 50 | } 51 | -------------------------------------------------------------------------------- /vulkan/benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | gflops_fmla 2 | 3 | gmem_latency 4 | 
gmem_bandwidth 5 | gmem_banchmark 6 | 7 | smem_bandwidth 8 | smem_latency 9 | 10 | -------------------------------------------------------------------------------- /vulkan/benchmark/build.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | NAME=${1} 4 | KOMPUTE_BUILD="/home/khj/kompute/build" 5 | 6 | g++ -g -O0 -std=c++17 -c ${NAME}.cpp 7 | g++ -o ${NAME} ${NAME}.o "${KOMPUTE_BUILD}/src/libkompute.a" "${KOMPUTE_BUILD}/src/kompute_fmt/libfmt.a" "${KOMPUTE_BUILD}/src/kompute_spdlog/libspdlog.a" -L/usr/local/lib -lvulkan -lpthread 8 | -------------------------------------------------------------------------------- /vulkan/benchmark/gflops_fmla.cpp: -------------------------------------------------------------------------------- 1 | #include "../Shader.hpp" 2 | #include "../kompute/Kompute.hpp" 3 | #include 4 | #include 5 | #include "types.h" 6 | 7 | // gflops_fmla: 184.358588 8 | constexpr uint32_t COUNT = 16384; 9 | constexpr uint32_t BLOCK = 256; 10 | 11 | constexpr float LOOP = 3000000.0; 12 | 13 | uint64_t kompute(const std::string &filename) { 14 | 15 | kp::Manager mgr; 16 | kp::Workgroup workgroup({COUNT / BLOCK, 1, 1}); 17 | 18 | AlignVector data1(COUNT, 1.0f); 19 | AlignVector data2(COUNT, 0.0f); 20 | AlignVector data3(COUNT, 0.0f); 21 | 22 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 23 | auto tensorIn1 = mgr.tensor(data1.data(), data1.size(), sizeof(float), dtype); 24 | auto tensorIn2 = mgr.tensor(data2.data(), data2.size(), sizeof(float), dtype); 25 | auto tensorOut = mgr.tensor(data3.data(), data3.size(), sizeof(float), dtype); 26 | 27 | std::vector> params = {tensorIn1, tensorIn2, tensorOut}; 28 | auto algorithm = mgr.algorithm(params, compileFile(filename), workgroup, {LOOP}); 29 | auto seq = mgr.sequence(0, 3); 30 | 31 | seq->record(params) 32 | ->record(algorithm) 33 | ->record(params) 34 | ->eval(); 35 | 36 | float* pResult = static_cast(tensorOut->rawData()); 37 | for (int i =0; i< 10; 
++i) { 38 | fprintf(stdout, "%f ", pResult[i]); 39 | } 40 | 41 | auto timestamps = seq->getTimestamps(); 42 | return (timestamps[3] - timestamps[0]); 43 | } 44 | 45 | int main() { 46 | auto rw_compute_cost = kompute("gflops_fmla_1.comp"); 47 | auto rw_cost = kompute("gflops_fmla_2.comp"); 48 | fprintf(stdout, "gflops_fmla: %lf \n", LOOP * COUNT * 10.0/ (rw_compute_cost - rw_cost)); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /vulkan/benchmark/gflops_fmla_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (local_size_x = 256) in; 4 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; }; 5 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; }; 6 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; }; 7 | 8 | layout (constant_id = 0) const float loopf = 0; 9 | 10 | void main() { 11 | float a = in_tensor_1[gl_GlobalInvocationID.x]; 12 | float b = in_tensor_2[gl_GlobalInvocationID.x]; 13 | float c = 1.0; 14 | int loop = int(loopf); 15 | 16 | for (int i = 0; i < loop; ++i) { 17 | c = a * c + b; 18 | c = a * c + b; 19 | c = a * c + b; 20 | c = a * c + b; 21 | c = a * c + b; 22 | 23 | c = a * c + b; 24 | c = a * c + b; 25 | c = a * c + b; 26 | c = a * c + b; 27 | c = a * c + b; 28 | } 29 | out_tensor[gl_GlobalInvocationID.x] = c; 30 | } -------------------------------------------------------------------------------- /vulkan/benchmark/gflops_fmla_2.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (local_size_x = 256) in; 4 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; }; 5 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; }; 6 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { 
float out_tensor[]; }; 7 | 8 | layout (constant_id = 0) const float loopf = 0; 9 | 10 | void main() { 11 | float a = in_tensor_1[gl_GlobalInvocationID.x]; 12 | float b = in_tensor_2[gl_GlobalInvocationID.x]; 13 | 14 | out_tensor[gl_GlobalInvocationID.x] = a+b; 15 | } -------------------------------------------------------------------------------- /vulkan/benchmark/gmem_bandwidth.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (local_size_x = 256) in; 4 | 5 | // The input tensors bind index is relative to index in parameter passed 6 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; 7 | layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; 8 | layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; }; 9 | 10 | void main() { 11 | uint index = gl_GlobalInvocationID.x; 12 | out_a[index] = in_a[index]; 13 | } 14 | -------------------------------------------------------------------------------- /vulkan/benchmark/gmem_bandwidth.cpp: -------------------------------------------------------------------------------- 1 | #include "Shader.hpp" 2 | #include "kompute/Kompute.hpp" 3 | #include 4 | #include 5 | #include "types.h" 6 | 7 | 8 | void kompute(const std::string &shader) { 9 | kp::Manager mgr; 10 | 11 | constexpr uint32_t MB = 256; 12 | constexpr uint32_t SIZE = MB * 1024 * 1024; 13 | constexpr uint32_t COUNT = SIZE/ sizeof(float); // cannot exceed `vulkaninfo | grep maxComputeWorkGroupCount` 14 | assert(COUNT <= 2147483647); 15 | AlignVector data(COUNT, 3.14f); 16 | 17 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 18 | auto tensorIn1 = mgr.tensor(data.data(), data.size(), sizeof(float), dtype); 19 | auto tensorIn2 = mgr.tensor(data.data(), data.size(), sizeof(float), dtype); 20 | auto tensorOut = mgr.tensor(data.data(), data.size(), sizeof(float), dtype); 21 | 22 | std::vector> params = {tensorIn1, tensorIn2, tensorOut}; 23 | kp::Workgroup workgroup({COUNT / 256, 
1, 1}); 24 | 25 | auto algorithm = mgr.algorithm(params, compileFile(shader), workgroup); 26 | 27 | auto seq = mgr.sequence(0, 3); 28 | 29 | seq->record(params) 30 | ->record(algorithm) 31 | ->record(params) 32 | ->eval(); 33 | 34 | auto timestamps = seq->getTimestamps(); 35 | for (int i = 0; i < timestamps.size() -1; ++i) { 36 | auto cost = timestamps[i+1] - timestamps[i]; 37 | fprintf(stdout, "time cost %ld %0.4f GB/s \n", cost, MB / (cost/1e9f) / 1000.f); 38 | } 39 | // auto h2d = (timestamps[1] - timestamps[0]) / 1e9f; 40 | // auto d2d = (timestamps[2] - timestamps[1]) / 1e9f; 41 | // auto d2h = (timestamps[3] - timestamps[2]) / 1e9f; 42 | 43 | // fprintf(stdout, "h2d: %f MB/s, \nd2d %f MB/s, \nd2h: %f MB/s \n", MB/h2d, MB/d2d, MB/d2h); 44 | } 45 | 46 | int main() { 47 | kompute("gmem_bandwidth.comp"); 48 | } 49 | -------------------------------------------------------------------------------- /vulkan/benchmark/sampler_bandwidth.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (local_size_x = 64) in; 4 | 5 | // The input tensors bind index is relative to index in parameter passed 6 | layout (binding = 0) uniform sampler2D in_a; 7 | layout (constant_id = 0) const float tensor_size_f = 0; 8 | 9 | shared vec4 sub_tensor_1[64]; 10 | 11 | void main() { 12 | uint index = gl_GlobalInvocationID.x; 13 | uint loop = uint(tensor_size_f); 14 | for (uint x = 0; x < loop; ++x) { 15 | 16 | ivec2 ipos = ivec2(index, 0); 17 | sub_tensor_1[index] = texelFetch(in_a, ipos, 0); 18 | barrier(); 19 | } 20 | } -------------------------------------------------------------------------------- /vulkan/benchmark/smem_bandwidth.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #pragma use_vulkan_memory_model 3 | 4 | layout (local_size_x = 256) in; 5 | 6 | // The input tensors bind index is relative to index in parameter passed 7 | layout(set = 0, binding = 0) 
buffer buf_in_a { float in_a[]; }; 8 | layout (constant_id = 0) const float tensor_size_f = 0; 9 | 10 | shared float sub_tensor_1[256]; 11 | 12 | void main() { 13 | uint index = gl_GlobalInvocationID.x; 14 | uint loop = uint(tensor_size_f); 15 | for (uint x = 0; x < loop; ++x) { 16 | sub_tensor_1[index] = in_a[index]; 17 | barrier(); 18 | } 19 | } -------------------------------------------------------------------------------- /vulkan/benchmark/smem_bandwidth.cpp: -------------------------------------------------------------------------------- 1 | #include "Shader.hpp" 2 | #include "kompute/Kompute.hpp" 3 | #include 4 | #include 5 | #include "types.h" 6 | 7 | void kompute(const std::string &shader) { 8 | kp::Manager mgr; 9 | 10 | constexpr uint32_t SIZE_IN_BYTES = 32768; 11 | constexpr uint32_t BLOCK = 256; 12 | AlignVector data(SIZE_IN_BYTES/ sizeof(float), 3.14f); 13 | 14 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 15 | auto tensorIn = mgr.tensor(data.data(), data.size(), sizeof(float), dtype); 16 | 17 | std::vector> params = {tensorIn}; 18 | kp::Workgroup workgroup({SIZE_IN_BYTES / BLOCK, 1, 1}); 19 | constexpr float LOOP = 1000000.f; 20 | auto algorithm = mgr.algorithm(params, compileFile(shader), workgroup, {LOOP}); 21 | 22 | auto seq = mgr.sequence(0, 2); 23 | 24 | seq->record(params) 25 | ->record(algorithm) 26 | ->eval(); 27 | 28 | auto timestamps = seq->getTimestamps(); 29 | assert(timestamps.size() == 3); 30 | auto gmem2smem = (timestamps[2] - timestamps[1]); 31 | 32 | const float sec = gmem2smem * 1.0 / LOOP / 1e9f; 33 | fprintf(stdout, "***** %s bandwidth %0.3f GB/s \n", shader.c_str(),SIZE_IN_BYTES / 1024. / 1024. / 1024. 
/ sec); 34 | } 35 | 36 | int main() { 37 | // smem_bandwidth.comp bandwidth 10.665 GB/s 38 | kompute("smem_bandwidth.comp"); 39 | // smem_bandwidth1.comp bandwidth 18.663 GB/s 40 | kompute("smem_bandwidth1.comp"); 41 | // sampler_bandwidth.comp bandwidth 35.502 GB/s 42 | kompute("sampler_bandwidth.comp"); 43 | } 44 | -------------------------------------------------------------------------------- /vulkan/benchmark/smem_bandwidth1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #pragma use_vulkan_memory_model 3 | 4 | layout (local_size_x = 32) in; 5 | 6 | // The input tensors bind index is relative to index in parameter passed 7 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; 8 | layout (constant_id = 0) const float tensor_size_f = 0; 9 | 10 | shared vec4 sub_tensor_1[64][2]; 11 | 12 | void main() { 13 | uint index = gl_GlobalInvocationID.x; 14 | uint loop = uint(tensor_size_f); 15 | for (uint x = 0; x < loop; ++x) { 16 | vec4 val0; 17 | val0.r = in_a[index]; 18 | val0.g = in_a[index + 32]; 19 | val0.b = in_a[index + 64]; 20 | val0.a = in_a[index + 96]; 21 | sub_tensor_1[index][0] = val0; 22 | 23 | vec4 val1; 24 | val1.r = in_a[index + 128]; 25 | val1.g = in_a[index + 160]; 26 | val1.b = in_a[index + 192]; 27 | val1.a = in_a[index + 224]; 28 | sub_tensor_1[index][1] = val1; 29 | barrier(); 30 | } 31 | } -------------------------------------------------------------------------------- /vulkan/benchmark/smem_latency.cpp: -------------------------------------------------------------------------------- 1 | #include "Shader.hpp" 2 | #include "kompute/Kompute.hpp" 3 | #include 4 | #include 5 | #include "types.h" 6 | 7 | // gmem2smem 80.194374 ns ~ 72 cycle 0.899 GHz 8 | void kompute(const std::string &shader) { 9 | kp::Manager mgr; 10 | 11 | constexpr uint32_t SIZE = 128; // 128B 12 | constexpr uint32_t COUNT = SIZE/ sizeof(float); // cannot exceed `vulkaninfo | grep maxComputeWorkGroupCount` 13 
| AlignVector data(COUNT, 3.14f); 14 | 15 | auto dtype = kp::Tensor::TensorDataTypes::eFloat; 16 | auto tensorIn = mgr.tensor(data.data(), data.size(), sizeof(float), dtype); 17 | 18 | std::vector> params = {tensorIn}; 19 | kp::Workgroup workgroup({1, 1, 1}); 20 | constexpr float LOOP = 10000000.f; 21 | auto algorithm = mgr.algorithm(params, compileSource(shader), workgroup, {LOOP}); 22 | 23 | auto seq = mgr.sequence(0, 2); 24 | 25 | seq->record(params) 26 | ->record(algorithm) 27 | ->eval(); 28 | 29 | auto timestamps = seq->getTimestamps(); 30 | assert(timestamps.size() == 3); 31 | auto gmem2smem = (timestamps[2] - timestamps[1]); 32 | 33 | const float ns = gmem2smem / LOOP; 34 | constexpr float GHz = 921/ 1024.f; // jetson nano max_frequency. 35 | const int cycle = ns * GHz; 36 | fprintf(stdout, "***** gmem2smem %f ns ~ %d cycle %0.3f GHz \n", ns, cycle, GHz); 37 | } 38 | 39 | int main() { 40 | 41 | std::string shader = (R"( 42 | #version 450 43 | 44 | layout (local_size_x = 32) in; 45 | 46 | // The input tensors bind index is relative to index in parameter passed 47 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; 48 | layout (constant_id = 0) const float tensor_size_f = 0; 49 | 50 | shared float sub_tensor_1[32]; 51 | 52 | void main() { 53 | uint index = gl_GlobalInvocationID.x; 54 | uint loop = uint(tensor_size_f); 55 | for (uint x = 0; x < loop; ++x) { 56 | sub_tensor_1[index] = in_a[index]; 57 | barrier(); 58 | } 59 | } 60 | )"); 61 | 62 | kompute(shader); 63 | } 64 | -------------------------------------------------------------------------------- /vulkan/benchmark/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | template 5 | struct AlignAllocator { 6 | typedef T value_type; 7 | 8 | AlignAllocator() = default; 9 | template 10 | constexpr AlignAllocator(const AlignAllocator&) noexcept {} 11 | 12 | [[nodiscard]] T* allocate(std::size_t n) { 13 | if (n > 
std::numeric_limits::max() / sizeof(T)) 14 | throw std::bad_alloc(); 15 | 16 | if (void* p = std::aligned_alloc(64, n * sizeof(T))) { 17 | return static_cast(p); 18 | } 19 | 20 | throw std::bad_alloc(); 21 | } 22 | 23 | void deallocate(T* p, std::size_t n) noexcept { std::free(p); } 24 | }; 25 | 26 | template 27 | bool operator==(const AlignAllocator&, const AlignAllocator&) { 28 | return true; 29 | } 30 | 31 | template 32 | bool operator!=(const AlignAllocator&, const AlignAllocator&) { 33 | return false; 34 | } 35 | 36 | using AlignVector = std::vector >; 37 | -------------------------------------------------------------------------------- /vulkan/compare_matrices.cpp: -------------------------------------------------------------------------------- 1 | #define abs(x) ((x) < 0.0 ? -(x) : (x)) 2 | 3 | #include 4 | 5 | float compare_matrices(int m, int n, float *a, float *b) { 6 | #define A(i, j) a[(i)*n + (j)] 7 | #define B(i, j) b[(i)*n + (j)] 8 | // printf("\n---result----\n"); 9 | // print_matrix(m, n, a, lda); 10 | // printf("\n-------\n"); 11 | // print_matrix(m, n, b, ldb); 12 | // printf("\n-------\n"); 13 | int i, j; 14 | float max_diff = 0.0, diff; 15 | int printed = 0; 16 | 17 | for (i = 0; i < m; i++) { 18 | for (j = 0; j < n; j++) { 19 | diff = abs(A(i, j) - B(i, j)); 20 | max_diff = (diff > max_diff ? 
diff : max_diff); 21 | if (0 == printed) 22 | if (max_diff > 0.5f || max_diff < -0.5f) { 23 | fprintf(stdout, "error: i %d j %d diff %f got %f expect %f \n", i, 24 | j, max_diff, A(i, j), B(i, j)); 25 | printed = 1; 26 | } 27 | } 28 | } 29 | 30 | return max_diff; 31 | #undef A 32 | #undef B 33 | } 34 | -------------------------------------------------------------------------------- /vulkan/copy_matrix.cpp: -------------------------------------------------------------------------------- 1 | void copy_matrix(int m, int n, float *a, float *b) { 2 | #define A(i, j) a[(i)*n + (j)] 3 | #define B(i, j) b[(i)*n + (j)] 4 | 5 | int i, j; 6 | 7 | for (j = 0; j < n; j++) { 8 | for (i = 0; i < m; i++) { 9 | B(i, j) = A(i, j); 10 | } 11 | } 12 | 13 | #undef A 14 | #undef B 15 | } 16 | -------------------------------------------------------------------------------- /vulkan/dclock.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | static double gtod_ref_time_sec = 0.0; 5 | 6 | /* Adapted from the bl2_clock() routine in the BLIS library */ 7 | 8 | double dclock() { 9 | double the_time, norm_sec; 10 | struct timeval tv; 11 | 12 | gettimeofday(&tv, NULL); 13 | 14 | if (gtod_ref_time_sec == 0.0) 15 | gtod_ref_time_sec = (double)tv.tv_sec; 16 | 17 | norm_sec = (double)tv.tv_sec - gtod_ref_time_sec; 18 | 19 | the_time = norm_sec + tv.tv_usec * 1.0e-6; 20 | 21 | return the_time; 22 | } 23 | -------------------------------------------------------------------------------- /vulkan/fmt/locale.h: -------------------------------------------------------------------------------- 1 | // Formatting library for C++ - std::locale support 2 | // 3 | // Copyright (c) 2012 - present, Victor Zverovich 4 | // All rights reserved. 5 | // 6 | // For the license information refer to format.h. 
7 | 8 | #ifndef FMT_LOCALE_H_ 9 | #define FMT_LOCALE_H_ 10 | 11 | #include 12 | 13 | #include "format.h" 14 | 15 | FMT_BEGIN_NAMESPACE 16 | 17 | namespace detail { 18 | template 19 | std::basic_string vformat( 20 | const std::locale& loc, basic_string_view format_str, 21 | basic_format_args>> args) { 22 | basic_memory_buffer buffer; 23 | detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc)); 24 | return fmt::to_string(buffer); 25 | } 26 | } // namespace detail 27 | 28 | template > 29 | inline std::basic_string vformat( 30 | const std::locale& loc, const S& format_str, 31 | basic_format_args>> args) { 32 | return detail::vformat(loc, to_string_view(format_str), args); 33 | } 34 | 35 | template > 36 | inline std::basic_string format(const std::locale& loc, 37 | const S& format_str, Args&&... args) { 38 | return detail::vformat(loc, to_string_view(format_str), 39 | fmt::make_args_checked(format_str, args...)); 40 | } 41 | 42 | template , 44 | FMT_ENABLE_IF(detail::is_output_iterator::value)> 45 | inline OutputIt vformat_to( 46 | OutputIt out, const std::locale& loc, const S& format_str, 47 | basic_format_args>> args) { 48 | decltype(detail::get_buffer(out)) buf(detail::get_buffer_init(out)); 49 | vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc)); 50 | return detail::get_iterator(buf); 51 | } 52 | 53 | template >::value> 55 | inline auto format_to(OutputIt out, const std::locale& loc, 56 | const S& format_str, Args&&... 
args) -> 57 | typename std::enable_if::type { 58 | const auto& vargs = fmt::make_args_checked(format_str, args...); 59 | return vformat_to(out, loc, to_string_view(format_str), vargs); 60 | } 61 | 62 | FMT_END_NAMESPACE 63 | 64 | #endif // FMT_LOCALE_H_ 65 | -------------------------------------------------------------------------------- /vulkan/fmt/posix.h: -------------------------------------------------------------------------------- 1 | #include "os.h" 2 | #warning "fmt/posix.h is deprecated; use fmt/os.h instead" 3 | -------------------------------------------------------------------------------- /vulkan/makefile: -------------------------------------------------------------------------------- 1 | OLD := MMult_vk_naive 2 | NEW := MMult_vk_2 3 | KOMPUTE_BUILD := /home/khj/kompute/build 4 | 5 | CC := g++ 6 | LINKER := $(CC) 7 | # CFLAGS := -std=c++17 -O0 -g -Wall 8 | CFLAGS := -std=c++17 -O2 -g 9 | LDFLAGS := -lm $(KOMPUTE_BUILD)/src/libkompute.a $(KOMPUTE_BUILD)/src/kompute_fmt/libfmt.a $(KOMPUTE_BUILD)/src/kompute_spdlog/libspdlog.a `pkg-config --libs vulkan` -lpthread 10 | 11 | UTIL := copy_matrix.o \ 12 | compare_matrices.o \ 13 | random_matrix.o \ 14 | dclock.o \ 15 | REF_MMult.o \ 16 | print_matrix.o 17 | 18 | TEST_OBJS := test_MMult.o $(NEW).o 19 | 20 | %.o: %.cpp 21 | $(CC) $(CFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 22 | 23 | all: 24 | make clean; 25 | make test_MMult.x 26 | 27 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h 28 | $(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \ 29 | $(BLAS_LIB) -o $(TEST_BIN) $@ 30 | 31 | run: 32 | make all 33 | echo "version = '$(NEW)';" > output_$(NEW).m 34 | ./test_MMult.x >> output_$(NEW).m 35 | cp output_$(OLD).m output_old.m 36 | cp output_$(NEW).m output_new.m 37 | 38 | clean: 39 | rm -f *.o *~ core *.x 40 | 41 | cleanall: 42 | rm -f *.o *~ core *.x output*.m *.eps *.png 43 | -------------------------------------------------------------------------------- /vulkan/parameters.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )" 3 | The below parameters set this range of values that p takes on 4 | */ 5 | #define PFIRST 64 6 | #define PLAST 512 7 | #define PINC 64 8 | 9 | /* 10 | In the test driver, the m, n, and k dimensions are set to the below 11 | values. If the value equals "-1" then that dimension is bound to the 12 | index p, given above. 13 | */ 14 | 15 | #define M -1 16 | #define N -1 17 | #define K -1 18 | 19 | /* 20 | In the test driver, each experiment is repeated NREPEATS times and 21 | the best time from these repeats is used to compute the performance 22 | */ 23 | 24 | #define NREPEATS 1 25 | 26 | /* 27 | Matrices A, B, and C are stored in two dimensional arrays with 28 | row dimensions that are greater than or equal to the row dimension 29 | of the matrix. This row dimension of the array is known as the 30 | "leading dimension" and determines the stride (the number of 31 | double precision numbers) when one goes from one element in a row 32 | to the next. Having this number larger than the row dimension of 33 | the matrix tends to adversely affect performance. LDX equals the 34 | leading dimension of the array that stores matrix X. If LDX=-1 35 | then the leading dimension is set to the row dimension of matrix X. 
36 | */ 37 | 38 | #if 0 39 | #define LDA 1000 40 | #define LDB 1000 41 | #define LDC 1000 42 | #else 43 | #define LDA -1 44 | #define LDB -1 45 | #define LDC -1 46 | #endif 47 | -------------------------------------------------------------------------------- /vulkan/plot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | def readFile(filename): 6 | f = open(filename) 7 | sizes = [] 8 | times = [] 9 | title = '' 10 | try: 11 | title = f.readline() 12 | # skip 3 line 13 | f.readline() 14 | f.readline() 15 | f.readline() 16 | while True: 17 | line = f.readline() 18 | if line: 19 | slices = line.split(" ") 20 | if len(slices) <= 2: 21 | break; 22 | size = int(slices[0]) 23 | time = float(slices[1]) 24 | sizes.append(size) 25 | times.append(time) 26 | finally: 27 | f.close() 28 | return title, sizes, times 29 | 30 | if __name__ == '__main__': 31 | plt.xlabel('shape') 32 | plt.ylabel('gflops') 33 | l = len(sys.argv) 34 | for i,item in enumerate(sys.argv): 35 | if i == 0: 36 | continue 37 | t,x,y = readFile(item) 38 | plt.plot(x,y,label=t) 39 | plt.legend() 40 | plt.show() 41 | 42 | -------------------------------------------------------------------------------- /vulkan/print_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define A(i, j) a[(i)*lda + (j)] 4 | 5 | void print_matrix(int m, int n, float *a, int lda) { 6 | int i, j; 7 | 8 | for (i = 0; i < m; i++) { 9 | for (j = 0; j < n; j++) { 10 | printf("%.1f\t", A(i, j)); 11 | } 12 | printf("\n"); 13 | } 14 | printf("\n"); 15 | } 16 | -------------------------------------------------------------------------------- /vulkan/random_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void random_matrix(int m, int n, float *a) { 4 | #define A(i, j) a[(i)*n + (j)] 5 | 6 | double 
drand48(); 7 | int i, j; 8 | 9 | for (i = 0; i < m; i++) { 10 | for (j = 0; j < n; j++) { 11 | #if 1 12 | A(i, j) = 2.0 * (float)drand48() - 1.0; 13 | #else 14 | A(i, j) = (j - i) % 3; 15 | #endif 16 | // A(i, j) = 1; 17 | } 18 | } 19 | #undef A 20 | } 21 | -------------------------------------------------------------------------------- /vulkan/spdlog/async_logger-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | SPDLOG_INLINE spdlog::async_logger::async_logger( 17 | std::string logger_name, sinks_init_list sinks_list, std::weak_ptr tp, async_overflow_policy overflow_policy) 18 | : async_logger(std::move(logger_name), sinks_list.begin(), sinks_list.end(), std::move(tp), overflow_policy) 19 | {} 20 | 21 | SPDLOG_INLINE spdlog::async_logger::async_logger( 22 | std::string logger_name, sink_ptr single_sink, std::weak_ptr tp, async_overflow_policy overflow_policy) 23 | : async_logger(std::move(logger_name), {std::move(single_sink)}, std::move(tp), overflow_policy) 24 | {} 25 | 26 | // send the log message to the thread pool 27 | SPDLOG_INLINE void spdlog::async_logger::sink_it_(const details::log_msg &msg) 28 | { 29 | if (auto pool_ptr = thread_pool_.lock()) 30 | { 31 | pool_ptr->post_log(shared_from_this(), msg, overflow_policy_); 32 | } 33 | else 34 | { 35 | throw_spdlog_ex("async log: thread pool doesn't exist anymore"); 36 | } 37 | } 38 | 39 | // send flush request to the thread pool 40 | SPDLOG_INLINE void spdlog::async_logger::flush_() 41 | { 42 | if (auto pool_ptr = thread_pool_.lock()) 43 | { 44 | pool_ptr->post_flush(shared_from_this(), overflow_policy_); 45 | } 46 | else 47 | { 48 | throw_spdlog_ex("async flush: thread pool 
doesn't exist anymore"); 49 | } 50 | } 51 | 52 | // 53 | // backend functions - called from the thread pool to do the actual job 54 | // 55 | SPDLOG_INLINE void spdlog::async_logger::backend_sink_it_(const details::log_msg &msg) 56 | { 57 | for (auto &sink : sinks_) 58 | { 59 | if (sink->should_log(msg.level)) 60 | { 61 | SPDLOG_TRY 62 | { 63 | sink->log(msg); 64 | } 65 | SPDLOG_LOGGER_CATCH() 66 | } 67 | } 68 | 69 | if (should_flush_(msg)) 70 | { 71 | backend_flush_(); 72 | } 73 | } 74 | 75 | SPDLOG_INLINE void spdlog::async_logger::backend_flush_() 76 | { 77 | for (auto &sink : sinks_) 78 | { 79 | SPDLOG_TRY 80 | { 81 | sink->flush(); 82 | } 83 | SPDLOG_LOGGER_CATCH() 84 | } 85 | } 86 | 87 | SPDLOG_INLINE std::shared_ptr spdlog::async_logger::clone(std::string new_name) 88 | { 89 | auto cloned = std::make_shared(*this); 90 | cloned->name_ = std::move(new_name); 91 | return cloned; 92 | } 93 | -------------------------------------------------------------------------------- /vulkan/spdlog/async_logger.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | // Fast asynchronous logger. 7 | // Uses pre allocated queue. 8 | // Creates a single back thread to pop messages from the queue and log them. 9 | // 10 | // Upon each log write the logger: 11 | // 1. Checks if its log level is enough to log the message 12 | // 2. Push a new copy of the message to a queue (or block the caller until 13 | // space is available in the queue) 14 | // Upon destruction, logs all remaining messages in the queue before 15 | // destructing.. 16 | 17 | #include 18 | 19 | namespace spdlog { 20 | 21 | // Async overflow policy - block by default. 
22 | enum class async_overflow_policy 23 | { 24 | block, // Block until message can be enqueued 25 | overrun_oldest // Discard oldest message in the queue if full when trying to 26 | // add new item. 27 | }; 28 | 29 | namespace details { 30 | class thread_pool; 31 | } 32 | 33 | class SPDLOG_API async_logger final : public std::enable_shared_from_this, public logger 34 | { 35 | friend class details::thread_pool; 36 | 37 | public: 38 | template 39 | async_logger(std::string logger_name, It begin, It end, std::weak_ptr tp, 40 | async_overflow_policy overflow_policy = async_overflow_policy::block) 41 | : logger(std::move(logger_name), begin, end) 42 | , thread_pool_(std::move(tp)) 43 | , overflow_policy_(overflow_policy) 44 | {} 45 | 46 | async_logger(std::string logger_name, sinks_init_list sinks_list, std::weak_ptr tp, 47 | async_overflow_policy overflow_policy = async_overflow_policy::block); 48 | 49 | async_logger(std::string logger_name, sink_ptr single_sink, std::weak_ptr tp, 50 | async_overflow_policy overflow_policy = async_overflow_policy::block); 51 | 52 | std::shared_ptr clone(std::string new_name) override; 53 | 54 | protected: 55 | void sink_it_(const details::log_msg &msg) override; 56 | void flush_() override; 57 | void backend_sink_it_(const details::log_msg &incoming_log_msg); 58 | void backend_flush_(); 59 | 60 | private: 61 | std::weak_ptr thread_pool_; 62 | async_overflow_policy overflow_policy_; 63 | }; 64 | } // namespace spdlog 65 | 66 | #ifdef SPDLOG_HEADER_ONLY 67 | #include "async_logger-inl.h" 68 | #endif 69 | -------------------------------------------------------------------------------- /vulkan/spdlog/cfg/argv.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | #include 6 | #include 7 | 8 | // 9 | // Init log levels using each argv entry that starts with "SPDLOG_LEVEL=" 10 | // 11 | // set all loggers to debug level: 12 | // example.exe "SPDLOG_LEVEL=debug" 13 | 14 | // set logger1 to trace level 15 | // example.exe "SPDLOG_LEVEL=logger1=trace" 16 | 17 | // turn off all logging except for logger1 and logger2: 18 | // example.exe "SPDLOG_LEVEL=off,logger1=debug,logger2=info" 19 | 20 | namespace spdlog { 21 | namespace cfg { 22 | 23 | // search for SPDLOG_LEVEL= in the args and use it to init the levels 24 | inline void load_argv_levels(int argc, const char **argv) 25 | { 26 | const std::string spdlog_level_prefix = "SPDLOG_LEVEL="; 27 | for (int i = 1; i < argc; i++) 28 | { 29 | std::string arg = argv[i]; 30 | if (arg.find(spdlog_level_prefix) == 0) 31 | { 32 | auto levels_string = arg.substr(spdlog_level_prefix.size()); 33 | helpers::load_levels(levels_string); 34 | } 35 | } 36 | } 37 | 38 | inline void load_argv_levels(int argc, char **argv) 39 | { 40 | load_argv_levels(argc, const_cast(argv)); 41 | } 42 | 43 | } // namespace cfg 44 | } // namespace spdlog 45 | -------------------------------------------------------------------------------- /vulkan/spdlog/cfg/env.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | // 10 | // Init levels and patterns from env variables SPDLOG_LEVEL 11 | // Inspired from Rust's "env_logger" crate (https://crates.io/crates/env_logger). 
12 | // Note - fallback to "info" level on unrecognized levels 13 | // 14 | // Examples: 15 | // 16 | // set global level to debug: 17 | // export SPDLOG_LEVEL=debug 18 | // 19 | // turn off all logging except for logger1: 20 | // export SPDLOG_LEVEL="*=off,logger1=debug" 21 | // 22 | 23 | // turn off all logging except for logger1 and logger2: 24 | // export SPDLOG_LEVEL="off,logger1=debug,logger2=info" 25 | 26 | namespace spdlog { 27 | namespace cfg { 28 | inline void load_env_levels() 29 | { 30 | auto env_val = details::os::getenv("SPDLOG_LEVEL"); 31 | if (!env_val.empty()) 32 | { 33 | helpers::load_levels(env_val); 34 | } 35 | } 36 | 37 | } // namespace cfg 38 | } // namespace spdlog 39 | -------------------------------------------------------------------------------- /vulkan/spdlog/cfg/helpers.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | namespace cfg { 11 | namespace helpers { 12 | // 13 | // Init levels from given string 14 | // 15 | // Examples: 16 | // 17 | // set global level to debug: "debug" 18 | // turn off all logging except for logger1: "off,logger1=debug" 19 | // turn off all logging except for logger1 and logger2: "off,logger1=debug,logger2=info" 20 | // 21 | SPDLOG_API void load_levels(const std::string &txt); 22 | } // namespace helpers 23 | 24 | } // namespace cfg 25 | } // namespace spdlog 26 | 27 | #ifdef SPDLOG_HEADER_ONLY 28 | #include "helpers-inl.h" 29 | #endif // SPDLOG_HEADER_ONLY 30 | -------------------------------------------------------------------------------- /vulkan/spdlog/common-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | namespace spdlog { 11 | namespace level { 12 | static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES; 13 | 14 | static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES; 15 | 16 | SPDLOG_INLINE string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT 17 | { 18 | return level_string_views[l]; 19 | } 20 | 21 | SPDLOG_INLINE const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT 22 | { 23 | return short_level_names[l]; 24 | } 25 | 26 | SPDLOG_INLINE spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT 27 | { 28 | int level = 0; 29 | for (const auto &level_str : level_string_views) 30 | { 31 | if (level_str == name) 32 | { 33 | return static_cast(level); 34 | } 35 | level++; 36 | } 37 | // check also for "warn" and "err" before giving up.. 38 | if (name == "warn") 39 | { 40 | return level::warn; 41 | } 42 | if (name == "err") 43 | { 44 | return level::err; 45 | } 46 | return level::off; 47 | } 48 | } // namespace level 49 | 50 | SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg) 51 | : msg_(std::move(msg)) 52 | {} 53 | 54 | SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno) 55 | { 56 | memory_buf_t outbuf; 57 | fmt::format_system_error(outbuf, last_errno, msg); 58 | msg_ = fmt::to_string(outbuf); 59 | } 60 | 61 | SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT 62 | { 63 | return msg_.c_str(); 64 | } 65 | 66 | SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno) 67 | { 68 | SPDLOG_THROW(spdlog_ex(msg, last_errno)); 69 | } 70 | 71 | SPDLOG_INLINE void throw_spdlog_ex(std::string msg) 72 | { 73 | SPDLOG_THROW(spdlog_ex(std::move(msg))); 74 | } 75 | 76 | } // namespace spdlog 77 | -------------------------------------------------------------------------------- 
/vulkan/spdlog/details/backtracer-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | namespace spdlog { 10 | namespace details { 11 | SPDLOG_INLINE backtracer::backtracer(const backtracer &other) 12 | { 13 | std::lock_guard lock(other.mutex_); 14 | enabled_ = other.enabled(); 15 | messages_ = other.messages_; 16 | } 17 | 18 | SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT 19 | { 20 | std::lock_guard lock(other.mutex_); 21 | enabled_ = other.enabled(); 22 | messages_ = std::move(other.messages_); 23 | } 24 | 25 | SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other) 26 | { 27 | std::lock_guard lock(mutex_); 28 | enabled_ = other.enabled(); 29 | messages_ = std::move(other.messages_); 30 | return *this; 31 | } 32 | 33 | SPDLOG_INLINE void backtracer::enable(size_t size) 34 | { 35 | std::lock_guard lock{mutex_}; 36 | enabled_.store(true, std::memory_order_relaxed); 37 | messages_ = circular_q{size}; 38 | } 39 | 40 | SPDLOG_INLINE void backtracer::disable() 41 | { 42 | std::lock_guard lock{mutex_}; 43 | enabled_.store(false, std::memory_order_relaxed); 44 | } 45 | 46 | SPDLOG_INLINE bool backtracer::enabled() const 47 | { 48 | return enabled_.load(std::memory_order_relaxed); 49 | } 50 | 51 | SPDLOG_INLINE void backtracer::push_back(const log_msg &msg) 52 | { 53 | std::lock_guard lock{mutex_}; 54 | messages_.push_back(log_msg_buffer{msg}); 55 | } 56 | 57 | // pop all items in the q and apply the given fun on each of them. 
58 | SPDLOG_INLINE void backtracer::foreach_pop(std::function fun) 59 | { 60 | std::lock_guard lock{mutex_}; 61 | while (!messages_.empty()) 62 | { 63 | auto &front_msg = messages_.front(); 64 | fun(front_msg); 65 | messages_.pop_front(); 66 | } 67 | } 68 | } // namespace details 69 | } // namespace spdlog 70 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/backtracer.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | // Store log messages in circular buffer. 14 | // Useful for storing debug data in case of error/warning happens. 15 | 16 | namespace spdlog { 17 | namespace details { 18 | class SPDLOG_API backtracer 19 | { 20 | mutable std::mutex mutex_; 21 | std::atomic enabled_{false}; 22 | circular_q messages_; 23 | 24 | public: 25 | backtracer() = default; 26 | backtracer(const backtracer &other); 27 | 28 | backtracer(backtracer &&other) SPDLOG_NOEXCEPT; 29 | backtracer &operator=(backtracer other); 30 | 31 | void enable(size_t size); 32 | void disable(); 33 | bool enabled() const; 34 | void push_back(const log_msg &msg); 35 | 36 | // pop all items in the q and apply the given fun on each of them. 37 | void foreach_pop(std::function fun); 38 | }; 39 | 40 | } // namespace details 41 | } // namespace spdlog 42 | 43 | #ifdef SPDLOG_HEADER_ONLY 44 | #include "backtracer-inl.h" 45 | #endif -------------------------------------------------------------------------------- /vulkan/spdlog/details/console_globals.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | namespace details { 11 | 12 | struct console_mutex 13 | { 14 | using mutex_t = std::mutex; 15 | static mutex_t &mutex() 16 | { 17 | static mutex_t s_mutex; 18 | return s_mutex; 19 | } 20 | }; 21 | 22 | struct console_nullmutex 23 | { 24 | using mutex_t = null_mutex; 25 | static mutex_t &mutex() 26 | { 27 | static mutex_t s_mutex; 28 | return s_mutex; 29 | } 30 | }; 31 | } // namespace details 32 | } // namespace spdlog 33 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/file_helper.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | namespace details { 11 | 12 | // Helper class for file sinks. 13 | // When failing to open a file, retry several times(5) with a delay interval(10 ms). 14 | // Throw spdlog_ex exception on errors. 15 | 16 | class SPDLOG_API file_helper 17 | { 18 | public: 19 | explicit file_helper() = default; 20 | 21 | file_helper(const file_helper &) = delete; 22 | file_helper &operator=(const file_helper &) = delete; 23 | ~file_helper(); 24 | 25 | void open(const filename_t &fname, bool truncate = false); 26 | void reopen(bool truncate); 27 | void flush(); 28 | void close(); 29 | void write(const memory_buf_t &buf); 30 | size_t size() const; 31 | const filename_t &filename() const; 32 | 33 | // 34 | // return file path and its extension: 35 | // 36 | // "mylog.txt" => ("mylog", ".txt") 37 | // "mylog" => ("mylog", "") 38 | // "mylog." 
=> ("mylog.", "") 39 | // "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt") 40 | // 41 | // the starting dot in filenames is ignored (hidden files): 42 | // 43 | // ".mylog" => (".mylog", "") 44 | // "my_folder/.mylog" => ("my_folder/.mylog", "") 45 | // "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt") 46 | static std::tuple split_by_extension(const filename_t &fname); 47 | 48 | private: 49 | const int open_tries_ = 5; 50 | const int open_interval_ = 10; 51 | std::FILE *fd_{nullptr}; 52 | filename_t filename_; 53 | }; 54 | } // namespace details 55 | } // namespace spdlog 56 | 57 | #ifdef SPDLOG_HEADER_ONLY 58 | #include "file_helper-inl.h" 59 | #endif 60 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/log_msg-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | 12 | namespace spdlog { 13 | namespace details { 14 | 15 | SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time, spdlog::source_loc loc, string_view_t a_logger_name, 16 | spdlog::level::level_enum lvl, spdlog::string_view_t msg) 17 | : logger_name(a_logger_name) 18 | , level(lvl) 19 | , time(log_time) 20 | #ifndef SPDLOG_NO_THREAD_ID 21 | , thread_id(os::thread_id()) 22 | #endif 23 | , source(loc) 24 | , payload(msg) 25 | {} 26 | 27 | SPDLOG_INLINE log_msg::log_msg( 28 | spdlog::source_loc loc, string_view_t a_logger_name, spdlog::level::level_enum lvl, spdlog::string_view_t msg) 29 | : log_msg(os::now(), loc, a_logger_name, lvl, msg) 30 | {} 31 | 32 | SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name, spdlog::level::level_enum lvl, spdlog::string_view_t msg) 33 | : log_msg(os::now(), source_loc{}, a_logger_name, lvl, msg) 34 | {} 35
| 36 | } // namespace details 37 | } // namespace spdlog 38 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/log_msg.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | namespace details { 11 | struct SPDLOG_API log_msg 12 | { 13 | log_msg() = default; 14 | log_msg(log_clock::time_point log_time, source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg); 15 | log_msg(source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg); 16 | log_msg(string_view_t logger_name, level::level_enum lvl, string_view_t msg); 17 | log_msg(const log_msg &other) = default; 18 | 19 | string_view_t logger_name; 20 | level::level_enum level{level::off}; 21 | log_clock::time_point time; 22 | size_t thread_id{0}; 23 | 24 | // wrapping the formatted text with color (updated by pattern_formatter). 25 | mutable size_t color_range_start{0}; 26 | mutable size_t color_range_end{0}; 27 | 28 | source_loc source; 29 | string_view_t payload; 30 | }; 31 | } // namespace details 32 | } // namespace spdlog 33 | 34 | #ifdef SPDLOG_HEADER_ONLY 35 | #include "log_msg-inl.h" 36 | #endif 37 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/log_msg_buffer-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | namespace spdlog { 11 | namespace details { 12 | 13 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg &orig_msg) 14 | : log_msg{orig_msg} 15 | { 16 | buffer.append(logger_name.begin(), logger_name.end()); 17 | buffer.append(payload.begin(), payload.end()); 18 | update_string_views(); 19 | } 20 | 21 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg_buffer &other) 22 | : log_msg{other} 23 | { 24 | buffer.append(logger_name.begin(), logger_name.end()); 25 | buffer.append(payload.begin(), payload.end()); 26 | update_string_views(); 27 | } 28 | 29 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT : log_msg{other}, buffer{std::move(other.buffer)} 30 | { 31 | update_string_views(); 32 | } 33 | 34 | SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(const log_msg_buffer &other) 35 | { 36 | log_msg::operator=(other); 37 | buffer.clear(); 38 | buffer.append(other.buffer.data(), other.buffer.data() + other.buffer.size()); 39 | update_string_views(); 40 | return *this; 41 | } 42 | 43 | SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT 44 | { 45 | log_msg::operator=(other); 46 | buffer = std::move(other.buffer); 47 | update_string_views(); 48 | return *this; 49 | } 50 | 51 | SPDLOG_INLINE void log_msg_buffer::update_string_views() 52 | { 53 | logger_name = string_view_t{buffer.data(), logger_name.size()}; 54 | payload = string_view_t{buffer.data() + logger_name.size(), payload.size()}; 55 | } 56 | 57 | } // namespace details 58 | } // namespace spdlog 59 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/log_msg_buffer.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & 
spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | 8 | namespace spdlog { 9 | namespace details { 10 | 11 | // Extend log_msg with internal buffer to store its payload. 12 | // This is needed since log_msg holds string_views that point to stack data. 13 | 14 | class SPDLOG_API log_msg_buffer : public log_msg 15 | { 16 | memory_buf_t buffer; 17 | void update_string_views(); 18 | 19 | public: 20 | log_msg_buffer() = default; 21 | explicit log_msg_buffer(const log_msg &orig_msg); 22 | log_msg_buffer(const log_msg_buffer &other); 23 | log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT; 24 | log_msg_buffer &operator=(const log_msg_buffer &other); 25 | log_msg_buffer &operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT; 26 | }; 27 | 28 | } // namespace details 29 | } // namespace spdlog 30 | 31 | #ifdef SPDLOG_HEADER_ONLY 32 | #include "log_msg_buffer-inl.h" 33 | #endif 34 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/null_mutex.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | // null, no cost dummy "mutex" and dummy "atomic" int 9 | 10 | namespace spdlog { 11 | namespace details { 12 | struct null_mutex 13 | { 14 | void lock() const {} 15 | void unlock() const {} 16 | bool try_lock() const 17 | { 18 | return true; 19 | } 20 | }; 21 | 22 | struct null_atomic_int 23 | { 24 | int value; 25 | null_atomic_int() = default; 26 | 27 | explicit null_atomic_int(int new_value) 28 | : value(new_value) 29 | {} 30 | 31 | int load(std::memory_order = std::memory_order_relaxed) const 32 | { 33 | return value; 34 | } 35 | 36 | void store(int new_value, std::memory_order = std::memory_order_relaxed) 37 | { 38 | value = new_value; 39 | } 40 | 41 | int exchange(int new_value, std::memory_order = std::memory_order_relaxed) 42 | { 43 | std::swap(new_value, value); 44 | return new_value; // return value before the call 45 | } 46 | }; 47 | 48 | } // namespace details 49 | } // namespace spdlog 50 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/periodic_worker-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | namespace spdlog { 11 | namespace details { 12 | 13 | SPDLOG_INLINE periodic_worker::periodic_worker(const std::function &callback_fun, std::chrono::seconds interval) 14 | { 15 | active_ = (interval > std::chrono::seconds::zero()); 16 | if (!active_) 17 | { 18 | return; 19 | } 20 | 21 | worker_thread_ = std::thread([this, callback_fun, interval]() { 22 | for (;;) 23 | { 24 | std::unique_lock lock(this->mutex_); 25 | if (this->cv_.wait_for(lock, interval, [this] { return !this->active_; })) 26 | { 27 | return; // active_ == false, so exit this thread 28 | } 29 | callback_fun(); 30 | } 31 | }); 32 | } 33 | 34 | // stop the worker thread and join it 35 | SPDLOG_INLINE periodic_worker::~periodic_worker() 36 | { 37 | if (worker_thread_.joinable()) 38 | { 39 | { 40 | std::lock_guard lock(mutex_); 41 | active_ = false; 42 | } 43 | cv_.notify_one(); 44 | worker_thread_.join(); 45 | } 46 | } 47 | 48 | } // namespace details 49 | } // namespace spdlog 50 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/periodic_worker.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | // periodic worker thread - periodically executes the given callback function. 7 | // 8 | // RAII over the owned thread: 9 | // creates the thread on construction. 10 | // stops and joins the thread on destruction (if the thread is executing a callback, wait for it to finish first). 
11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | namespace spdlog { 18 | namespace details { 19 | 20 | class SPDLOG_API periodic_worker 21 | { 22 | public: 23 | periodic_worker(const std::function &callback_fun, std::chrono::seconds interval); 24 | periodic_worker(const periodic_worker &) = delete; 25 | periodic_worker &operator=(const periodic_worker &) = delete; 26 | // stop the worker thread and join it 27 | ~periodic_worker(); 28 | 29 | private: 30 | bool active_; 31 | std::thread worker_thread_; 32 | std::mutex mutex_; 33 | std::condition_variable cv_; 34 | }; 35 | } // namespace details 36 | } // namespace spdlog 37 | 38 | #ifdef SPDLOG_HEADER_ONLY 39 | #include "periodic_worker-inl.h" 40 | #endif 41 | -------------------------------------------------------------------------------- /vulkan/spdlog/details/synchronous_factory.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include "registry.h" 7 | 8 | namespace spdlog { 9 | 10 | // Default logger factory- creates synchronous loggers 11 | class logger; 12 | 13 | struct synchronous_factory 14 | { 15 | template 16 | static std::shared_ptr create(std::string logger_name, SinkArgs &&...args) 17 | { 18 | auto sink = std::make_shared(std::forward(args)...); 19 | auto new_logger = std::make_shared(std::move(logger_name), std::move(sink)); 20 | details::registry::instance().initialize_logger(new_logger); 21 | return new_logger; 22 | } 23 | }; 24 | } // namespace spdlog -------------------------------------------------------------------------------- /vulkan/spdlog/details/windows_include.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NOMINMAX 4 | #define NOMINMAX // prevent windows redefining min/max 5 | #endif 6 | 7 | #ifndef WIN32_LEAN_AND_MEAN 8 | #define WIN32_LEAN_AND_MEAN 9 | #endif 10 | 11 | #include 12 | -------------------------------------------------------------------------------- /vulkan/spdlog/fmt/bundled/LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 - present, Victor Zverovich 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | --- Optional exception to the license --- 23 | 24 | As an exception, if, as a result of your compiling your source code, portions 25 | of this Software are embedded into a machine-executable object form of such 26 | source code, you may redistribute such embedded portions in such object form 27 | without including the above copyright and permission notices. 28 | -------------------------------------------------------------------------------- /vulkan/spdlog/fmt/bundled/posix.h: -------------------------------------------------------------------------------- 1 | #include "os.h" 2 | #warning "fmt/posix.h is deprecated; use fmt/os.h instead" 3 | -------------------------------------------------------------------------------- /vulkan/spdlog/fmt/chrono.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright(c) 2016 Gabi Melman. 
3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 4 | // 5 | 6 | #pragma once 7 | // 8 | // include bundled or external copy of fmtlib's chrono support 9 | // 10 | 11 | #if !defined(SPDLOG_FMT_EXTERNAL) 12 | #ifdef SPDLOG_HEADER_ONLY 13 | #ifndef FMT_HEADER_ONLY 14 | #define FMT_HEADER_ONLY 15 | #endif 16 | #endif 17 | #include 18 | #else 19 | #include 20 | #endif 21 | -------------------------------------------------------------------------------- /vulkan/spdlog/fmt/fmt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright(c) 2016-2018 Gabi Melman. 3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 4 | // 5 | 6 | #pragma once 7 | 8 | // 9 | // Include a bundled header-only copy of fmtlib or an external one. 10 | // By default spdlog include its own copy. 11 | // 12 | 13 | #if !defined(SPDLOG_FMT_EXTERNAL) 14 | #if !defined(SPDLOG_COMPILED_LIB) && !defined(FMT_HEADER_ONLY) 15 | #define FMT_HEADER_ONLY 16 | #endif 17 | #ifndef FMT_USE_WINDOWS_H 18 | #define FMT_USE_WINDOWS_H 0 19 | #endif 20 | // enable the 'n' flag in for backward compatibility with fmt 6.x 21 | #define FMT_DEPRECATED_N_SPECIFIER 22 | #include 23 | #include 24 | #else // SPDLOG_FMT_EXTERNAL is defined - use external fmtlib 25 | #include 26 | #include 27 | #endif -------------------------------------------------------------------------------- /vulkan/spdlog/fmt/ostr.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright(c) 2016 Gabi Melman. 
3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 4 | // 5 | 6 | #pragma once 7 | // 8 | // include bundled or external copy of fmtlib's ostream support 9 | // 10 | 11 | #if !defined(SPDLOG_FMT_EXTERNAL) 12 | #ifdef SPDLOG_HEADER_ONLY 13 | #ifndef FMT_HEADER_ONLY 14 | #define FMT_HEADER_ONLY 15 | #endif 16 | #endif 17 | #include 18 | #else 19 | #include 20 | #endif 21 | -------------------------------------------------------------------------------- /vulkan/spdlog/formatter.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | 11 | class formatter 12 | { 13 | public: 14 | virtual ~formatter() = default; 15 | virtual void format(const details::log_msg &msg, memory_buf_t &dest) = 0; 16 | virtual std::unique_ptr clone() const = 0; 17 | }; 18 | } // namespace spdlog 19 | -------------------------------------------------------------------------------- /vulkan/spdlog/fwd.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | namespace spdlog { 7 | class logger; 8 | class formatter; 9 | 10 | namespace sinks { 11 | class sink; 12 | } 13 | 14 | } // namespace spdlog 15 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/base_sink-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | template 16 | SPDLOG_INLINE spdlog::sinks::base_sink::base_sink() 17 | : formatter_{details::make_unique()} 18 | {} 19 | 20 | template 21 | SPDLOG_INLINE spdlog::sinks::base_sink::base_sink(std::unique_ptr formatter) 22 | : formatter_{std::move(formatter)} 23 | {} 24 | 25 | template 26 | void SPDLOG_INLINE spdlog::sinks::base_sink::log(const details::log_msg &msg) 27 | { 28 | std::lock_guard lock(mutex_); 29 | sink_it_(msg); 30 | } 31 | 32 | template 33 | void SPDLOG_INLINE spdlog::sinks::base_sink::flush() 34 | { 35 | std::lock_guard lock(mutex_); 36 | flush_(); 37 | } 38 | 39 | template 40 | void SPDLOG_INLINE spdlog::sinks::base_sink::set_pattern(const std::string &pattern) 41 | { 42 | std::lock_guard lock(mutex_); 43 | set_pattern_(pattern); 44 | } 45 | 46 | template 47 | void SPDLOG_INLINE spdlog::sinks::base_sink::set_formatter(std::unique_ptr sink_formatter) 48 | { 49 | std::lock_guard lock(mutex_); 50 | set_formatter_(std::move(sink_formatter)); 51 | } 52 | 53 | template 54 | void SPDLOG_INLINE spdlog::sinks::base_sink::set_pattern_(const std::string &pattern) 55 | { 56 | set_formatter_(details::make_unique(pattern)); 57 | } 58 | 59 | template 60 | void SPDLOG_INLINE spdlog::sinks::base_sink::set_formatter_(std::unique_ptr sink_formatter) 61 | { 62 | formatter_ = std::move(sink_formatter); 63 | } 64 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/base_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | // 6 | // base sink templated over a mutex (either dummy or real) 7 | // concrete implementation should override the sink_it_() and flush_() methods. 8 | // locking is taken care of in this class - no locking needed by the 9 | // implementers.. 10 | // 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace spdlog { 17 | namespace sinks { 18 | template 19 | class base_sink : public sink 20 | { 21 | public: 22 | base_sink(); 23 | explicit base_sink(std::unique_ptr formatter); 24 | ~base_sink() override = default; 25 | 26 | base_sink(const base_sink &) = delete; 27 | base_sink(base_sink &&) = delete; 28 | 29 | base_sink &operator=(const base_sink &) = delete; 30 | base_sink &operator=(base_sink &&) = delete; 31 | 32 | void log(const details::log_msg &msg) final; 33 | void flush() final; 34 | void set_pattern(const std::string &pattern) final; 35 | void set_formatter(std::unique_ptr sink_formatter) final; 36 | 37 | protected: 38 | // sink formatter 39 | std::unique_ptr formatter_; 40 | Mutex mutex_; 41 | 42 | virtual void sink_it_(const details::log_msg &msg) = 0; 43 | virtual void flush_() = 0; 44 | virtual void set_pattern_(const std::string &pattern); 45 | virtual void set_formatter_(std::unique_ptr sink_formatter); 46 | }; 47 | } // namespace sinks 48 | } // namespace spdlog 49 | 50 | #ifdef SPDLOG_HEADER_ONLY 51 | #include "base_sink-inl.h" 52 | #endif 53 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/basic_file_sink-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | #include 12 | 13 | namespace spdlog { 14 | namespace sinks { 15 | 16 | template 17 | SPDLOG_INLINE basic_file_sink::basic_file_sink(const filename_t &filename, bool truncate) 18 | { 19 | file_helper_.open(filename, truncate); 20 | } 21 | 22 | template 23 | SPDLOG_INLINE const filename_t &basic_file_sink::filename() const 24 | { 25 | return file_helper_.filename(); 26 | } 27 | 28 | template 29 | SPDLOG_INLINE void basic_file_sink::sink_it_(const details::log_msg &msg) 30 | { 31 | memory_buf_t formatted; 32 | base_sink::formatter_->format(msg, formatted); 33 | file_helper_.write(formatted); 34 | } 35 | 36 | template 37 | SPDLOG_INLINE void basic_file_sink::flush_() 38 | { 39 | file_helper_.flush(); 40 | } 41 | 42 | } // namespace sinks 43 | } // namespace spdlog 44 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/basic_file_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace spdlog { 15 | namespace sinks { 16 | /* 17 | * Trivial file sink with single file as target 18 | */ 19 | template 20 | class basic_file_sink final : public base_sink 21 | { 22 | public: 23 | explicit basic_file_sink(const filename_t &filename, bool truncate = false); 24 | const filename_t &filename() const; 25 | 26 | protected: 27 | void sink_it_(const details::log_msg &msg) override; 28 | void flush_() override; 29 | 30 | private: 31 | details::file_helper file_helper_; 32 | }; 33 | 34 | using basic_file_sink_mt = basic_file_sink; 35 | using basic_file_sink_st = basic_file_sink; 36 | 37 | } // namespace sinks 38 | 39 | // 40 | // factory functions 41 | // 42 | template 43 | inline std::shared_ptr basic_logger_mt(const std::string &logger_name, const filename_t &filename, bool truncate = false) 44 | { 45 | return Factory::template create(logger_name, filename, truncate); 46 | } 47 | 48 | template 49 | inline std::shared_ptr basic_logger_st(const std::string &logger_name, const filename_t &filename, bool truncate = false) 50 | { 51 | return Factory::template create(logger_name, filename, truncate); 52 | } 53 | 54 | } // namespace spdlog 55 | 56 | #ifdef SPDLOG_HEADER_ONLY 57 | #include "basic_file_sink-inl.h" 58 | #endif -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/dist_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include "base_sink.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Distribution sink (mux). 
Stores a vector of sinks which get called when log 17 | // is called 18 | 19 | namespace spdlog { 20 | namespace sinks { 21 | 22 | template 23 | class dist_sink : public base_sink 24 | { 25 | public: 26 | dist_sink() = default; 27 | explicit dist_sink(std::vector> sinks) 28 | : sinks_(sinks) 29 | {} 30 | 31 | dist_sink(const dist_sink &) = delete; 32 | dist_sink &operator=(const dist_sink &) = delete; 33 | 34 | void add_sink(std::shared_ptr sink) 35 | { 36 | std::lock_guard lock(base_sink::mutex_); 37 | sinks_.push_back(sink); 38 | } 39 | 40 | void remove_sink(std::shared_ptr sink) 41 | { 42 | std::lock_guard lock(base_sink::mutex_); 43 | sinks_.erase(std::remove(sinks_.begin(), sinks_.end(), sink), sinks_.end()); 44 | } 45 | 46 | void set_sinks(std::vector> sinks) 47 | { 48 | std::lock_guard lock(base_sink::mutex_); 49 | sinks_ = std::move(sinks); 50 | } 51 | 52 | std::vector> &sinks() 53 | { 54 | return sinks_; 55 | } 56 | 57 | protected: 58 | void sink_it_(const details::log_msg &msg) override 59 | { 60 | for (auto &sink : sinks_) 61 | { 62 | if (sink->should_log(msg.level)) 63 | { 64 | sink->log(msg); 65 | } 66 | } 67 | } 68 | 69 | void flush_() override 70 | { 71 | for (auto &sink : sinks_) 72 | { 73 | sink->flush(); 74 | } 75 | } 76 | 77 | void set_pattern_(const std::string &pattern) override 78 | { 79 | set_formatter_(details::make_unique(pattern)); 80 | } 81 | 82 | void set_formatter_(std::unique_ptr sink_formatter) override 83 | { 84 | base_sink::formatter_ = std::move(sink_formatter); 85 | for (auto &sink : sinks_) 86 | { 87 | sink->set_formatter(base_sink::formatter_->clone()); 88 | } 89 | } 90 | std::vector> sinks_; 91 | }; 92 | 93 | using dist_sink_mt = dist_sink; 94 | using dist_sink_st = dist_sink; 95 | 96 | } // namespace sinks 97 | } // namespace spdlog 98 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/msvc_sink.h: 
-------------------------------------------------------------------------------- 1 | // Copyright(c) 2016 Alexander Dalshov. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #if defined(_WIN32) 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | 15 | // Avoid including windows.h (https://stackoverflow.com/a/30741042) 16 | extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA(const char *lpOutputString); 17 | 18 | namespace spdlog { 19 | namespace sinks { 20 | /* 21 | * MSVC sink (logging using OutputDebugStringA) 22 | */ 23 | template 24 | class msvc_sink : public base_sink 25 | { 26 | public: 27 | msvc_sink() = default; 28 | 29 | protected: 30 | void sink_it_(const details::log_msg &msg) override 31 | { 32 | memory_buf_t formatted; 33 | base_sink::formatter_->format(msg, formatted); 34 | OutputDebugStringA(fmt::to_string(formatted).c_str()); 35 | } 36 | 37 | void flush_() override {} 38 | }; 39 | 40 | using msvc_sink_mt = msvc_sink; 41 | using msvc_sink_st = msvc_sink; 42 | 43 | using windebug_sink_mt = msvc_sink_mt; 44 | using windebug_sink_st = msvc_sink_st; 45 | 46 | } // namespace sinks 47 | } // namespace spdlog 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/null_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | namespace spdlog { 13 | namespace sinks { 14 | 15 | template 16 | class null_sink : public base_sink 17 | { 18 | protected: 19 | void sink_it_(const details::log_msg &) override {} 20 | void flush_() override {} 21 | }; 22 | 23 | using null_sink_mt = null_sink; 24 | using null_sink_st = null_sink; 25 | 26 | } // namespace sinks 27 | 28 | template 29 | inline std::shared_ptr null_logger_mt(const std::string &logger_name) 30 | { 31 | auto null_logger = Factory::template create(logger_name); 32 | null_logger->set_level(level::off); 33 | return null_logger; 34 | } 35 | 36 | template 37 | inline std::shared_ptr null_logger_st(const std::string &logger_name) 38 | { 39 | auto null_logger = Factory::template create(logger_name); 40 | null_logger->set_level(level::off); 41 | return null_logger; 42 | } 43 | 44 | } // namespace spdlog 45 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/ostream_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | namespace spdlog { 13 | namespace sinks { 14 | template 15 | class ostream_sink final : public base_sink 16 | { 17 | public: 18 | explicit ostream_sink(std::ostream &os, bool force_flush = false) 19 | : ostream_(os) 20 | , force_flush_(force_flush) 21 | {} 22 | ostream_sink(const ostream_sink &) = delete; 23 | ostream_sink &operator=(const ostream_sink &) = delete; 24 | 25 | protected: 26 | void sink_it_(const details::log_msg &msg) override 27 | { 28 | memory_buf_t formatted; 29 | base_sink::formatter_->format(msg, formatted); 30 | ostream_.write(formatted.data(), static_cast(formatted.size())); 31 | if (force_flush_) 32 | { 33 | ostream_.flush(); 34 | } 35 | } 36 | 37 | void flush_() override 38 | { 39 | ostream_.flush(); 40 | } 41 | 42 | std::ostream &ostream_; 43 | bool force_flush_; 44 | }; 45 | 46 | using ostream_sink_mt = ostream_sink; 47 | using ostream_sink_st = ostream_sink; 48 | 49 | } // namespace sinks 50 | } // namespace spdlog 51 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/ringbuffer_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include "spdlog/sinks/base_sink.h" 7 | #include "spdlog/details/circular_q.h" 8 | #include "spdlog/details/log_msg_buffer.h" 9 | #include "spdlog/details/null_mutex.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace spdlog { 16 | namespace sinks { 17 | /* 18 | * Ring buffer sink 19 | */ 20 | template 21 | class ringbuffer_sink final : public base_sink 22 | { 23 | public: 24 | explicit ringbuffer_sink(size_t n_items) 25 | : q_{n_items} 26 | {} 27 | 28 | std::vector last_raw(size_t lim = 0) 29 | { 30 | std::lock_guard lock(base_sink::mutex_); 31 | auto items_available = q_.size(); 32 | auto n_items = lim > 0 ? (std::min)(lim, items_available) : items_available; 33 | std::vector ret; 34 | ret.reserve(n_items); 35 | for (size_t i = (items_available - n_items); i < items_available; i++) 36 | { 37 | ret.push_back(q_.at(i)); 38 | } 39 | return ret; 40 | } 41 | 42 | std::vector last_formatted(size_t lim = 0) 43 | { 44 | std::lock_guard lock(base_sink::mutex_); 45 | auto items_available = q_.size(); 46 | auto n_items = lim > 0 ? 
(std::min)(lim, items_available) : items_available; 47 | std::vector ret; 48 | ret.reserve(n_items); 49 | for (size_t i = (items_available - n_items); i < items_available; i++) 50 | { 51 | memory_buf_t formatted; 52 | base_sink::formatter_->format(q_.at(i), formatted); 53 | ret.push_back(fmt::to_string(formatted)); 54 | } 55 | return ret; 56 | } 57 | 58 | protected: 59 | void sink_it_(const details::log_msg &msg) override 60 | { 61 | q_.push_back(details::log_msg_buffer{msg}); 62 | } 63 | void flush_() override {} 64 | 65 | private: 66 | details::circular_q q_; 67 | }; 68 | 69 | using ringbuffer_sink_mt = ringbuffer_sink; 70 | using ringbuffer_sink_st = ringbuffer_sink; 71 | 72 | } // namespace sinks 73 | 74 | } // namespace spdlog 75 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/rotating_file_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace spdlog { 16 | namespace sinks { 17 | 18 | // 19 | // Rotating file sink based on size 20 | // 21 | template 22 | class rotating_file_sink final : public base_sink 23 | { 24 | public: 25 | rotating_file_sink(filename_t base_filename, std::size_t max_size, std::size_t max_files, bool rotate_on_open = false); 26 | static filename_t calc_filename(const filename_t &filename, std::size_t index); 27 | filename_t filename(); 28 | 29 | protected: 30 | void sink_it_(const details::log_msg &msg) override; 31 | void flush_() override; 32 | 33 | private: 34 | // Rotate files: 35 | // log.txt -> log.1.txt 36 | // log.1.txt -> log.2.txt 37 | // log.2.txt -> log.3.txt 38 | // log.3.txt -> delete 39 | void rotate_(); 40 | 41 | // delete the target if exists, and rename the src file to target 42 | // return true on success, false otherwise. 
43 | bool rename_file_(const filename_t &src_filename, const filename_t &target_filename); 44 | 45 | filename_t base_filename_; 46 | std::size_t max_size_; 47 | std::size_t max_files_; 48 | std::size_t current_size_; 49 | details::file_helper file_helper_; 50 | }; 51 | 52 | using rotating_file_sink_mt = rotating_file_sink; 53 | using rotating_file_sink_st = rotating_file_sink; 54 | 55 | } // namespace sinks 56 | 57 | // 58 | // factory functions 59 | // 60 | 61 | template 62 | inline std::shared_ptr rotating_logger_mt( 63 | const std::string &logger_name, const filename_t &filename, size_t max_file_size, size_t max_files, bool rotate_on_open = false) 64 | { 65 | return Factory::template create(logger_name, filename, max_file_size, max_files, rotate_on_open); 66 | } 67 | 68 | template 69 | inline std::shared_ptr rotating_logger_st( 70 | const std::string &logger_name, const filename_t &filename, size_t max_file_size, size_t max_files, bool rotate_on_open = false) 71 | { 72 | return Factory::template create(logger_name, filename, max_file_size, max_files, rotate_on_open); 73 | } 74 | } // namespace spdlog 75 | 76 | #ifdef SPDLOG_HEADER_ONLY 77 | #include "rotating_file_sink-inl.h" 78 | #endif 79 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/sink-inl.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | 12 | SPDLOG_INLINE bool spdlog::sinks::sink::should_log(spdlog::level::level_enum msg_level) const 13 | { 14 | return msg_level >= level_.load(std::memory_order_relaxed); 15 | } 16 | 17 | SPDLOG_INLINE void spdlog::sinks::sink::set_level(level::level_enum log_level) 18 | { 19 | level_.store(log_level, std::memory_order_relaxed); 20 | } 21 | 22 | SPDLOG_INLINE spdlog::level::level_enum spdlog::sinks::sink::level() const 23 | { 24 | return static_cast(level_.load(std::memory_order_relaxed)); 25 | } 26 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace spdlog { 10 | 11 | namespace sinks { 12 | class SPDLOG_API sink 13 | { 14 | public: 15 | virtual ~sink() = default; 16 | virtual void log(const details::log_msg &msg) = 0; 17 | virtual void flush() = 0; 18 | virtual void set_pattern(const std::string &pattern) = 0; 19 | virtual void set_formatter(std::unique_ptr sink_formatter) = 0; 20 | 21 | void set_level(level::level_enum log_level); 22 | level::level_enum level() const; 23 | bool should_log(level::level_enum msg_level) const; 24 | 25 | protected: 26 | // sink log level - default is all 27 | level_t level_{level::trace}; 28 | }; 29 | 30 | } // namespace sinks 31 | } // namespace spdlog 32 | 33 | #ifdef SPDLOG_HEADER_ONLY 34 | #include "sink-inl.h" 35 | #endif 36 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/stdout_color_sinks-inl.h: 
-------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifndef SPDLOG_HEADER_ONLY 7 | #include 8 | #endif 9 | 10 | #include 11 | #include 12 | 13 | namespace spdlog { 14 | 15 | template 16 | SPDLOG_INLINE std::shared_ptr stdout_color_mt(const std::string &logger_name, color_mode mode) 17 | { 18 | return Factory::template create(logger_name, mode); 19 | } 20 | 21 | template 22 | SPDLOG_INLINE std::shared_ptr stdout_color_st(const std::string &logger_name, color_mode mode) 23 | { 24 | return Factory::template create(logger_name, mode); 25 | } 26 | 27 | template 28 | SPDLOG_INLINE std::shared_ptr stderr_color_mt(const std::string &logger_name, color_mode mode) 29 | { 30 | return Factory::template create(logger_name, mode); 31 | } 32 | 33 | template 34 | SPDLOG_INLINE std::shared_ptr stderr_color_st(const std::string &logger_name, color_mode mode) 35 | { 36 | return Factory::template create(logger_name, mode); 37 | } 38 | } // namespace spdlog -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/stdout_color_sinks.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #ifdef _WIN32 7 | #include 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | 14 | namespace spdlog { 15 | namespace sinks { 16 | #ifdef _WIN32 17 | using stdout_color_sink_mt = wincolor_stdout_sink_mt; 18 | using stdout_color_sink_st = wincolor_stdout_sink_st; 19 | using stderr_color_sink_mt = wincolor_stderr_sink_mt; 20 | using stderr_color_sink_st = wincolor_stderr_sink_st; 21 | #else 22 | using stdout_color_sink_mt = ansicolor_stdout_sink_mt; 23 | using stdout_color_sink_st = ansicolor_stdout_sink_st; 24 | using stderr_color_sink_mt = ansicolor_stderr_sink_mt; 25 | using stderr_color_sink_st = ansicolor_stderr_sink_st; 26 | #endif 27 | } // namespace sinks 28 | 29 | template 30 | std::shared_ptr stdout_color_mt(const std::string &logger_name, color_mode mode = color_mode::automatic); 31 | 32 | template 33 | std::shared_ptr stdout_color_st(const std::string &logger_name, color_mode mode = color_mode::automatic); 34 | 35 | template 36 | std::shared_ptr stderr_color_mt(const std::string &logger_name, color_mode mode = color_mode::automatic); 37 | 38 | template 39 | std::shared_ptr stderr_color_st(const std::string &logger_name, color_mode mode = color_mode::automatic); 40 | 41 | } // namespace spdlog 42 | 43 | #ifdef SPDLOG_HEADER_ONLY 44 | #include "stdout_color_sinks-inl.h" 45 | #endif 46 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/stdout_sinks.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef _WIN32 12 | #include 13 | #endif 14 | 15 | namespace spdlog { 16 | 17 | namespace sinks { 18 | 19 | template 20 | class stdout_sink_base : public sink 21 | { 22 | public: 23 | using mutex_t = typename ConsoleMutex::mutex_t; 24 | explicit stdout_sink_base(FILE *file); 25 | ~stdout_sink_base() override = default; 26 | 27 | stdout_sink_base(const stdout_sink_base &other) = delete; 28 | stdout_sink_base(stdout_sink_base &&other) = delete; 29 | 30 | stdout_sink_base &operator=(const stdout_sink_base &other) = delete; 31 | stdout_sink_base &operator=(stdout_sink_base &&other) = delete; 32 | 33 | void log(const details::log_msg &msg) override; 34 | void flush() override; 35 | void set_pattern(const std::string &pattern) override; 36 | 37 | void set_formatter(std::unique_ptr sink_formatter) override; 38 | 39 | protected: 40 | mutex_t &mutex_; 41 | FILE *file_; 42 | std::unique_ptr formatter_; 43 | #ifdef _WIN32 44 | HANDLE handle_; 45 | #endif // WIN32 46 | }; 47 | 48 | template 49 | class stdout_sink : public stdout_sink_base 50 | { 51 | public: 52 | stdout_sink(); 53 | }; 54 | 55 | template 56 | class stderr_sink : public stdout_sink_base 57 | { 58 | public: 59 | stderr_sink(); 60 | }; 61 | 62 | using stdout_sink_mt = stdout_sink; 63 | using stdout_sink_st = stdout_sink; 64 | 65 | using stderr_sink_mt = stderr_sink; 66 | using stderr_sink_st = stderr_sink; 67 | 68 | } // namespace sinks 69 | 70 | // factory methods 71 | template 72 | std::shared_ptr stdout_logger_mt(const std::string &logger_name); 73 | 74 | template 75 | std::shared_ptr stdout_logger_st(const std::string &logger_name); 76 | 77 | template 78 | std::shared_ptr stderr_logger_mt(const std::string &logger_name); 79 | 80 | template 81 | std::shared_ptr stderr_logger_st(const std::string &logger_name); 82 | 83 | } // namespace spdlog 84 | 85 
| #ifdef SPDLOG_HEADER_ONLY 86 | #include "stdout_sinks-inl.h" 87 | #endif 88 | -------------------------------------------------------------------------------- /vulkan/spdlog/sinks/tcp_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #ifdef _WIN32 10 | #include 11 | #else 12 | #include 13 | #endif 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #pragma once 21 | 22 | // Simple tcp client sink 23 | // Connects to remote address and send the formatted log. 24 | // Will attempt to reconnect if connection drops. 25 | // If more complicated behaviour is needed (i.e get responses), you can inherit it and override the sink_it_ method. 26 | 27 | namespace spdlog { 28 | namespace sinks { 29 | 30 | struct tcp_sink_config 31 | { 32 | std::string server_host; 33 | int server_port; 34 | bool lazy_connect = false; // if true connect on first log call instead of on construction 35 | 36 | tcp_sink_config(std::string host, int port) 37 | : server_host{std::move(host)} 38 | , server_port{port} 39 | {} 40 | }; 41 | 42 | template 43 | class tcp_sink : public spdlog::sinks::base_sink 44 | { 45 | public: 46 | // connect to tcp host/port or throw if failed 47 | // host can be hostname or ip address 48 | 49 | explicit tcp_sink(tcp_sink_config sink_config) 50 | : config_{std::move(sink_config)} 51 | { 52 | if (!config_.lazy_connect) 53 | { 54 | this->client_.connect(config_.server_host, config_.server_port); 55 | } 56 | } 57 | 58 | ~tcp_sink() override = default; 59 | 60 | protected: 61 | void sink_it_(const spdlog::details::log_msg &msg) override 62 | { 63 | spdlog::memory_buf_t formatted; 64 | spdlog::sinks::base_sink::formatter_->format(msg, formatted); 65 | if (!client_.is_connected()) 66 | { 67 | 
client_.connect(config_.server_host, config_.server_port); 68 | } 69 | client_.send(formatted.data(), formatted.size()); 70 | } 71 | 72 | void flush_() override {} 73 | tcp_sink_config config_; 74 | details::tcp_client client_; 75 | }; 76 | 77 | using tcp_sink_mt = tcp_sink; 78 | using tcp_sink_st = tcp_sink; 79 | 80 | } // namespace sinks 81 | } // namespace spdlog 82 | -------------------------------------------------------------------------------- /vulkan/spdlog/stopwatch.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include 7 | 8 | // Stopwatch support for spdlog (using std::chrono::steady_clock). 9 | // Displays elapsed seconds since construction as double. 10 | // 11 | // Usage: 12 | // 13 | // spdlog::stopwatch sw; 14 | // ... 15 | // spdlog::debug("Elapsed: {} seconds", sw); => "Elapsed 0.005116733 seconds" 16 | // spdlog::info("Elapsed: {:.6} seconds", sw); => "Elapsed 0.005163 seconds" 17 | // 18 | // 19 | // If other units are needed (e.g. millis instead of double), include "fmt/chrono.h" and use "duration_cast<..>(sw.elapsed())": 20 | // 21 | // #include 22 | //.. 23 | // using std::chrono::duration_cast; 24 | // using std::chrono::milliseconds; 25 | // spdlog::info("Elapsed {}", duration_cast(sw.elapsed())); => "Elapsed 5ms" 26 | 27 | namespace spdlog { 28 | class stopwatch 29 | { 30 | using clock = std::chrono::steady_clock; 31 | std::chrono::time_point start_tp_; 32 | 33 | public: 34 | stopwatch() 35 | : start_tp_{clock::now()} 36 | {} 37 | 38 | std::chrono::duration elapsed() const 39 | { 40 | return std::chrono::duration(clock::now() - start_tp_); 41 | } 42 | 43 | void reset() 44 | { 45 | start_tp_ = clock ::now(); 46 | } 47 | }; 48 | } // namespace spdlog 49 | 50 | // Support for fmt formatting (e.g. 
"{:012.9}" or just "{}") 51 | namespace fmt { 52 | template<> 53 | struct formatter : formatter 54 | { 55 | template 56 | auto format(const spdlog::stopwatch &sw, FormatContext &ctx) -> decltype(ctx.out()) 57 | { 58 | return formatter::format(sw.elapsed().count(), ctx); 59 | } 60 | }; 61 | } // namespace fmt 62 | -------------------------------------------------------------------------------- /vulkan/spdlog/version.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #define SPDLOG_VER_MAJOR 1 7 | #define SPDLOG_VER_MINOR 8 8 | #define SPDLOG_VER_PATCH 1 9 | 10 | #define SPDLOG_VERSION (SPDLOG_VER_MAJOR * 10000 + SPDLOG_VER_MINOR * 100 + SPDLOG_VER_PATCH) 11 | -------------------------------------------------------------------------------- /vulkan/test_MMult.cpp: -------------------------------------------------------------------------------- 1 | #include "parameters.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #define SPDLOG_ACTIVE_LEVEL 6 8 | 9 | void REF_MMult(int, int, int, float *, float *, float *); 10 | float MY_MMult(int, int, int, float *, float *, float *); 11 | void copy_matrix(int, int, float *, float *); 12 | void random_matrix(int, int, float *); 13 | float compare_matrices(int, int, float *, float *); 14 | 15 | double dclock(); 16 | 17 | int main() { 18 | int p, m, n, k; 19 | 20 | double diff; 21 | 22 | float *a, *b, *cref, *cold; 23 | 24 | std::vector> results; 25 | 26 | for (p = PFIRST; p <= PLAST; p += PINC) { 27 | m = (M == -1 ? p : M); 28 | n = (N == -1 ? p : N); 29 | k = (K == -1 ? 
p : K); 30 | 31 | /* Allocate space for the matrices */ 32 | /* Note: I create an extra column in A to make sure that 33 | prefetching beyond the matrix does not cause a segfault */ 34 | const size_t mem_size_A = m * (k + 1) * sizeof(float); 35 | const size_t mem_size_B = k * n * sizeof(float); 36 | const size_t mem_size_C = m * n * sizeof(float); 37 | constexpr size_t alignment = 64; 38 | a = (float *)std::aligned_alloc(alignment, mem_size_A * sizeof(float)); 39 | b = (float *)std::aligned_alloc(alignment, mem_size_B * sizeof(float)); 40 | cold = (float *)std::aligned_alloc(alignment, mem_size_C * sizeof(float)); 41 | cref = (float *)std::aligned_alloc(alignment, mem_size_C * sizeof(float)); 42 | 43 | /* Generate random matrices A, B, Cold */ 44 | random_matrix(m, k, a); 45 | random_matrix(k, n, b); 46 | std::memset(cold, 0, mem_size_C); 47 | std::memset(cref, 0, mem_size_C); 48 | 49 | /* Run the reference implementation so the answers can be compared */ 50 | REF_MMult(m, n, k, a, b, cref); 51 | 52 | float msecTotal = 0.0f; 53 | for (int rep = 0; rep < NREPEATS; rep++) { 54 | /* Time your implementation */ 55 | msecTotal += MY_MMult(m, n, k, a, b, cold); 56 | } 57 | 58 | diff = compare_matrices(m, n, cold, cref); 59 | if (diff > 0.5f || diff < -0.5f) { 60 | fprintf(stdout, "%d diff too big: %le\n", p, diff); 61 | exit(-1); 62 | } 63 | 64 | // Compute and print the performance 65 | float msecPerMatrixMul = msecTotal / NREPEATS; 66 | double flopsPerMatrixMul = 2.0 * m * k * n; 67 | double gflops = 68 | (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 69 | 70 | results.emplace_back(p, gflops, diff); 71 | 72 | std::free(a); 73 | std::free(b); 74 | std::free(cold); 75 | std::free(cref); 76 | } 77 | 78 | fprintf(stdout, "MY_MMult = [\n"); 79 | for (auto &item : results) { 80 | fprintf(stdout, "%d %.2f %le \n", std::get<0>(item), std::get<1>(item), 81 | std::get<2>(item)); 82 | } 83 | fprintf(stdout, "];\n"); 84 | return 0; 85 | } 86 | 
-------------------------------------------------------------------------------- /vulkan/vulkan/vk_sdk_platform.h: -------------------------------------------------------------------------------- 1 | // 2 | // File: vk_sdk_platform.h 3 | // 4 | /* 5 | * Copyright (c) 2015-2016 The Khronos Group Inc. 6 | * Copyright (c) 2015-2016 Valve Corporation 7 | * Copyright (c) 2015-2016 LunarG, Inc. 8 | * 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | */ 21 | /* Compatibility shims: Windows-only macro aliases (inline, snprintf, strdup) followed by a NOEXCEPT macro selected per compiler. */ 22 | #ifndef VK_SDK_PLATFORM_H 23 | #define VK_SDK_PLATFORM_H 24 | 25 | #if defined(_WIN32) 26 | #define NOMINMAX 27 | #ifndef __cplusplus 28 | #undef inline 29 | #define inline __inline 30 | #endif // __cplusplus 31 | 32 | #if (defined(_MSC_VER) && _MSC_VER < 1900 /*vs2015*/) 33 | // C99: 34 | // Microsoft didn't implement C99 in Visual Studio; but started adding it with 35 | // VS2013. However, VS2013 still didn't have snprintf(). The following is a 36 | // work-around (Note: The _CRT_SECURE_NO_WARNINGS macro must be set in the 37 | // "CMakeLists.txt" file). 38 | // NOTE: This is fixed in Visual Studio 2015.
39 | #define snprintf _snprintf 40 | #endif 41 | 42 | /* Sits outside the _MSC_VER < 1900 guard, so this alias applies to every _WIN32 build. */ #define strdup _strdup 43 | 44 | #endif // _WIN32 45 | 46 | // Check for noexcept support using clang, with fallback to Windows or GCC version numbers 47 | #ifndef NOEXCEPT 48 | #if defined(__clang__) 49 | #if __has_feature(cxx_noexcept) 50 | #define HAS_NOEXCEPT 51 | #endif 52 | #else 53 | /* 46 encodes GCC 4.6 as major * 10 + minor. */ #if defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC__ * 10 + __GNUC_MINOR__ >= 46 54 | #define HAS_NOEXCEPT 55 | #else 56 | #if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 190023026 && defined(_HAS_EXCEPTIONS) && _HAS_EXCEPTIONS 57 | #define HAS_NOEXCEPT 58 | #endif 59 | #endif 60 | #endif 61 | 62 | /* NOEXCEPT becomes the noexcept keyword when HAS_NOEXCEPT was set by the detection, and expands to nothing otherwise; the enclosing #ifndef NOEXCEPT skips all of this when NOEXCEPT is already defined. */ #ifdef HAS_NOEXCEPT 63 | #define NOEXCEPT noexcept 64 | #else 65 | #define NOEXCEPT 66 | #endif 67 | #endif 68 | 69 | #endif // VK_SDK_PLATFORM_H 70 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_H_ 2 | #define VULKAN_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | /* Umbrella header: includes vk_platform.h and vulkan_core.h unconditionally, then one platform header for each VK_USE_PLATFORM_* macro that is defined, and vulkan_beta.h when VK_ENABLE_BETA_EXTENSIONS is defined. NOTE(review): the bare "#include" directives below have lost their system-header targets in this copy; restore them from the upstream Khronos header before compiling. */ 10 | #include "vk_platform.h" 11 | #include "vulkan_core.h" 12 | 13 | #ifdef VK_USE_PLATFORM_ANDROID_KHR 14 | #include "vulkan_android.h" 15 | #endif 16 | 17 | #ifdef VK_USE_PLATFORM_FUCHSIA 18 | #include 19 | #include "vulkan_fuchsia.h" 20 | #endif 21 | 22 | #ifdef VK_USE_PLATFORM_IOS_MVK 23 | #include "vulkan_ios.h" 24 | #endif 25 | 26 | 27 | #ifdef VK_USE_PLATFORM_MACOS_MVK 28 | #include "vulkan_macos.h" 29 | #endif 30 | 31 | #ifdef VK_USE_PLATFORM_METAL_EXT 32 | #include "vulkan_metal.h" 33 | #endif 34 | 35 | #ifdef VK_USE_PLATFORM_VI_NN 36 | #include "vulkan_vi.h" 37 | #endif 38 | 39 | 40 | #ifdef VK_USE_PLATFORM_WAYLAND_KHR 41 | #include 42 | #include "vulkan_wayland.h" 43 | #endif 44 | 45 | 46 | #ifdef VK_USE_PLATFORM_WIN32_KHR 47 | #include 48 | #include "vulkan_win32.h" 49 | #endif 50 | 51 | 52 | #ifdef VK_USE_PLATFORM_XCB_KHR 53 | #include 54 | #include "vulkan_xcb.h" 55 | #endif 56 | 57 | 58 | #ifdef VK_USE_PLATFORM_XLIB_KHR 59 | #include 60 | #include "vulkan_xlib.h" 61 | #endif 62 | 63 | 64 | #ifdef VK_USE_PLATFORM_DIRECTFB_EXT 65 | #include 66 | #include "vulkan_directfb.h" 67 | #endif 68 | 69 | 70 | #ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT 71 | #include 72 | #include 73 | #include "vulkan_xlib_xrandr.h" 74 | #endif 75 | 76 | 77 | #ifdef VK_USE_PLATFORM_GGP 78 | #include 79 | #include "vulkan_ggp.h" 80 | #endif 81 | 82 | 83 | #ifdef VK_ENABLE_BETA_EXTENSIONS 84 | #include "vulkan_beta.h" 85 | #endif 86 | 87 | #endif // VULKAN_H_ 88 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_directfb.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_DIRECTFB_H_ 2 | #define VULKAN_DIRECTFB_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_EXT_directfb_surface: the create-info struct, PFN_ function-pointer typedefs and, unless VK_NO_PROTOTYPES is defined, the matching prototypes. IDirectFB and IDirectFBSurface are not declared in this header and must already be visible to the includer. */ 22 | #define VK_EXT_directfb_surface 1 23 | #define VK_EXT_DIRECTFB_SURFACE_SPEC_VERSION 1 24 | #define VK_EXT_DIRECTFB_SURFACE_EXTENSION_NAME "VK_EXT_directfb_surface" 25 | typedef VkFlags VkDirectFBSurfaceCreateFlagsEXT; 26 | /* Passed as pCreateInfo to vkCreateDirectFBSurfaceEXT. */ typedef struct VkDirectFBSurfaceCreateInfoEXT { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkDirectFBSurfaceCreateFlagsEXT flags; 30 | IDirectFB* dfb; 31 | IDirectFBSurface* surface; 32 | } VkDirectFBSurfaceCreateInfoEXT; 33 | 34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateDirectFBSurfaceEXT)(VkInstance instance, const VkDirectFBSurfaceCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceDirectFBPresentationSupportEXT)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, IDirectFB* dfb); 36 | 37 | #ifndef VK_NO_PROTOTYPES 38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateDirectFBSurfaceEXT( 39 | VkInstance instance, 40 | const VkDirectFBSurfaceCreateInfoEXT* pCreateInfo, 41 | const VkAllocationCallbacks* pAllocator, 42 | VkSurfaceKHR* pSurface); 43 | 44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceDirectFBPresentationSupportEXT( 45 | VkPhysicalDevice physicalDevice, 46 | uint32_t queueFamilyIndex, 47 | IDirectFB* dfb); 48 | #endif 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_fuchsia.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_FUCHSIA_H_ 2 | #define VULKAN_FUCHSIA_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_FUCHSIA_imagepipe_surface: the create-info struct, one PFN_ function-pointer typedef and, unless VK_NO_PROTOTYPES is defined, the matching prototype. zx_handle_t is not declared in this header and must already be visible to the includer. */ 22 | #define VK_FUCHSIA_imagepipe_surface 1 23 | #define VK_FUCHSIA_IMAGEPIPE_SURFACE_SPEC_VERSION 1 24 | #define VK_FUCHSIA_IMAGEPIPE_SURFACE_EXTENSION_NAME "VK_FUCHSIA_imagepipe_surface" 25 | typedef VkFlags VkImagePipeSurfaceCreateFlagsFUCHSIA; 26 | /* Passed as pCreateInfo to vkCreateImagePipeSurfaceFUCHSIA. */ typedef struct VkImagePipeSurfaceCreateInfoFUCHSIA { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkImagePipeSurfaceCreateFlagsFUCHSIA flags; 30 | zx_handle_t imagePipeHandle; 31 | } VkImagePipeSurfaceCreateInfoFUCHSIA; 32 | 33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateImagePipeSurfaceFUCHSIA)(VkInstance instance, const VkImagePipeSurfaceCreateInfoFUCHSIA* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 34 | 35 | #ifndef VK_NO_PROTOTYPES 36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateImagePipeSurfaceFUCHSIA( 37 | VkInstance instance, 38 | const VkImagePipeSurfaceCreateInfoFUCHSIA* pCreateInfo, 39 | const VkAllocationCallbacks* pAllocator, 40 | VkSurfaceKHR* pSurface); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_ggp.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_GGP_H_ 2 | #define VULKAN_GGP_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for two extensions: VK_GGP_stream_descriptor_surface (create-info struct, PFN_ typedef, optional prototype) and VK_GGP_frame_token (one struct only). GgpStreamDescriptor and GgpFrameToken are not declared in this header and must already be visible to the includer. */ 22 | #define VK_GGP_stream_descriptor_surface 1 23 | #define VK_GGP_STREAM_DESCRIPTOR_SURFACE_SPEC_VERSION 1 24 | #define VK_GGP_STREAM_DESCRIPTOR_SURFACE_EXTENSION_NAME "VK_GGP_stream_descriptor_surface" 25 | typedef VkFlags VkStreamDescriptorSurfaceCreateFlagsGGP; 26 | /* Passed as pCreateInfo to vkCreateStreamDescriptorSurfaceGGP. */ typedef struct VkStreamDescriptorSurfaceCreateInfoGGP { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkStreamDescriptorSurfaceCreateFlagsGGP flags; 30 | GgpStreamDescriptor streamDescriptor; 31 | } VkStreamDescriptorSurfaceCreateInfoGGP; 32 | 33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateStreamDescriptorSurfaceGGP)(VkInstance instance, const VkStreamDescriptorSurfaceCreateInfoGGP* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 34 | 35 | #ifndef VK_NO_PROTOTYPES 36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateStreamDescriptorSurfaceGGP( 37 | VkInstance instance, 38 | const VkStreamDescriptorSurfaceCreateInfoGGP* pCreateInfo, 39 | const VkAllocationCallbacks* pAllocator, 40 | VkSurfaceKHR* pSurface); 41 | #endif 42 | 43 | 44 | #define VK_GGP_frame_token 1 45 | #define VK_GGP_FRAME_TOKEN_SPEC_VERSION 1 46 | #define VK_GGP_FRAME_TOKEN_EXTENSION_NAME "VK_GGP_frame_token" 47 | /* Carries a single GgpFrameToken behind the same sType/pNext leading fields as the create-info struct above; no function in this header takes it directly. */ typedef struct VkPresentFrameTokenGGP { 48 | VkStructureType sType; 49 | const void* pNext; 50 | GgpFrameToken frameToken; 51 | } VkPresentFrameTokenGGP; 52 | 53 | 54 | #ifdef __cplusplus 55 | } 56 | #endif 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_ios.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_IOS_H_ 2 | #define VULKAN_IOS_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_MVK_ios_surface: the create-info struct, one PFN_ function-pointer typedef and, unless VK_NO_PROTOTYPES is defined, the matching prototype. */ 22 | #define VK_MVK_ios_surface 1 23 | #define VK_MVK_IOS_SURFACE_SPEC_VERSION 3 24 | #define VK_MVK_IOS_SURFACE_EXTENSION_NAME "VK_MVK_ios_surface" 25 | typedef VkFlags VkIOSSurfaceCreateFlagsMVK; 26 | /* Passed as pCreateInfo to vkCreateIOSSurfaceMVK; pView is an untyped const pointer, so this header needs no platform types. */ typedef struct VkIOSSurfaceCreateInfoMVK { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkIOSSurfaceCreateFlagsMVK flags; 30 | const void* pView; 31 | } VkIOSSurfaceCreateInfoMVK; 32 | 33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateIOSSurfaceMVK)(VkInstance instance, const VkIOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 34 | 35 | #ifndef VK_NO_PROTOTYPES 36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateIOSSurfaceMVK( 37 | VkInstance instance, 38 | const VkIOSSurfaceCreateInfoMVK* pCreateInfo, 39 | const VkAllocationCallbacks* pAllocator, 40 | VkSurfaceKHR* pSurface); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_macos.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_MACOS_H_ 2 | #define VULKAN_MACOS_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_MVK_macos_surface: the create-info struct, one PFN_ function-pointer typedef and, unless VK_NO_PROTOTYPES is defined, the matching prototype. */ 22 | #define VK_MVK_macos_surface 1 23 | #define VK_MVK_MACOS_SURFACE_SPEC_VERSION 3 24 | #define VK_MVK_MACOS_SURFACE_EXTENSION_NAME "VK_MVK_macos_surface" 25 | typedef VkFlags VkMacOSSurfaceCreateFlagsMVK; 26 | /* Passed as pCreateInfo to vkCreateMacOSSurfaceMVK; pView is an untyped const pointer, so this header needs no platform types. */ typedef struct VkMacOSSurfaceCreateInfoMVK { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkMacOSSurfaceCreateFlagsMVK flags; 30 | const void* pView; 31 | } VkMacOSSurfaceCreateInfoMVK; 32 | 33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateMacOSSurfaceMVK)(VkInstance instance, const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 34 | 35 | #ifndef VK_NO_PROTOTYPES 36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateMacOSSurfaceMVK( 37 | VkInstance instance, 38 | const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, 39 | const VkAllocationCallbacks* pAllocator, 40 | VkSurfaceKHR* pSurface); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_metal.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_METAL_H_ 2 | #define VULKAN_METAL_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_EXT_metal_surface: the create-info struct, one PFN_ function-pointer typedef and, unless VK_NO_PROTOTYPES is defined, the matching prototype. */ 22 | #define VK_EXT_metal_surface 1 23 | /* CAMetalLayer is forward-declared as an Objective-C class when __OBJC__ is defined and aliased to void otherwise, so the pLayer pointer below compiles in both cases. */ 24 | #ifdef __OBJC__ 25 | @class CAMetalLayer; 26 | #else 27 | typedef void CAMetalLayer; 28 | #endif 29 | 30 | #define VK_EXT_METAL_SURFACE_SPEC_VERSION 1 31 | #define VK_EXT_METAL_SURFACE_EXTENSION_NAME "VK_EXT_metal_surface" 32 | typedef VkFlags VkMetalSurfaceCreateFlagsEXT; 33 | /* Passed as pCreateInfo to vkCreateMetalSurfaceEXT. */ typedef struct VkMetalSurfaceCreateInfoEXT { 34 | VkStructureType sType; 35 | const void* pNext; 36 | VkMetalSurfaceCreateFlagsEXT flags; 37 | const CAMetalLayer* pLayer; 38 | } VkMetalSurfaceCreateInfoEXT; 39 | 40 | typedef VkResult (VKAPI_PTR *PFN_vkCreateMetalSurfaceEXT)(VkInstance instance, const VkMetalSurfaceCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 41 | 42 | #ifndef VK_NO_PROTOTYPES 43 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateMetalSurfaceEXT( 44 | VkInstance instance, 45 | const VkMetalSurfaceCreateInfoEXT* pCreateInfo, 46 | const VkAllocationCallbacks* pAllocator, 47 | VkSurfaceKHR* pSurface); 48 | #endif 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_vi.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_VI_H_ 2 | #define VULKAN_VI_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_NN_vi_surface: the create-info struct, one PFN_ function-pointer typedef and, unless VK_NO_PROTOTYPES is defined, the matching prototype. */ 22 | #define VK_NN_vi_surface 1 23 | #define VK_NN_VI_SURFACE_SPEC_VERSION 1 24 | #define VK_NN_VI_SURFACE_EXTENSION_NAME "VK_NN_vi_surface" 25 | typedef VkFlags VkViSurfaceCreateFlagsNN; 26 | /* Passed as pCreateInfo to vkCreateViSurfaceNN; window is an untyped (non-const) pointer, so this header needs no platform types. */ typedef struct VkViSurfaceCreateInfoNN { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkViSurfaceCreateFlagsNN flags; 30 | void* window; 31 | } VkViSurfaceCreateInfoNN; 32 | 33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateViSurfaceNN)(VkInstance instance, const VkViSurfaceCreateInfoNN* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 34 | 35 | #ifndef VK_NO_PROTOTYPES 36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateViSurfaceNN( 37 | VkInstance instance, 38 | const VkViSurfaceCreateInfoNN* pCreateInfo, 39 | const VkAllocationCallbacks* pAllocator, 40 | VkSurfaceKHR* pSurface); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_wayland.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_WAYLAND_H_ 2 | #define VULKAN_WAYLAND_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_KHR_wayland_surface: the create-info struct, two PFN_ function-pointer typedefs and, unless VK_NO_PROTOTYPES is defined, the matching prototypes. wl_display and wl_surface are only ever used here as "struct X*" pointers. */ 22 | #define VK_KHR_wayland_surface 1 23 | #define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 6 24 | #define VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME "VK_KHR_wayland_surface" 25 | typedef VkFlags VkWaylandSurfaceCreateFlagsKHR; 26 | /* Passed as pCreateInfo to vkCreateWaylandSurfaceKHR. */ typedef struct VkWaylandSurfaceCreateInfoKHR { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkWaylandSurfaceCreateFlagsKHR flags; 30 | struct wl_display* display; 31 | struct wl_surface* surface; 32 | } VkWaylandSurfaceCreateInfoKHR; 33 | 34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateWaylandSurfaceKHR)(VkInstance instance, const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceWaylandPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, struct wl_display* display); 36 | 37 | #ifndef VK_NO_PROTOTYPES 38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateWaylandSurfaceKHR( 39 | VkInstance instance, 40 | const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, 41 | const VkAllocationCallbacks* pAllocator, 42 | VkSurfaceKHR* pSurface); 43 | 44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceWaylandPresentationSupportKHR( 45 | VkPhysicalDevice physicalDevice, 46 | uint32_t queueFamilyIndex, 47 | struct wl_display* display); 48 | #endif 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_xcb.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_XCB_H_ 2 | #define VULKAN_XCB_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_KHR_xcb_surface: the create-info struct, two PFN_ function-pointer typedefs and, unless VK_NO_PROTOTYPES is defined, the matching prototypes. xcb_connection_t, xcb_window_t and xcb_visualid_t are not declared in this header and must already be visible to the includer. */ 22 | #define VK_KHR_xcb_surface 1 23 | #define VK_KHR_XCB_SURFACE_SPEC_VERSION 6 24 | #define VK_KHR_XCB_SURFACE_EXTENSION_NAME "VK_KHR_xcb_surface" 25 | typedef VkFlags VkXcbSurfaceCreateFlagsKHR; 26 | /* Passed as pCreateInfo to vkCreateXcbSurfaceKHR. */ typedef struct VkXcbSurfaceCreateInfoKHR { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkXcbSurfaceCreateFlagsKHR flags; 30 | xcb_connection_t* connection; 31 | xcb_window_t window; 32 | } VkXcbSurfaceCreateInfoKHR; 33 | 34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateXcbSurfaceKHR)(VkInstance instance, const VkXcbSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXcbPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, xcb_connection_t* connection, xcb_visualid_t visual_id); 36 | 37 | #ifndef VK_NO_PROTOTYPES 38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateXcbSurfaceKHR( 39 | VkInstance instance, 40 | const VkXcbSurfaceCreateInfoKHR* pCreateInfo, 41 | const VkAllocationCallbacks* pAllocator, 42 | VkSurfaceKHR* pSurface); 43 | 44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXcbPresentationSupportKHR( 45 | VkPhysicalDevice physicalDevice, 46 | uint32_t queueFamilyIndex, 47 | xcb_connection_t* connection, 48 | xcb_visualid_t visual_id); 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | } 53 | #endif 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_xlib.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_XLIB_H_ 2 | #define VULKAN_XLIB_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_KHR_xlib_surface: the create-info struct, two PFN_ function-pointer typedefs and, unless VK_NO_PROTOTYPES is defined, the matching prototypes. Display, Window and VisualID are not declared in this header and must already be visible to the includer. */ 22 | #define VK_KHR_xlib_surface 1 23 | #define VK_KHR_XLIB_SURFACE_SPEC_VERSION 6 24 | #define VK_KHR_XLIB_SURFACE_EXTENSION_NAME "VK_KHR_xlib_surface" 25 | typedef VkFlags VkXlibSurfaceCreateFlagsKHR; 26 | /* Passed as pCreateInfo to vkCreateXlibSurfaceKHR. */ typedef struct VkXlibSurfaceCreateInfoKHR { 27 | VkStructureType sType; 28 | const void* pNext; 29 | VkXlibSurfaceCreateFlagsKHR flags; 30 | Display* dpy; 31 | Window window; 32 | } VkXlibSurfaceCreateInfoKHR; 33 | 34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateXlibSurfaceKHR)(VkInstance instance, const VkXlibSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); 35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXlibPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, Display* dpy, VisualID visualID); 36 | 37 | #ifndef VK_NO_PROTOTYPES 38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateXlibSurfaceKHR( 39 | VkInstance instance, 40 | const VkXlibSurfaceCreateInfoKHR* pCreateInfo, 41 | const VkAllocationCallbacks* pAllocator, 42 | VkSurfaceKHR* pSurface); 43 | 44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXlibPresentationSupportKHR( 45 | VkPhysicalDevice physicalDevice, 46 | uint32_t queueFamilyIndex, 47 | Display* dpy, 48 | VisualID visualID); 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | } 53 | #endif 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /vulkan/vulkan/vulkan_xlib_xrandr.h: -------------------------------------------------------------------------------- 1 | #ifndef VULKAN_XLIB_XRANDR_H_ 2 | #define VULKAN_XLIB_XRANDR_H_ 1 3 | 4 | /* 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc. 6 | ** 7 | ** SPDX-License-Identifier: Apache-2.0 8 | */ 9 | 10 | /* 11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | ** 13 | */ 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | /* Declarations for VK_EXT_acquire_xlib_display: two PFN_ function-pointer typedefs and, unless VK_NO_PROTOTYPES is defined, the matching prototypes; this extension defines no struct. Display and RROutput are not declared in this header and must already be visible to the includer. */ 22 | #define VK_EXT_acquire_xlib_display 1 23 | #define VK_EXT_ACQUIRE_XLIB_DISPLAY_SPEC_VERSION 1 24 | #define VK_EXT_ACQUIRE_XLIB_DISPLAY_EXTENSION_NAME "VK_EXT_acquire_xlib_display" 25 | typedef VkResult (VKAPI_PTR *PFN_vkAcquireXlibDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, VkDisplayKHR display); 26 | typedef VkResult (VKAPI_PTR *PFN_vkGetRandROutputDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, RROutput rrOutput, VkDisplayKHR* pDisplay); 27 | 28 | #ifndef VK_NO_PROTOTYPES 29 | VKAPI_ATTR VkResult VKAPI_CALL vkAcquireXlibDisplayEXT( 30 | VkPhysicalDevice physicalDevice, 31 | Display* dpy, 32 | VkDisplayKHR display); 33 | 34 | /* Unlike vkAcquireXlibDisplayEXT, this one takes the display as an output pointer (pDisplay). */ VKAPI_ATTR VkResult VKAPI_CALL vkGetRandROutputDisplayEXT( 35 | VkPhysicalDevice physicalDevice, 36 | Display* dpy, 37 | RROutput rrOutput, 38 | VkDisplayKHR* pDisplay); 39 | #endif 40 | 41 | #ifdef __cplusplus 42 | } 43 | #endif 44 | 45 | #endif 46 | --------------------------------------------------------------------------------