├── .gitignore
├── .gitmodules
├── LICENSE
├── MatrixMul.pdf
├── README.md
├── README_ZH_CN.md
├── aarch64
    ├── CMakeLists.txt
    ├── MMult0.cpp
    ├── MMult1.cpp
    ├── MMult_4x4_10.cpp
    ├── MMult_4x4_11.cpp
    ├── MMult_4x4_12.cpp
    ├── MMult_4x4_13.cpp
    ├── MMult_4x4_14.cpp
    ├── MMult_4x4_15.cpp
    ├── MMult_4x4_16.cpp
    ├── MMult_4x4_17.cpp
    ├── MMult_4x4_18.cpp
    ├── MMult_4x4_19.cpp
    ├── MMult_4x4_20.cpp
    ├── MMult_4x4_21.cpp
    ├── MMult_4x4_8.cpp
    ├── MMult_4x4_9.cpp
    ├── REF_MMult.cpp
    ├── compare_matrices.cpp
    ├── copy_matrix.cpp
    ├── dclock.cpp
    ├── figures
    │   ├── compare_MMult0_MMult0.png
    │   ├── compare_MMult0_MMult_4x4_8.png
    │   ├── compare_MMult_4x4_10_MMult_4x4_11.png
    │   ├── compare_MMult_4x4_11_MMult_4x4_12.png
    │   ├── compare_MMult_4x4_12_MMult_4x4_13.png
    │   ├── compare_MMult_4x4_12_MMult_4x4_14.png
    │   ├── compare_MMult_4x4_12_MMult_4x4_17.png
    │   ├── compare_MMult_4x4_13_MMult_4x4_14.png
    │   ├── compare_MMult_4x4_14_MMult_4x4_15.png
    │   ├── compare_MMult_4x4_8_MMult_4x4_9.png
    │   └── compare_MMult_4x4_9_MMult_4x4_10.png
    ├── gflops_benchmark
    │   ├── clear.sh
    │   ├── func1.S
    │   ├── func2.S
    │   ├── main.c
    │   └── make.sh
    ├── makefile
    ├── output_MMult0.m
    ├── output_MMult1.m
    ├── output_MMult_4x4_10.m
    ├── output_MMult_4x4_11.m
    ├── output_MMult_4x4_12.m
    ├── output_MMult_4x4_13.m
    ├── output_MMult_4x4_14.m
    ├── output_MMult_4x4_15.m
    ├── output_MMult_4x4_16.m
    ├── output_MMult_4x4_17.m
    ├── output_MMult_4x4_18.m
    ├── output_MMult_4x4_8.m
    ├── output_MMult_4x4_9.m
    ├── output_new.m
    ├── output_old.m
    ├── parameters.h
    ├── plot.py
    ├── print_matrix.cpp
    ├── random_matrix.cpp
    └── test_MMult.cpp
├── armv7
    ├── MMult0.c
    ├── MMult1.c
    ├── MMult_4x4_19.c
    ├── MMult_4x4_20.c
    ├── MMult_4x4_21.c
    ├── MMult_4x4_8.c
    ├── MMult_4x4_9.c
    ├── PlotAll.m
    ├── REF_MMult.c
    ├── compare_matrices.c
    ├── copy_matrix.c
    ├── dclock.c
    ├── makefile
    ├── output_MMult_4x4_12.m
    ├── output_MMult_4x4_18.m
    ├── output_MMult_4x4_19.m
    ├── output_MMult_4x4_20.m
    ├── output_MMult_4x4_21.m
    ├── output_new.m
    ├── output_old.m
    ├── parameters.h
    ├── plot.py
    ├── print_matrix.c
    ├── proc_parameters.m
    ├── random_matrix.c
    └── test_MMult.c
├── cuda-int4
    └── README.md
├── cuda
    ├── .gitignore
    ├── MMult_cuBLAS_1.cpp
    ├── MMult_cuBLAS_2.cpp
    ├── MMult_cuda_10.cu
    ├── MMult_cuda_11.cu
    ├── MMult_cuda_12.cu
    ├── MMult_cuda_2.cu
    ├── MMult_cuda_3.cu
    ├── MMult_cuda_4.cu
    ├── MMult_cuda_5.cu
    ├── MMult_cuda_6.cu
    ├── MMult_cuda_7.cu
    ├── MMult_cuda_8.cu
    ├── MMult_cuda_9.cu
    ├── PlotAll.m
    ├── REF_MMult.cpp
    ├── compare_matrices.cpp
    ├── copy_matrix.cpp
    ├── dclock.cpp
    ├── helper.h
    ├── makefile
    ├── output_MMult_cuBLAS_1.m
    ├── output_MMult_cuBLAS_2.m
    ├── output_MMult_cuda_10.m
    ├── output_MMult_cuda_11.m
    ├── output_MMult_cuda_12.m
    ├── output_MMult_cuda_2.m
    ├── output_MMult_cuda_3.m
    ├── output_MMult_cuda_4.m
    ├── output_MMult_cuda_5.m
    ├── output_MMult_cuda_6.m
    ├── output_MMult_cuda_7.m
    ├── output_MMult_cuda_8.m
    ├── output_MMult_cuda_9.m
    ├── output_new.m
    ├── output_old.m
    ├── parameters.h
    ├── plot.py
    ├── print_matrix.cpp
    ├── proc_parameters.m
    ├── random_matrix.cpp
    └── test_MMult.cpp
├── images
    ├── aarch64-fp32-peak-vs-int8.png
    └── cublas-vs-MMult_cuda_12.jpg
├── requirements.txt
└── vulkan
    ├── .gitignore
    ├── MMult_vk_1.cpp
    ├── MMult_vk_2.comp
    ├── MMult_vk_2.cpp
    ├── MMult_vk_3.comp
    ├── MMult_vk_3.cpp
    ├── MMult_vk_naive.cpp
    ├── README.md
    ├── REF_MMult.cpp
    ├── Shader.hpp
    ├── benchmark
        ├── .gitignore
        ├── build.sh
        ├── gflops_fmla.cpp
        ├── gflops_fmla_1.comp
        ├── gflops_fmla_2.comp
        ├── gmem_bandwidth.comp
        ├── gmem_bandwidth.cpp
        ├── sampler_bandwidth.comp
        ├── smem_bandwidth.comp
        ├── smem_bandwidth.cpp
        ├── smem_bandwidth1.comp
        ├── smem_latency.cpp
        └── types.h
    ├── compare_matrices.cpp
    ├── copy_matrix.cpp
    ├── dclock.cpp
    ├── fmt
        ├── chrono.h
        ├── color.h
        ├── compile.h
        ├── core.h
        ├── format-inl.h
        ├── format.h
        ├── locale.h
        ├── os.h
        ├── ostream.h
        ├── posix.h
        ├── printf.h
        └── ranges.h
    ├── kompute
        └── Kompute.hpp
    ├── makefile
    ├── parameters.h
    ├── plot.py
    ├── print_matrix.cpp
    ├── random_matrix.cpp
    ├── spdlog
        ├── async.h
        ├── async_logger-inl.h
        ├── async_logger.h
        ├── cfg
        │   ├── argv.h
        │   ├── env.h
        │   ├── helpers-inl.h
        │   └── helpers.h
        ├── common-inl.h
        ├── common.h
        ├── details
        │   ├── backtracer-inl.h
        │   ├── backtracer.h
        │   ├── circular_q.h
        │   ├── console_globals.h
        │   ├── file_helper-inl.h
        │   ├── file_helper.h
        │   ├── fmt_helper.h
        │   ├── log_msg-inl.h
        │   ├── log_msg.h
        │   ├── log_msg_buffer-inl.h
        │   ├── log_msg_buffer.h
        │   ├── mpmc_blocking_q.h
        │   ├── null_mutex.h
        │   ├── os-inl.h
        │   ├── os.h
        │   ├── periodic_worker-inl.h
        │   ├── periodic_worker.h
        │   ├── registry-inl.h
        │   ├── registry.h
        │   ├── synchronous_factory.h
        │   ├── tcp_client-windows.h
        │   ├── tcp_client.h
        │   ├── thread_pool-inl.h
        │   ├── thread_pool.h
        │   └── windows_include.h
        ├── fmt
        │   ├── bin_to_hex.h
        │   ├── bundled
        │   │   ├── LICENSE.rst
        │   │   ├── chrono.h
        │   │   ├── color.h
        │   │   ├── compile.h
        │   │   ├── core.h
        │   │   ├── format-inl.h
        │   │   ├── format.h
        │   │   ├── locale.h
        │   │   ├── os.h
        │   │   ├── ostream.h
        │   │   ├── posix.h
        │   │   ├── printf.h
        │   │   └── ranges.h
        │   ├── chrono.h
        │   ├── fmt.h
        │   └── ostr.h
        ├── formatter.h
        ├── fwd.h
        ├── logger-inl.h
        ├── logger.h
        ├── pattern_formatter-inl.h
        ├── pattern_formatter.h
        ├── sinks
        │   ├── android_sink.h
        │   ├── ansicolor_sink-inl.h
        │   ├── ansicolor_sink.h
        │   ├── base_sink-inl.h
        │   ├── base_sink.h
        │   ├── basic_file_sink-inl.h
        │   ├── basic_file_sink.h
        │   ├── daily_file_sink.h
        │   ├── dist_sink.h
        │   ├── dup_filter_sink.h
        │   ├── msvc_sink.h
        │   ├── null_sink.h
        │   ├── ostream_sink.h
        │   ├── ringbuffer_sink.h
        │   ├── rotating_file_sink-inl.h
        │   ├── rotating_file_sink.h
        │   ├── sink-inl.h
        │   ├── sink.h
        │   ├── stdout_color_sinks-inl.h
        │   ├── stdout_color_sinks.h
        │   ├── stdout_sinks-inl.h
        │   ├── stdout_sinks.h
        │   ├── syslog_sink.h
        │   ├── systemd_sink.h
        │   ├── tcp_sink.h
        │   ├── win_eventlog_sink.h
        │   ├── wincolor_sink-inl.h
        │   └── wincolor_sink.h
        ├── spdlog-inl.h
        ├── spdlog.h
        ├── stopwatch.h
        ├── tweakme.h
        └── version.h
    ├── test_MMult.cpp
    └── vulkan
        ├── vk_icd.h
        ├── vk_layer.h
        ├── vk_platform.h
        ├── vk_sdk_platform.h
        ├── vulkan.h
        ├── vulkan.hpp
        ├── vulkan_android.h
        ├── vulkan_beta.h
        ├── vulkan_core.h
        ├── vulkan_directfb.h
        ├── vulkan_fuchsia.h
        ├── vulkan_ggp.h
        ├── vulkan_ios.h
        ├── vulkan_macos.h
        ├── vulkan_metal.h
        ├── vulkan_vi.h
        ├── vulkan_wayland.h
        ├── vulkan_win32.h
        ├── vulkan_xcb.h
        ├── vulkan_xlib.h
        └── vulkan_xlib_xrandr.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | aarch64/build
 2 | # Prerequisites
 3 | *.d
 4 | 
 5 | # Compiled Object files
 6 | *.slo
 7 | *.lo
 8 | *.o
 9 | *.obj
10 | *.x
11 | 
12 | # Precompiled Headers
13 | *.gch
14 | *.pch
15 | 
16 | # Compiled Dynamic libraries
17 | *.so
18 | *.dylib
19 | *.dll
20 | 
21 | # Fortran module files
22 | *.mod
23 | *.smod
24 | 
25 | # Compiled Static libraries
26 | *.lai
27 | *.la
28 | *.a
29 | *.lib
30 | 
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | 
36 | # Images and Test results
37 | src/HowToOptimizeGemm/*.png
38 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "x86"]
 2 | 	path = x86
 3 | 	url = https://github.com/flame/how-to-optimize-gemm
 4 | 	shallow = true
 5 | [submodule "kompute"]
 6 | 	shallow = true
 7 | [submodule "OpenBLAS-0.2.20"]
 8 | 	path = OpenBLAS-0.2.20
 9 | 	url = https://github.com/tpoisonooo/OpenBLAS
10 | [submodule "aarch64-int8"]
11 | 	path = aarch64-int8
12 | 	url = https://github.com/tpoisonooo/chgemm
13 | [submodule "mperf"]
14 | 	path = mperf
15 | 	url = https://github.com/tpoisonooo/mperf
16 | 


--------------------------------------------------------------------------------
/MatrixMul.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/MatrixMul.pdf


--------------------------------------------------------------------------------
/aarch64/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.15.2)
 2 | # Export compile_commands.json for editors and other tooling.
 3 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "")
 4 | 
 5 | project(how-to-optimize-gemm LANGUAGES C CXX ASM VERSION 0.1)
 6 | set(CMAKE_CXX_STANDARD 17)
 7 | # MPERF_ENABLE is off by default. The flags below are prepended to any CMAKE_CXX_FLAGS already set.
 8 | option(MPERF_ENABLE "build with mperf." OFF)
 9 | set(CMAKE_CXX_FLAGS "-O2 -g -march=native -ftree-vectorize ${CMAKE_CXX_FLAGS}")
10 | # With MPERF_ENABLE on: define MPERF=1 and build the sibling ../mperf directory.
11 | if(MPERF_ENABLE)
12 |     add_definitions(-DMPERF=1)
13 |     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../mperf ${CMAKE_CURRENT_BINARY_DIR}/mperf)
14 | endif()
15 | # add_bin(<source_file>): one executable named after the file (extension stripped), built with the shared driver and utilities.
16 | function(add_bin source_file)
17 |     get_filename_component(target_name ${source_file} NAME_WE)
18 |     add_executable(${target_name} ${source_file} test_MMult.cpp compare_matrices.cpp random_matrix.cpp copy_matrix.cpp dclock.cpp REF_MMult.cpp print_matrix.cpp)
19 |     # Link mperf only when it was enabled above.
20 |     if(MPERF_ENABLE)
21 |         target_link_libraries(${target_name} mperf)
22 |     endif()
23 | endfunction()
24 | 
25 | # One executable per kernel variant.
26 | add_bin(MMult0.cpp)
27 | add_bin(MMult1.cpp)
28 | add_bin(MMult_4x4_8.cpp)
29 | add_bin(MMult_4x4_9.cpp)
30 | add_bin(MMult_4x4_10.cpp)
31 | add_bin(MMult_4x4_11.cpp)
32 | add_bin(MMult_4x4_12.cpp)
33 | add_bin(MMult_4x4_13.cpp)
34 | add_bin(MMult_4x4_14.cpp)
35 | add_bin(MMult_4x4_15.cpp)
36 | add_bin(MMult_4x4_16.cpp)
37 | add_bin(MMult_4x4_17.cpp)
38 | add_bin(MMult_4x4_18.cpp)
39 | add_bin(MMult_4x4_19.cpp)
40 | # NOTE(review): MMult_4x4_20.cpp and MMult_4x4_21.cpp are in this directory but not registered here - confirm this is intended.
41 | 


--------------------------------------------------------------------------------
/aarch64/MMult0.cpp:
--------------------------------------------------------------------------------
 1 | /* Naive kernel: accumulates C += A * B on row-major matrices. */
 2 | 
 3 | void MY_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb,
 4 |               float *c, int ldc) {
 5 |   /* lda, ldb and ldc are part of the common kernel signature but are not
 6 |      read here: rows of A are k floats apart, rows of B and C are n floats
 7 |      apart. */
 8 |   for (int row = 0; row < m; ++row) {       /* rows of C */
 9 |     for (int col = 0; col < n; ++col) {     /* columns of C */
10 |       for (int depth = 0; depth < k; ++depth) {
11 |         /* C(row, col) += A(row, depth) * B(depth, col) */
12 |         c[row * n + col] =
13 |             c[row * n + col] + a[row * k + depth] * b[depth * n + col];
14 |       }
15 |     }
16 |   }
17 | }
18 | 


--------------------------------------------------------------------------------
/aarch64/MMult1.cpp:
--------------------------------------------------------------------------------
 1 | /* Routine for computing C = A * B + C (the product is accumulated into C) */
 2 | 
 3 | void AddDot(int, float *, float *, int, float *);
 4 | 
 5 | void MY_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb,
 6 |               float *c, int ldc) {
 7 |   /* Accumulates C += A * B, producing one element of C per AddDot call.
 8 | 
 9 |      Storage is row-major: rows of A are k floats apart and rows of C are
10 |      n floats apart. lda and ldc are not read; ldb is forwarded to AddDot
11 |      as the distance, in floats, between vertically adjacent elements
12 |      of B. */
13 | 
14 |   for (int col = 0; col < n; ++col) {   /* outer loop: columns of C */
15 |     for (int row = 0; row < m; ++row) { /* inner loop: rows of C */
16 |       float *a_row = a + row * k;        /* start of row `row` of A */
17 |       float *b_col = b + col;            /* top of column `col` of B */
18 |       float *c_elem = c + row * n + col; /* element C(row, col) */
19 | 
20 |       /* C(row, col) += (row of A) . (column of B) */
21 |       AddDot(k, a_row, b_col, ldb, c_elem);
22 |     }
23 |   }
24 | }
25 | 
26 | /* Strided dot product that accumulates into *gamma. */
27 | void AddDot(int k, float *x, float *y, int ldb, float *gamma) {
28 |   /* Computes gamma := gamma + x' * y over k terms.
29 | 
30 |      x is read contiguously (stride 1); y is read with a stride of ldb
31 |      floats, i.e. elements y[0], y[ldb], y[2 * ldb], ... */
32 |   int offset = 0; /* index of the current element of y */
33 |   for (int idx = 0; idx < k; ++idx, offset += ldb) {
34 |     gamma[0] = gamma[0] + x[idx] * y[offset];
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/aarch64/REF_MMult.cpp:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | 
 3 | #if 0
 4 | #include <cblas.h>
 5 | /* Routine for computing C = A * B + C */
 6 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) {
 7 |   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, k,
 8 |               b, n, 0.0f, c, n);
 9 | }
10 | 
11 | #else
12 | 
13 | #define A(i, j) a[(i) * k + (j)]
14 | #define B(i, j) b[(i) * n + (j)]
15 | #define C(i, j) c[(i) * n + (j)]
16 | /* Routine for computing C = A * B + C */
17 | 
18 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) {
19 |   /* C += A * B; A is m x k, B is k x n, C is m x n, all packed row-major. */
20 |   for (int row = 0; row < m; ++row) {
21 |     for (int col = 0; col < n; ++col) {
22 |       float *dst = &c[row * n + col]; /* element being accumulated */
23 |       for (int depth = 0; depth < k; ++depth) {
24 |         *dst += a[row * k + depth] * b[depth * n + col];
25 |       }
26 |     }
27 |   }
28 | }
29 | 
30 | #undef A
31 | #undef B
32 | #undef C
33 | #endif
34 | 


--------------------------------------------------------------------------------
/aarch64/compare_matrices.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Returns the largest absolute element-wise difference between two m x n
 5 |  * row-major matrices a ("got") and b ("expect"), both with n floats per row.
 6 |  *
 7 |  * The first element at which the running maximum exceeds 0.5 is reported on
 8 |  * stdout; later mismatches are not printed.
 9 |  */
10 | float compare_matrices(int m, int n, float *a, float *b) {
11 |   float max_diff = 0.0f;
12 |   int printed = 0;
13 | 
14 |   for (int i = 0; i < m; i++) {
15 |     for (int j = 0; j < n; j++) {
16 |       const float got = a[i * n + j];
17 |       const float expect = b[i * n + j];
18 |       const float delta = got - expect;
19 |       /* Local absolute value: no macro named `abs` is defined, so the
20 |          standard headers are not affected. */
21 |       const float diff = (delta < 0.0f ? -delta : delta);
22 | 
23 |       if (diff > max_diff) {
24 |         max_diff = diff;
25 |       }
26 |       /* max_diff is never negative, so only the upper bound is tested. */
27 |       if (0 == printed && max_diff > 0.5f) {
28 |         fprintf(stdout, "error: i %d  j %d diff %f  got %f  expect %f \n", i,
29 |                 j, max_diff, got, expect);
30 |         printed = 1;
31 |       }
32 |     }
33 |   }
34 | 
35 |   return max_diff;
36 | }
37 | 


--------------------------------------------------------------------------------
/aarch64/copy_matrix.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copies the m x n row-major matrix a into b (both with n floats per row).
 3 |  */
 4 | void copy_matrix(int m, int n, float *a, float *b) {
 5 |   /* Walk row by row so both matrices are read and written contiguously. */
 6 |   for (int i = 0; i < m; i++) {
 7 |     for (int j = 0; j < n; j++) {
 8 |       b[i * n + j] = a[i * n + j];
 9 |     }
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/aarch64/dclock.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <time.h>
 3 | 
 4 | static double gtod_ref_time_sec = 0.0;
 5 | 
 6 | /* Adapted from the bl2_clock() routine in the BLIS library */
 7 | 
 8 | double dclock() {
 9 |   /* Returns seconds elapsed since the whole second in which dclock() was
10 |      first called. Reads CLOCK_MONOTONIC so that wall-clock adjustments made
11 |      between two calls cannot distort a measured interval. */
12 |   struct timespec ts;
13 | 
14 |   clock_gettime(CLOCK_MONOTONIC, &ts);
15 | 
16 |   /* 0.0 means the reference second has not been captured yet. */
17 |   if (gtod_ref_time_sec == 0.0)
18 |     gtod_ref_time_sec = (double)ts.tv_sec;
19 | 
20 |   /* Subtract the whole seconds first to keep precision in the double. */
21 |   return ((double)ts.tv_sec - gtod_ref_time_sec) + ts.tv_nsec * 1.0e-9;
22 | }
23 | 


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult0_MMult0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult0_MMult0.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult0_MMult_4x4_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult0_MMult_4x4_8.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_10_MMult_4x4_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_10_MMult_4x4_11.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_11_MMult_4x4_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_11_MMult_4x4_12.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_13.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_14.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_12_MMult_4x4_17.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_13_MMult_4x4_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_13_MMult_4x4_14.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_14_MMult_4x4_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_14_MMult_4x4_15.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_8_MMult_4x4_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_8_MMult_4x4_9.png


--------------------------------------------------------------------------------
/aarch64/figures/compare_MMult_4x4_9_MMult_4x4_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/aarch64/figures/compare_MMult_4x4_9_MMult_4x4_10.png


--------------------------------------------------------------------------------
/aarch64/gflops_benchmark/clear.sh:
--------------------------------------------------------------------------------
1 | # Remove the benchmark binary and every object file in this directory.
2 | rm -rf main *.o
3 | 


--------------------------------------------------------------------------------
/aarch64/gflops_benchmark/func1.S:
--------------------------------------------------------------------------------
 1 | .text
 2 | .align 5
 3 | .global func1 
 4 | // func1: x0 holds the number of loop passes; each pass issues 10 by-element FMLAs.
 5 | func1:
 6 | .loop1:
 7 |     fmla v0.4s, v0.4s, v16.s[0]    // v0 += v0 * v16.s[0] on four fp32 lanes
 8 |     fmla v1.4s, v1.4s, v16.s[1]
 9 |     fmla v2.4s, v2.4s, v16.s[2]
10 |     fmla v3.4s, v3.4s, v16.s[3]
11 | // Decrement the pass counter; the flags set here are consumed by the bne at the end of the loop.
12 |     subs x0, x0, #1
13 | // Each FMLA accumulates into its own register (v0-v9), so no FMLA reads a result written by another in the same pass.
14 |     fmla v4.4s, v4.4s, v17.s[0]
15 |     fmla v5.4s, v5.4s, v17.s[1]
16 |     fmla v6.4s, v6.4s, v17.s[2]
17 |     fmla v7.4s, v7.4s, v17.s[3]
18 | // NOTE(review): v8 and v9 are written without being saved; AAPCS64 lists the low halves of v8-v15 as callee-saved - confirm callers tolerate this.
19 |     fmla v8.4s, v8.4s, v18.s[0]
20 |     fmla v9.4s, v9.4s, v18.s[1]
21 |     bne .loop1                     // repeat until x0 reaches zero; the body runs before the test, so x0 == 0 on entry would wrap
22 |     ret
23 | 


--------------------------------------------------------------------------------
/aarch64/gflops_benchmark/func2.S:
--------------------------------------------------------------------------------
 1 | .text
 2 | .align 5
 3 | .global func2 
 4 | // func2: x0 holds the number of loop passes; each pass issues 10 vector FMLAs.
 5 | func2:
 6 | .loop2:
 7 |     fmla v0.4s, v0.4s, v0.4s    // v0 += v0 * v0 on four fp32 lanes
 8 |     fmla v1.4s, v1.4s, v1.4s
 9 |     fmla v2.4s, v2.4s, v2.4s
10 |     fmla v3.4s, v3.4s, v3.4s
11 | // Every FMLA reads and writes only its own register (v0-v9), so the ten instructions are independent within a pass.
12 |     fmla v4.4s, v4.4s, v4.4s
13 |     fmla v5.4s, v5.4s, v5.4s
14 |     fmla v6.4s, v6.4s, v6.4s
15 |     fmla v7.4s, v7.4s, v7.4s
16 | // NOTE(review): v8 and v9 are written without being saved; AAPCS64 lists the low halves of v8-v15 as callee-saved - confirm callers tolerate this.
17 |     fmla v8.4s, v8.4s, v8.4s
18 |     fmla v9.4s, v9.4s, v9.4s
19 | // Count down x0 and repeat until it reaches zero; the body runs before the test, so x0 == 0 on entry would wrap.
20 |     subs x0, x0, #1
21 |     bne .loop2
22 |     ret
23 | 


--------------------------------------------------------------------------------
/aarch64/gflops_benchmark/main.c:
--------------------------------------------------------------------------------
 1 | #include <time.h>
 2 | #include <stdio.h>
 3 | 
 4 | #define LOOP (1e9)      /* loop passes requested from the kernel */
 5 | #define OP_FLOATS (80)  /* flops per pass: 10 fmla x 4 lanes x 2 (mul + add) */
 6 | /* The kernels count down the full 64-bit x0 register, so the count is passed as a 64-bit integer. */
 7 | void func1(long long);
 8 | void func2(long long);
 9 | 
10 | /* Elapsed time from *start to *end in seconds (whole seconds plus the */
11 | /* nanosecond remainder scaled by 1e-9). */
12 | static double get_time(struct timespec *start, struct timespec *end) {
13 |     return (end->tv_sec - start->tv_sec) + 1e-9 * (end->tv_nsec - start->tv_nsec); }
14 | 
15 | int main() {
16 |     struct timespec start, end;
17 |     double time_used = 0.0;
18 |     /* Time one kernel call; only that call sits between the two clock reads. */
19 |     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
20 | //    func1(LOOP);
21 |     func2(LOOP);
22 |     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
23 |     /* Print LOOP * OP_FLOATS operations per elapsed second, scaled by 1e-9. */
24 |     time_used = get_time(&start, &end);
25 |     printf("perf: %.6lf \r\n", LOOP * OP_FLOATS * 1.0 * 1e-9 / time_used);
26 | }
27 | 


--------------------------------------------------------------------------------
/aarch64/gflops_benchmark/make.sh:
--------------------------------------------------------------------------------
1 | # Assemble both FMLA kernels, compile the driver, then link the benchmark.
2 | # The steps are chained with && so that a failing step stops the build
3 | # instead of linking stale or missing objects.
4 | as -o func1.o func1.S &&
5 | as -o func2.o func2.S &&
6 | gcc -c main.c &&
7 | gcc -o main main.o func2.o func1.o
8 | 


--------------------------------------------------------------------------------
/aarch64/makefile:
--------------------------------------------------------------------------------
 1 | OLD  := MMult_4x4_10
 2 | NEW  := MMult_4x4_21
 3 | # ARCH := armv7-a
 4 | # ARCH := aarch64
 5 | ARCH := native
 6 | 
 7 | #
 8 | # Builds test_MMult.x from test_MMult.o, the kernel selected by NEW and the
 9 | # shared utility objects. `make run` writes the benchmark output to
10 | # output_$(NEW).m and refreshes output_old.m / output_new.m.
11 | #
12 | 
13 | CC         := g++
14 | LINKER     := $(CC)
15 | #CFLAGS     := -O0 -g -Wall
16 | CFLAGS     := -std=c++17 -O2 -g -march=$(ARCH) -ftree-vectorize
17 | LDFLAGS    := -lm
18 | 
19 | UTIL       := copy_matrix.o \
20 |               compare_matrices.o \
21 |               random_matrix.o \
22 |               dclock.o \
23 |               REF_MMult.o \
24 |               print_matrix.o
25 | 
26 | TEST_OBJS  := test_MMult.o $(NEW).o
27 | 
28 | # None of these targets names a file.
29 | .PHONY: all run clean cleanall
30 | 
31 | %.o: %.cpp
32 | 	$(CC) $(CFLAGS) -c $< -o $@
33 | 
34 | # Always rebuild from scratch: NEW may have changed since the last build.
35 | all:
36 | 	$(MAKE) clean
37 | 	$(MAKE) test_MMult.x
38 | 
39 | # BLAS_LIB is empty unless supplied on the command line or in the environment.
40 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h
41 | 	$(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) $(BLAS_LIB) -o $@
42 | 
43 | # Each recipe line runs in its own shell, so the thread-count variables are
44 | # set on the command that actually runs the benchmark.
45 | run:
46 | 	$(MAKE) all
47 | 	echo "version = '$(NEW)';" > output_$(NEW).m
48 | 	OMP_NUM_THREADS=1 GOTO_NUM_THREADS=1 ./test_MMult.x >> output_$(NEW).m
49 | 	cp output_$(OLD).m output_old.m
50 | 	cp output_$(NEW).m output_new.m
51 | 
52 | clean:
53 | 	rm -f *.o *~ core *.x
54 | 
55 | cleanall:
56 | 	rm -f *.o *~ core *.x output*.m *.eps *.png
57 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult0.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult0';
 2 | MY_MMult = [
 3 | 40 1.542169e+00 0.000000e+00 
 4 | 80 1.404664e+00 0.000000e+00 
 5 | 120 1.360094e+00 0.000000e+00 
 6 | 160 1.331816e+00 0.000000e+00 
 7 | 200 1.329787e+00 0.000000e+00 
 8 | 240 1.321227e+00 0.000000e+00 
 9 | 280 1.319191e+00 0.000000e+00 
10 | 320 1.041146e+00 0.000000e+00 
11 | 360 1.308063e+00 0.000000e+00 
12 | 400 1.255653e+00 0.000000e+00 
13 | 440 1.229162e+00 0.000000e+00 
14 | 480 1.228200e+00 0.000000e+00 
15 | 520 1.172813e+00 0.000000e+00 
16 | 560 1.213585e+00 0.000000e+00 
17 | 600 1.216463e+00 0.000000e+00 
18 | 640 1.033379e+00 0.000000e+00 
19 | 680 1.221786e+00 0.000000e+00 
20 | 720 1.222401e+00 0.000000e+00 
21 | 760 1.238574e+00 0.000000e+00 
22 | 800 1.000343e+00 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult1.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult1';
 2 | MY_MMult = [
 3 | 40 1.560976e+00 0.000000e+00 
 4 | 80 1.424200e+00 0.000000e+00 
 5 | 120 1.364390e+00 0.000000e+00 
 6 | 160 1.331816e+00 0.000000e+00 
 7 | 200 1.327030e+00 0.000000e+00 
 8 | 240 1.321480e+00 0.000000e+00 
 9 | 280 1.290687e+00 0.000000e+00 
10 | 320 1.028823e+00 0.000000e+00 
11 | 360 1.243646e+00 0.000000e+00 
12 | 400 1.285347e+00 0.000000e+00 
13 | 440 1.267855e+00 0.000000e+00 
14 | 480 1.267428e+00 0.000000e+00 
15 | 520 1.224942e+00 0.000000e+00 
16 | 560 1.285491e+00 0.000000e+00 
17 | 600 1.269580e+00 0.000000e+00 
18 | 640 9.652605e-01 0.000000e+00 
19 | 680 1.270135e+00 0.000000e+00 
20 | 720 1.270478e+00 0.000000e+00 
21 | 760 1.250450e+00 0.000000e+00 
22 | 800 9.419695e-01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_10.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_10';
 2 | MY_MMult = [
 3 | 40 1.628223e+01 0.000000e+00 
 4 | 80 1.628223e+01 0.000000e+00 
 5 | 120 1.626016e+01 0.000000e+00 
 6 | 160 1.628223e+01 0.000000e+00 
 7 | 200 1.610738e+01 0.000000e+00 
 8 | 240 1.621622e+01 0.000000e+00 
 9 | 280 1.628223e+01 0.000000e+00 
10 | 320 1.628223e+01 0.000000e+00 
11 | 360 1.626016e+01 0.000000e+00 
12 | 400 1.628223e+01 0.000000e+00 
13 | 440 1.628223e+01 0.000000e+00 
14 | 480 1.626016e+01 0.000000e+00 
15 | 520 1.628223e+01 0.000000e+00 
16 | 560 1.628223e+01 0.000000e+00 
17 | 600 1.626016e+01 0.000000e+00 
18 | 640 1.628223e+01 0.000000e+00 
19 | 680 1.628223e+01 0.000000e+00 
20 | 720 1.626016e+01 0.000000e+00 
21 | 760 1.628223e+01 0.000000e+00 
22 | 800 1.626016e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_11.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_11';
 2 | MY_MMult = [
 3 | 40 1.548387e+01 0.000000e+00 
 4 | 80 1.552393e+01 0.000000e+00 
 5 | 120 1.554404e+01 0.000000e+00 
 6 | 160 1.560468e+01 0.000000e+00 
 7 | 200 1.556420e+01 0.000000e+00 
 8 | 240 1.550388e+01 0.000000e+00 
 9 | 280 1.560468e+01 0.000000e+00 
10 | 320 1.558442e+01 0.000000e+00 
11 | 360 1.556420e+01 0.000000e+00 
12 | 400 1.556420e+01 0.000000e+00 
13 | 440 1.558442e+01 0.000000e+00 
14 | 480 1.558442e+01 0.000000e+00 
15 | 520 1.556420e+01 0.000000e+00 
16 | 560 1.558442e+01 0.000000e+00 
17 | 600 1.558442e+01 0.000000e+00 
18 | 640 1.558442e+01 0.000000e+00 
19 | 680 1.558442e+01 0.000000e+00 
20 | 720 1.558442e+01 0.000000e+00 
21 | 760 1.558442e+01 0.000000e+00 
22 | 800 1.558442e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_12.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_12';
 2 | MY_MMult = [
 3 | 40 1.589404e+01 0.000000e+00 
 4 | 80 1.593625e+01 0.000000e+00 
 5 | 120 1.593625e+01 0.000000e+00 
 6 | 160 1.591512e+01 0.000000e+00 
 7 | 200 1.591512e+01 0.000000e+00 
 8 | 240 1.587302e+01 0.000000e+00 
 9 | 280 1.591512e+01 0.000000e+00 
10 | 320 1.595745e+01 0.000000e+00 
11 | 360 1.593625e+01 0.000000e+00 
12 | 400 1.593625e+01 0.000000e+00 
13 | 440 1.593625e+01 0.000000e+00 
14 | 480 1.595745e+01 0.000000e+00 
15 | 520 1.593625e+01 0.000000e+00 
16 | 560 1.595745e+01 0.000000e+00 
17 | 600 1.593625e+01 0.000000e+00 
18 | 640 1.593625e+01 0.000000e+00 
19 | 680 1.595745e+01 0.000000e+00 
20 | 720 1.595745e+01 0.000000e+00 
21 | 760 1.593625e+01 0.000000e+00 
22 | 800 1.593625e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_13.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_13';
 2 | MY_MMult = [
 3 | 40 1.280000e+01 0.000000e+00 
 4 | 80 1.651613e+01 0.000000e+00 
 5 | 120 1.868108e+01 0.000000e+00 
 6 | 160 1.820444e+01 0.000000e+00 
 7 | 200 1.884570e+01 0.000000e+00 
 8 | 240 1.901513e+01 0.000000e+00 
 9 | 280 1.875438e+01 0.000000e+00 
10 | 320 1.888646e+01 0.000000e+00 
11 | 360 1.906661e+01 0.000000e+00 
12 | 400 1.919328e+01 0.000000e+00 
13 | 440 1.932925e+01 0.000000e+00 
14 | 480 1.943450e+01 0.000000e+00 
15 | 520 1.929970e+01 0.000000e+00 
16 | 560 1.935163e+01 0.000000e+00 
17 | 600 1.935571e+01 0.000000e+00 
18 | 640 1.859837e+01 0.000000e+00 
19 | 680 1.943818e+01 0.000000e+00 
20 | 720 1.919161e+01 0.000000e+00 
21 | 760 1.924236e+01 0.000000e+00 
22 | 800 1.942890e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_14.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_14';
 2 | MY_MMult = [
 3 | 40 1.163636e+01 0.000000e+00 
 4 | 80 1.575385e+01 0.000000e+00 
 5 | 120 1.868108e+01 0.000000e+00 
 6 | 160 1.728270e+01 0.000000e+00 
 7 | 200 1.893491e+01 0.000000e+00 
 8 | 240 1.908075e+01 0.000000e+00 
 9 | 280 1.913862e+01 0.000000e+00 
10 | 320 1.919063e+01 0.000000e+00 
11 | 360 1.961985e+01 0.000000e+00 
12 | 400 1.943812e+01 0.000000e+00 
13 | 440 1.947063e+01 0.000000e+00 
14 | 480 1.971864e+01 0.000000e+00 
15 | 520 1.968748e+01 0.000000e+00 
16 | 560 1.950639e+01 0.000000e+00 
17 | 600 1.970084e+01 0.000000e+00 
18 | 640 1.974868e+01 0.000000e+00 
19 | 680 1.996204e+01 0.000000e+00 
20 | 720 1.966948e+01 0.000000e+00 
21 | 760 1.978483e+01 0.000000e+00 
22 | 800 1.967301e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_15.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_15';
 2 | MY_MMult = [
 3 | 40 9.142857e+00 0.000000e+00 
 4 | 80 1.204706e+01 0.000000e+00 
 5 | 120 1.355294e+01 0.000000e+00 
 6 | 160 1.317042e+01 0.000000e+00 
 7 | 200 1.360544e+01 0.000000e+00 
 8 | 240 1.366683e+01 0.000000e+00 
 9 | 280 1.370715e+01 0.000000e+00 
10 | 320 1.371333e+01 0.000000e+00 
11 | 360 1.369215e+01 0.000000e+00 
12 | 400 1.387083e+01 0.000000e+00 
13 | 440 1.384093e+01 0.000000e+00 
14 | 480 1.383957e+01 0.000000e+00 
15 | 520 1.379727e+01 0.000000e+00 
16 | 560 1.397438e+01 0.000000e+00 
17 | 600 1.392336e+01 0.000000e+00 
18 | 640 1.390574e+01 0.000000e+00 
19 | 680 1.403088e+01 0.000000e+00 
20 | 720 1.403768e+01 0.000000e+00 
21 | 760 1.398525e+01 0.000000e+00 
22 | 800 1.396560e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_16.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_16';
 2 | MY_MMult = [
 3 | 40 1.163636e+01 0.000000e+00 
 4 | 80 1.651613e+01 0.000000e+00 
 5 | 120 1.898901e+01 0.000000e+00 
 6 | 160 1.804405e+01 0.000000e+00 
 7 | 200 1.871345e+01 0.000000e+00 
 8 | 240 1.897598e+01 0.000000e+00 
 9 | 280 1.902253e+01 0.000000e+00 
10 | 320 1.899594e+01 0.000000e+00 
11 | 360 1.922373e+01 0.000000e+00 
12 | 400 1.929164e+01 0.000000e+00 
13 | 440 1.895927e+01 0.000000e+00 
14 | 480 1.928202e+01 0.000000e+00 
15 | 520 1.924292e+01 0.000000e+00 
16 | 560 1.924453e+01 0.000000e+00 
17 | 600 1.934011e+01 0.000000e+00 
18 | 640 1.938935e+01 0.000000e+00 
19 | 680 1.943878e+01 0.000000e+00 
20 | 720 1.954690e+01 0.000000e+00 
21 | 760 1.941985e+01 0.000000e+00 
22 | 800 1.953006e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_17.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_17';
 2 | MY_MMult = [
 3 | 40 1.163636e+01 0.000000e+00 
 4 | 80 1.600000e+01 0.000000e+00 
 5 | 120 1.888525e+01 0.000000e+00 
 6 | 160 1.840899e+01 0.000000e+00 
 7 | 200 1.916168e+01 0.000000e+00 
 8 | 240 1.901513e+01 0.000000e+00 
 9 | 280 1.923084e+01 0.000000e+00 
10 | 320 1.911228e+01 0.000000e+00 
11 | 360 1.939555e+01 0.000000e+00 
12 | 400 1.943812e+01 0.000000e+00 
13 | 440 1.949960e+01 0.000000e+00 
14 | 480 1.958594e+01 0.000000e+00 
15 | 520 1.944651e+01 0.000000e+00 
16 | 560 1.950747e+01 0.000000e+00 
17 | 600 1.936438e+01 0.000000e+00 
18 | 640 1.946060e+01 0.000000e+00 
19 | 680 1.958041e+01 0.000000e+00 
20 | 720 1.955765e+01 0.000000e+00 
21 | 760 1.941942e+01 0.000000e+00 
22 | 800 1.963002e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_18.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_18';
 2 | MY_MMult = [
 3 | 40 3.069054e+01 0.000000e+00 
 4 | 80 3.061224e+01 0.000000e+00 
 5 | 120 3.084833e+01 0.000000e+00 
 6 | 160 3.076923e+01 0.000000e+00 
 7 | 200 3.084833e+01 0.000000e+00 
 8 | 240 3.076923e+01 0.000000e+00 
 9 | 280 3.076923e+01 0.000000e+00 
10 | 320 3.084833e+01 0.000000e+00 
11 | 360 3.084833e+01 0.000000e+00 
12 | 400 3.076923e+01 0.000000e+00 
13 | 440 3.076923e+01 0.000000e+00 
14 | 480 3.084833e+01 0.000000e+00 
15 | 520 3.084833e+01 0.000000e+00 
16 | 560 3.084833e+01 0.000000e+00 
17 | 600 3.076923e+01 0.000000e+00 
18 | 640 3.076923e+01 0.000000e+00 
19 | 680 3.084833e+01 0.000000e+00 
20 | 720 3.084833e+01 0.000000e+00 
21 | 760 3.076923e+01 0.000000e+00 
22 | 800 3.076923e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_8.m:
--------------------------------------------------------------------------------
1 | version = 'MMult_4x4_8';
2 | MY_MMult = [
3 | error: i 0  j 0 diff 600.000000  got 800.000000  expect 200.000000 
4 | 40 diff too big: 6.000000e+02
5 | 


--------------------------------------------------------------------------------
/aarch64/output_MMult_4x4_9.m:
--------------------------------------------------------------------------------
1 | version = 'MMult_4x4_9';
2 | MY_MMult = [
3 | error: i 0  j 0 diff 600.000000  got 800.000000  expect 200.000000 
4 | 40 diff too big: 6.000000e+02
5 | 


--------------------------------------------------------------------------------
/aarch64/output_new.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_18';
 2 | MY_MMult = [
 3 | 40 3.069054e+01 0.000000e+00 
 4 | 80 3.061224e+01 0.000000e+00 
 5 | 120 3.084833e+01 0.000000e+00 
 6 | 160 3.076923e+01 0.000000e+00 
 7 | 200 3.084833e+01 0.000000e+00 
 8 | 240 3.076923e+01 0.000000e+00 
 9 | 280 3.076923e+01 0.000000e+00 
10 | 320 3.084833e+01 0.000000e+00 
11 | 360 3.084833e+01 0.000000e+00 
12 | 400 3.076923e+01 0.000000e+00 
13 | 440 3.076923e+01 0.000000e+00 
14 | 480 3.084833e+01 0.000000e+00 
15 | 520 3.084833e+01 0.000000e+00 
16 | 560 3.084833e+01 0.000000e+00 
17 | 600 3.076923e+01 0.000000e+00 
18 | 640 3.076923e+01 0.000000e+00 
19 | 680 3.084833e+01 0.000000e+00 
20 | 720 3.084833e+01 0.000000e+00 
21 | 760 3.076923e+01 0.000000e+00 
22 | 800 3.076923e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/output_old.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_10';
 2 | MY_MMult = [
 3 | 40 1.628223e+01 0.000000e+00 
 4 | 80 1.628223e+01 0.000000e+00 
 5 | 120 1.626016e+01 0.000000e+00 
 6 | 160 1.628223e+01 0.000000e+00 
 7 | 200 1.610738e+01 0.000000e+00 
 8 | 240 1.621622e+01 0.000000e+00 
 9 | 280 1.628223e+01 0.000000e+00 
10 | 320 1.628223e+01 0.000000e+00 
11 | 360 1.626016e+01 0.000000e+00 
12 | 400 1.628223e+01 0.000000e+00 
13 | 440 1.628223e+01 0.000000e+00 
14 | 480 1.626016e+01 0.000000e+00 
15 | 520 1.628223e+01 0.000000e+00 
16 | 560 1.628223e+01 0.000000e+00 
17 | 600 1.626016e+01 0.000000e+00 
18 | 640 1.628223e+01 0.000000e+00 
19 | 680 1.628223e+01 0.000000e+00 
20 | 720 1.626016e+01 0.000000e+00 
21 | 760 1.628223e+01 0.000000e+00 
22 | 800 1.626016e+01 0.000000e+00 
23 | ];
24 | 


--------------------------------------------------------------------------------
/aarch64/parameters.h:
--------------------------------------------------------------------------------
/* 
In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )".
The parameters below set the range of values that p takes on: the first
value, the last value (inclusive) and the increment between values.
*/   
#define PFIRST 48
#define PLAST  960
#define PINC   48

/* 
In the test driver, the m, n, and k dimensions are set to the values
below.  If a value equals "-1" then that dimension is bound to the
index p, given above.
*/

#define M -1
#define N -1
#define K -1

/* 
In the test driver, each experiment is repeated NREPEATS times and
the best time from these repeats is used to compute the performance.
*/

#define NREPEATS 10

/* 
Matrices A, B, and C are stored in two dimensional arrays whose
"leading dimension" may be larger than the matrix itself.  The leading
dimension is the stride, counted in array elements, between consecutive
lines of the stored matrix.  Having this number larger than the matrix
dimension tends to adversely affect performance.  LDX equals the
leading dimension of the array that stores matrix X.  If LDX=-1 the
test driver is expected to derive the leading dimension from the size
of matrix X instead of using a fixed value.

NOTE(review): this text used to speak of "double precision numbers" and
of the "row dimension".  The helper routines in this directory take
float matrices and index them as a[(i)*lda + (j)], so the stride is
presumably counted in floats between consecutive rows -- confirm
against the test driver.

Change "#if 0" to "#if 1" below to select the fixed leading dimension
of 1000 for all three matrices.
*/

#if 0
#define LDA 1000
#define LDB 1000
#define LDC 1000
#else
#define LDA -1 
#define LDB -1 
#define LDC -1 
#endif
47 | 


--------------------------------------------------------------------------------
/aarch64/plot.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | def readFile(filename):
 5 |     f = open(filename)
 6 |     sizes = [40]
 7 |     times = [0.0]
 8 |     title = ''
 9 |     try:
10 |         title = f.readline()
11 |         # skip 1 line
12 |         f.readline()
13 |         while True:
14 |             line = f.readline()
15 |             if line:
16 |                 slices = line.split(" ")
17 |                 if len(slices) <= 2:
18 |                     break;
19 |                 size = int(slices[0])
20 |                 time = float(slices[1])
21 |                 sizes.append(size)
22 |                 times.append(time)
23 |     finally:
24 |         f.close()
25 |     return title, sizes, times
26 | 
if __name__ == '__main__':
    # Overlay the curves of the previous and the current kernel, in that
    # order, using each file's first line as its legend entry.
    plt.xlabel('size')
    plt.ylabel('gflops')
    for result_file in ('output_old.m', 'output_new.m'):
        legend_text, xs, ys = readFile(result_file)
        plt.plot(xs, ys, label=legend_text)
    plt.legend()
    plt.show()
36 |     
37 | 


--------------------------------------------------------------------------------
/aarch64/print_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define A(i, j) a[(i) * lda + (j)]
 4 | 
 5 | void print_matrix(int m, int n, float *a, int lda) {
 6 |   int i, j;
 7 | 
 8 |   for (i = 0; i < m; i++) {
 9 |     for (j = 0; j < n; j++) {
10 |       printf("%.1f\t", A(i, j));
11 |     }
12 |     printf("\n");
13 |   }
14 |   printf("\n");
15 | }
16 | 


--------------------------------------------------------------------------------
/aarch64/random_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | void random_matrix(int m, int n, float *a) {
 4 | #define A(i, j) a[(i) * n + (j)]
 5 | 
 6 |   double drand48() __THROW;
 7 |   int i, j;
 8 | 
 9 |   for (i = 0; i < m; i++) {
10 |     for (j = 0; j < n; j++) {
11 | // #if 0
12 | //       A(i, j) = 2.0 * (float)drand48() - 1.0;
13 | // #else
14 | //       A(i, j) = (j - i) % 3;
15 | // #endif
16 |       A(i, j) = 1.0f;
17 |     }
18 |   }
19 | #undef A
20 | }
21 | 


--------------------------------------------------------------------------------
/armv7/MMult0.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | 
 3 | #define A(i,j) a[ (i)*lda + (j) ]
 4 | #define B(i,j) b[ (i)*ldb + (j) ]
 5 | #define C(i,j) c[ (i)*ldc + (j) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void MY_MMult( int m, int n, int k, float *a, int lda, 
10 |                                     float *b, int ldb,
11 |                                     float *c, int ldc )
12 | {
13 |   int i, j, p;
14 | 
15 |   for ( i=0; i<m; i++ ){        /* Loop over the rows of C */
16 |     for ( j=0; j<n; j++ ){        /* Loop over the columns of C */
17 |       for ( p=0; p<k; p++ ){        /* Update C( i,j ) with the inner
18 | 				       product of the ith row of A and
19 | 				       the jth column of B */
20 | 	C( i,j ) = C( i,j ) +  A( i,p ) * B( p,j );
21 |       }
22 |     }
23 |   }
24 | }
25 | 
26 | 
27 |   
28 | 


--------------------------------------------------------------------------------
/armv7/MMult1.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | 
 3 | #define A(i,j) a[ (i)*lda + (j) ]
 4 | #define B(i,j) b[ (i)*ldb + (j) ]
 5 | #define C(i,j) c[ (i)*ldc + (j) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void AddDot( int, float *, int, float *, float * );
10 | 
11 | void MY_MMult( int m, int n, int k, float *a, int lda, 
12 |                                     float *b, int ldb,
13 |                                     float *c, int ldc )
14 | {
15 |   int i, j;
16 | 
17 |   for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C */
18 |     for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
19 |       /* Update the C( i,j ) with the inner product of the ith row of A
20 | 	 and the jth column of B */
21 | 
22 |       AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
23 |     }
24 |   }
25 | }
26 | 
27 | 
28 | /* Create macro to let X( i ) equal the ith element of x */
29 | 
30 | #define X(i) x[ (i)*incx ]
31 | 
32 | void AddDot( int k, float *x, int incx,  float *y, float *gamma )
33 | {
34 |   /* compute gamma := x' * y + gamma with vectors x and y of length n.
35 | 
36 |      Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
37 |   */
38 |  
39 |   int p;
40 | 
41 |   for ( p=0; p<k; p++ ){
42 |     *gamma += X( p ) * y[ p ];     
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/armv7/PlotAll.m:
--------------------------------------------------------------------------------
%
% Compare two benchmark runs: plot the performance recorded in
% output_old.m and output_new.m on one set of axes and save the figure
% as a PNG.  Start by clearing all variables and closing all graphs.
%

clear all
close all

%
% Compute the theoretical peak from the machine description.  The script
% proc_parameters.m sets nflops_per_cycle, nprocessors and
% GHz_of_processor; their product is used as the top of the y axis.
%

proc_parameters

max_gflops = nflops_per_cycle * nprocessors * GHz_of_processor;

%
% Read in the first data set and plot it.  Running output_old defines
% the variables "version" and "MY_MMult"; column 1 of MY_MMult is the
% problem size and column 2 the measured performance.
%

output_old

version_old = version;

% The ';OLD;' part of the format string is the legend key.
plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'bo-.;OLD;' );
last = size( MY_MMult, 1 );

hold on

% x axis: 0 .. largest size of the OLD data set; y axis: 0 .. peak.
axis( [ 0 MY_MMult( last,1 ) 0 max_gflops ] );

xlabel( 'm = n = k' );
ylabel( 'GFLOPS/sec.' );

%
% Read in second data set and plot it.  Running output_new overwrites
% "version" and "MY_MMult" with the values of the new run.
%

output_new

% No trailing semicolon here, so the new version name is echoed.
version_new = version

title_string = sprintf("OLD = %s, NEW = %s", version_old, version_new);

plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'r-*;NEW;' );

title( title_string );

% The figure is saved under a name built from both version strings.
filename = sprintf( "compare_%s_%s", version_old, version_new );

print( filename, '-dpng' );
52 | 


--------------------------------------------------------------------------------
/armv7/REF_MMult.c:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | 
 3 | #define A(i,j) a[ (i)*lda + (j) ]
 4 | #define B(i,j) b[ (i)*ldb + (j) ]
 5 | #define C(i,j) c[ (i)*ldc + (j) ]
 6 | 
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void REF_MMult( int m, int n, int k, float *a, int lda, 
10 |                                     float *b, int ldb,
11 |                                     float *c, int ldc )
12 | {
13 |   int i, j, p;
14 | 
15 |   for ( i=0; i<m; i++ ){
16 |     for ( j=0; j<n; j++ ){
17 |       for ( p=0; p<k; p++ ){
18 | 	C( i,j ) = C( i,j ) +  A( i,p ) * B( p,j );
19 |       }
20 |     }
21 |   }
22 | }
23 | 
24 | 
25 |   
26 | 


--------------------------------------------------------------------------------
/armv7/compare_matrices.c:
--------------------------------------------------------------------------------
 1 | #define A( i, j ) a[ (i)*lda + (j) ]
 2 | #define B( i, j ) b[ (i)*ldb + (j) ]
 3 | #define abs( x ) ( (x) < 0.0 ? -(x) : (x) )
 4 | 
 5 | #include <stdio.h>
 6 | 
 7 | float compare_matrices( int m, int n, float *a, int lda, float *b, int ldb )
 8 | {
 9 | //    printf("\n---result----\n");
10 | //    print_matrix(m, n, a, lda);
11 | //    printf("\n-------\n");
12 | //    print_matrix(m, n, b, ldb);
13 | //    printf("\n-------\n");
14 |   int i, j;
15 |   float max_diff = 0.0, diff;
16 |   int printed = 0;
17 | 
18 |   for ( i=0; i<m; i++ ){
19 |     for ( j=0; j<n; j++ ){
20 |       diff = abs( A( i,j ) - B( i,j ) );
21 |       max_diff = ( diff > max_diff ? diff : max_diff );
22 |       if(0 == printed)
23 |       if(max_diff > 0.5f || max_diff < -0.5f) {
24 |         printf("\n error: i %d  j %d diff %f", i, j, max_diff);
25 |         printed = 1;
26 |       }
27 |     }
28 |   }
29 | 
30 |   return max_diff;
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/armv7/copy_matrix.c:
--------------------------------------------------------------------------------
 1 | #define A( i, j ) a[ (i)*lda + (j) ]
 2 | #define B( i, j ) b[ (i)*ldb + (j) ]
 3 | 
 4 | void copy_matrix( int m, int n, float *a, int lda, float *b, int ldb )
 5 | {
 6 |   int i, j;
 7 | 
 8 |   for ( j=0; j<n; j++ )
 9 |     for ( i=0; i<m; i++ )
10 |       B( i,j ) = A( i,j );
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/armv7/dclock.c:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <time.h>
 3 | 
 4 | static double gtod_ref_time_sec = 0.0;
 5 | 
 6 | /* Adapted from the bl2_clock() routine in the BLIS library */
 7 | 
 8 | double dclock()
 9 | {
10 |         double the_time, norm_sec;
11 |         struct timeval tv;
12 | 
13 |         gettimeofday( &tv, NULL );
14 | 
15 |         if ( gtod_ref_time_sec == 0.0 )
16 |                 gtod_ref_time_sec = ( double ) tv.tv_sec;
17 | 
18 |         norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
19 | 
20 |         the_time = norm_sec + tv.tv_usec * 1.0e-6;
21 | 
22 |         return the_time;
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/armv7/makefile:
--------------------------------------------------------------------------------
 1 | OLD  := MMult_4x4_19
 2 | NEW  := MMult_4x4_19
 3 | 
 4 | #
 5 | # sample makefile
 6 | #
 7 | 
 8 | CC         := g++
 9 | LINKER     := $(CC)
10 | #CFLAGS     := -O0 -g -Wall
11 | CFLAGS     := -std=c++11 -O3 -march=armv7-a -mfpu=neon -ftree-vectorize
12 | LDFLAGS    := -lm
13 | 
14 | UTIL       := copy_matrix.o \
15 |               compare_matrices.o \
16 |               random_matrix.o \
17 |               dclock.o \
18 |               REF_MMult.o \
19 |               print_matrix.o
20 | 
21 | TEST_OBJS  := test_MMult.o $(NEW).o 
22 | 
23 | %.o: %.c
24 | 	$(CC) $(CFLAGS) -c $< -o $@
25 | %.o: %.c
26 | 	$(CC) $(CFLAGS) -c $< -o $@
27 | 
28 | all: 
29 | 	make clean;
30 | 	make test_MMult.x
31 | 
32 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h
33 | 	$(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \
34 |         $(BLAS_LIB) -o $(TEST_BIN) $@ 
35 | 
36 | run:	
37 | 	make all
38 | 	export OMP_NUM_THREADS=1
39 | 	export GOTO_NUM_THREADS=1
40 | 	echo "version = '$(NEW)';" > output_$(NEW).m
41 | 	./test_MMult.x >> output_$(NEW).m
42 | 	cp output_$(OLD).m output_old.m
43 | 	cp output_$(NEW).m output_new.m
44 | 
45 | clean:
46 | 	rm -f *.o *~ core *.x
47 | 
48 | cleanall:
49 | 	rm -f *.o *~ core *.x output*.m *.eps *.png
50 | 


--------------------------------------------------------------------------------
/armv7/output_MMult_4x4_12.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_12';
 2 | MY_MMult = [
 3 | 40 1.910448e+00 0.000000e+00 
 4 | 80 1.939394e+00 0.000000e+00 
 5 | 120 2.016336e+00 0.000000e+00 
 6 | 160 1.960278e+00 0.000000e+00 
 7 | 200 1.881689e+00 0.000000e+00 
 8 | 240 1.879666e+00 0.000000e+00 
 9 | 280 1.748258e+00 7.629395e-06 
10 | 320 1.787037e+00 1.144409e-05 
11 | 360 1.817282e+00 1.525879e-05 
12 | 400 1.827318e+00 2.288818e-05 
13 | 440 1.820832e+00 2.670288e-05 
14 | 480 1.854326e+00 2.861023e-05 
15 | 520 1.784230e+00 3.242493e-05 
16 | 560 1.830802e+00 3.814697e-05 
17 | 600 1.822593e+00 4.005432e-05 
18 | 640 1.807011e+00 4.005432e-05 
19 | 680 1.829843e+00 4.386902e-05 
20 | 720 1.864469e+00 5.340576e-05 
21 | 760 1.862807e+00 4.959106e-05 
22 | 800 1.830948e+00 5.912781e-05 
23 | ];
24 | 


--------------------------------------------------------------------------------
/armv7/output_MMult_4x4_18.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_18';
 2 | MY_MMult = [
 3 | 40 1.828571e+00 0.000000e+00 
 4 | 80 2.027723e+00 0.000000e+00 
 5 | 120 2.084439e+00 0.000000e+00 
 6 | 160 1.971600e+00 0.000000e+00 
 7 | 200 1.950268e+00 0.000000e+00 
 8 | 240 1.970915e+00 0.000000e+00 
 9 | 280 1.920896e+00 1.716614e-05 
10 | 320 1.963273e+00 2.098083e-05 
11 | 360 1.976154e+00 2.098083e-05 
12 | 400 1.996444e+00 2.670288e-05 
13 | 440 2.008725e+00 2.670288e-05 
14 | 480 2.038543e+00 2.861023e-05 
15 | 520 1.977067e+00 3.242493e-05 
16 | 560 2.009877e+00 3.433228e-05 
17 | 600 2.009910e+00 3.623962e-05 
18 | 640 2.032573e+00 4.005432e-05 
19 | 680 1.975727e+00 4.386902e-05 
20 | ];
21 | 


--------------------------------------------------------------------------------
/armv7/output_MMult_4x4_19.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_19';
 2 | MY_MMult = [
 3 | 40 2.461538e+00 0.000000e+00 
 4 | 80 2.805479e+00 0.000000e+00 
 5 | 120 2.870432e+00 0.000000e+00 
 6 | 160 2.747150e+00 0.000000e+00 
 7 | 200 2.675585e+00 0.000000e+00 
 8 | 240 2.720457e+00 0.000000e+00 
 9 | 280 2.639889e+00 0.000000e+00 
10 | 320 2.748186e+00 0.000000e+00 
11 | 360 2.713741e+00 0.000000e+00 
12 | 400 2.780010e+00 0.000000e+00 
13 | 440 2.742077e+00 0.000000e+00 
14 | 480 2.835692e+00 0.000000e+00 
15 | 520 2.719270e+00 0.000000e+00 
16 | 560 2.828980e+00 0.000000e+00 
17 | 600 2.768823e+00 0.000000e+00 
18 | 640 2.866309e+00 0.000000e+00 
19 | 680 2.808218e+00 0.000000e+00 
20 | ];
21 | 


--------------------------------------------------------------------------------
/armv7/output_MMult_4x4_20.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_20';
 2 | MY_MMult = [
 3 | 40 3.121951e+00 0.000000e+00 
 4 | 80 3.555556e+00 0.000000e+00 
 5 | 120 3.700214e+00 0.000000e+00 
 6 | 160 3.499359e+00 0.000000e+00 
 7 | 200 3.285421e+00 0.000000e+00 
 8 | 240 3.454704e+00 0.000000e+00 
 9 | 280 3.278620e+00 0.000000e+00 
10 | 320 3.498425e+00 0.000000e+00 
11 | 360 3.407164e+00 0.000000e+00 
12 | 400 3.527337e+00 0.000000e+00 
13 | 440 3.441569e+00 0.000000e+00 
14 | 480 3.619914e+00 0.000000e+00 
15 | 520 3.421952e+00 0.000000e+00 
16 | 560 3.617666e+00 0.000000e+00 
17 | 


--------------------------------------------------------------------------------
/armv7/output_MMult_4x4_21.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_21';
 2 | MY_MMult = [
 3 | 40 3.200000e+00 0.000000e+00 
 4 | 80 3.778598e+00 0.000000e+00 
 5 | 120 3.967853e+00 0.000000e+00 
 6 | 160 3.756075e+00 0.000000e+00 
 7 | 200 3.474484e+00 0.000000e+00 
 8 | 240 3.681491e+00 0.000000e+00 
 9 | 280 3.468204e+00 0.000000e+00 
10 | 320 3.714561e+00 0.000000e+00 
11 | 360 3.615343e+00 0.000000e+00 
12 | 400 3.750256e+00 0.000000e+00 
13 | 440 3.659264e+00 0.000000e+00 
14 | 480 3.860374e+00 0.000000e+00 
15 | 520 3.633610e+00 0.000000e+00 
16 | 560 3.852707e+00 0.000000e+00 
17 | 600 3.730312e+00 0.000000e+00 
18 | 640 3.926516e+00 0.000000e+00 
19 | 680 3.803391e+00 0.000000e+00 
20 | ];
21 | 


--------------------------------------------------------------------------------
/armv7/output_new.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_19';
 2 | MY_MMult = [
 3 | 40 2.461538e+00 0.000000e+00 
 4 | 80 2.805479e+00 0.000000e+00 
 5 | 120 2.870432e+00 0.000000e+00 
 6 | 160 2.747150e+00 0.000000e+00 
 7 | 200 2.675585e+00 0.000000e+00 
 8 | 240 2.720457e+00 0.000000e+00 
 9 | 280 2.639889e+00 0.000000e+00 
10 | 320 2.748186e+00 0.000000e+00 
11 | 360 2.713741e+00 0.000000e+00 
12 | 400 2.780010e+00 0.000000e+00 
13 | 440 2.742077e+00 0.000000e+00 
14 | 480 2.835692e+00 0.000000e+00 
15 | 520 2.719270e+00 0.000000e+00 
16 | 560 2.828980e+00 0.000000e+00 
17 | 600 2.768823e+00 0.000000e+00 
18 | 640 2.866309e+00 0.000000e+00 
19 | 680 2.808218e+00 0.000000e+00 
20 | ];
21 | 


--------------------------------------------------------------------------------
/armv7/output_old.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_4x4_19';
 2 | MY_MMult = [
 3 | 40 2.461538e+00 0.000000e+00 
 4 | 80 2.805479e+00 0.000000e+00 
 5 | 120 2.870432e+00 0.000000e+00 
 6 | 160 2.747150e+00 0.000000e+00 
 7 | 200 2.675585e+00 0.000000e+00 
 8 | 240 2.720457e+00 0.000000e+00 
 9 | 280 2.639889e+00 0.000000e+00 
10 | 320 2.748186e+00 0.000000e+00 
11 | 360 2.713741e+00 0.000000e+00 
12 | 400 2.780010e+00 0.000000e+00 
13 | 440 2.742077e+00 0.000000e+00 
14 | 480 2.835692e+00 0.000000e+00 
15 | 520 2.719270e+00 0.000000e+00 
16 | 560 2.828980e+00 0.000000e+00 
17 | 600 2.768823e+00 0.000000e+00 
18 | 640 2.866309e+00 0.000000e+00 
19 | 680 2.808218e+00 0.000000e+00 
20 | ];
21 | 


--------------------------------------------------------------------------------
/armv7/parameters.h:
--------------------------------------------------------------------------------
/* 
In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )".
The parameters below set the range of values that p takes on: the first
value, the last value (inclusive) and the increment between values.
*/   
#define PFIRST 40
#define PLAST  700
#define PINC   40 

/* 
In the test driver, the m, n, and k dimensions are set to the values
below.  If a value equals "-1" then that dimension is bound to the
index p, given above.
*/

#define M -1
#define N -1
#define K -1

/* 
In the test driver, each experiment is repeated NREPEATS times and
the best time from these repeats is used to compute the performance.
*/

#define NREPEATS 20 

/* 
Matrices A, B, and C are stored in two dimensional arrays whose
"leading dimension" may be larger than the matrix itself.  The leading
dimension is the stride, counted in array elements, between consecutive
lines of the stored matrix.  Having this number larger than the matrix
dimension tends to adversely affect performance.  LDX equals the
leading dimension of the array that stores matrix X.  If LDX=-1 the
test driver picks the leading dimension itself: it uses m for A, k for
B and m for C.

NOTE(review): this text used to speak of "double precision numbers";
the test driver allocates float matrices, so the stride is counted in
floats.  The kernels in this directory index row-major
( a[ (i)*lda + (j) ] ), for which the driver's choice of m/k/m is only
a tight fit when m = n = k -- confirm before benchmarking non-square
sizes.

Change "#if 0" to "#if 1" below to select the fixed leading dimension
of 1000 for all three matrices.
*/

#if 0
#define LDA 1000
#define LDB 1000
#define LDC 1000
#else
#define LDA -1 
#define LDB -1 
#define LDC -1 
#endif
47 | 


--------------------------------------------------------------------------------
/armv7/plot.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | def readFile(filename):
 5 |     f = open(filename)
 6 |     sizes = [40]
 7 |     times = [0.0]
 8 |     title = ''
 9 |     try:
10 |         title = f.readline()
11 |         # skip 1 line
12 |         f.readline()
13 |         while True:
14 |             line = f.readline()
15 |             if line:
16 |                 slices = line.split(" ")
17 |                 if len(slices) <= 2:
18 |                     break;
19 |                 size = int(slices[0])
20 |                 time = float(slices[1])
21 |                 sizes.append(size)
22 |                 times.append(time)
23 |     finally:
24 |         f.close()
25 |     return title, sizes, times
26 | 
if __name__ == '__main__':
    # Draw the old result first and the new one second on shared axes; the
    # first line of each result file becomes its legend label.
    plt.xlabel('size')
    plt.ylabel('gflops')
    for source in ['output_old.m', 'output_new.m']:
        label_text, problem_sizes, values = readFile(source)
        plt.plot(problem_sizes, values, label=label_text)
    plt.legend()
    plt.show()
36 |     
37 | 


--------------------------------------------------------------------------------
/armv7/print_matrix.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define A( i, j ) a[ (i)*lda + (j) ]
 4 | 
 5 | void print_matrix( int m, int n, float *a, int lda )
 6 | {
 7 |   int i, j;
 8 | 
 9 |   for ( i=0; i<m; i++ ){
10 |       for ( j=0; j<n; j++ ) {
11 |         printf("%.1f\t", A( i,j ) );
12 |       }
13 |     printf("\n");
14 |   }
15 |   printf("\n");
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/armv7/proc_parameters.m:
--------------------------------------------------------------------------------
% Machine description used by PlotAll.m, which multiplies the three
% values below to obtain the peak GFLOPS shown as the top of the y axis.
% NOTE(review): the values must match the machine the benchmark ran on;
% confirm that they describe the current target.
%
% Indicate the number of floating point operations that can be executed
% per clock cycle
%

nflops_per_cycle = 4;

%
% Indicate the number of processors being used (in case you are using a
% multicore or SMP)
%

nprocessors = 1;

%
% Indicate the clock speed of the processor in GHz.  On a Linux machine
% this info can be found in the file /proc/cpuinfo
%
% Note: some processors have a "turbo boost" mode, which increases
% the peak clock rate...
%

GHz_of_processor = 2.6;
23 | 


--------------------------------------------------------------------------------
/armv7/random_matrix.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | #define A( i,j ) a[ (j)*lda + (i) ]
 4 | 
 5 | void random_matrix( int m, int n, float *a, int lda )
 6 | {
 7 |   double drand48();
 8 |   int i,j;
 9 | 
10 |   for ( i=0; i<m; i++ )
11 |     for ( j=0; j<n; j++ )
12 | #if 0 
13 |       A( i,j ) = 2.0 * (float)drand48( ) - 1.0;
14 | #else
15 |       A( i, j) = (j-i) % 2;
16 | #endif
17 | }
18 | 


--------------------------------------------------------------------------------
/armv7/test_MMult.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | // #include <malloc.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | 
  6 | #include "parameters.h"
  7 | 
  8 | void REF_MMult(int, int, int, float *, int, float *, int, float *, int );
  9 | void MY_MMult(int, int, int, float *, int, float *, int, float *, int );
 10 | void copy_matrix(int, int, float *, int, float *, int );
 11 | void random_matrix(int, int, float *, int);
 12 | float compare_matrices( int, int, float *, int, float *, int );
 13 | 
 14 | double dclock();
 15 | 
 16 | int main()
 17 | {
 18 |   int 
 19 |     p, 
 20 |     m, n, k,
 21 |     lda, ldb, ldc, 
 22 |     rep;
 23 | 
 24 |   double 
 25 |     dtime, dtime_best,        
 26 |     gflops, 
 27 |     diff;
 28 | 
 29 |   float 
 30 |     *a, *b, *c, *cref, *cold;    
 31 |   
 32 |   printf( "MY_MMult = [\n" );
 33 |     
 34 |   for ( p=PFIRST; p<=PLAST; p+=PINC ){
 35 |     m = ( M == -1 ? p : M );
 36 |     n = ( N == -1 ? p : N );
 37 |     k = ( K == -1 ? p : K );
 38 | 
 39 |     gflops = 2.0 * m * n * k * 1.0e-09;
 40 | 
 41 |     lda = ( LDA == -1 ? m : LDA );
 42 |     ldb = ( LDB == -1 ? k : LDB );
 43 |     ldc = ( LDC == -1 ? m : LDC );
 44 | 
 45 |     /* Allocate space for the matrices */
 46 |     /* Note: I create an extra column in A to make sure that
 47 |        prefetching beyond the matrix does not cause a segfault */
 48 |     a = ( float * ) malloc( lda * (k+1) * sizeof( float ) );  
 49 |     b = ( float * ) malloc( ldb * n * sizeof( float ) );
 50 |     c = ( float * ) malloc( ldc * n * sizeof( float ) );
 51 |     cold = ( float * ) malloc( ldc * n * sizeof( float ) );
 52 |     cref = ( float * ) malloc( ldc * n * sizeof( float ) );
 53 | 
 54 |     /* Generate random matrices A, B, Cold */
 55 |     random_matrix( m, k, a, lda );
 56 |     random_matrix( k, n, b, ldb );
 57 |     random_matrix( m, n, cold, ldc );
 58 | #if 1 
 59 |     memset(cold, 0, ldc * n * sizeof(float));
 60 | #endif
 61 | 
 62 |     copy_matrix( m, n, cold, ldc, cref, ldc );
 63 | 
 64 |     /* Run the reference implementation so the answers can be compared */
 65 | 
 66 |     REF_MMult( m, n, k, a, lda, b, ldb, cref, ldc );
 67 | 
 68 |     /* Time the "optimized" implementation */
 69 |     for ( rep=0; rep<NREPEATS; rep++ ){
 70 |       copy_matrix( m, n, cold, ldc, c, ldc );
 71 | 
 72 |       /* Time your implementation */
 73 |       dtime = dclock();
 74 | 
 75 |       MY_MMult( m, n, k, a, lda, b, ldb, c, ldc );
 76 |       
 77 |       dtime = dclock() - dtime;
 78 | 
 79 |       if ( rep==0 )
 80 |         dtime_best = dtime;
 81 |       else
 82 | 	    dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
 83 |     }
 84 | 
 85 |     diff = compare_matrices( m, n, c, ldc, cref, ldc );
 86 |     if(diff > 0.5f || diff < -0.5f){
 87 |         exit(0);
 88 |     }
 89 | 
 90 |     printf( "%d %le %le \n", p, gflops / dtime_best, diff );
 91 |     fflush( stdout );
 92 | 
 93 |     free( a );
 94 |     free( b );
 95 |     free( c );
 96 |     free( cold );
 97 |     free( cref );
 98 |   }
 99 | 
100 |   printf( "];\n" );
101 | 
102 |   exit( 0 );
103 | }
104 | 
105 | 


--------------------------------------------------------------------------------
/cuda-int4/README.md:
--------------------------------------------------------------------------------
1 | WIP
2 | 


--------------------------------------------------------------------------------
/cuda/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | run.sh
3 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuBLAS_1.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // CUDA and CUBLAS functions
10 | 
11 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
12 |               float *d_B, int ldb, float *d_C, int ldc) {
13 | 
14 |   const float alpha = 1.0f;
15 |   const float beta = 0.0f;
16 | 
17 |   checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
18 |                               d_B, n, d_A, k, &beta, d_C, n));
19 | }
20 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuBLAS_2.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // CUDA and CUBLAS functions
10 | 
11 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
12 |               float *d_B, int ldb, float *d_C, int ldc) {
13 | 
14 |   const float alpha = 1.0f;
15 |   const float beta = 0.0f;
16 | #if __CUDACC_VER_MAJOR__ >= 11
17 |     cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
18 | #else
19 |     cudaDataType_t compute_type = CUDA_R_32F;
20 | #endif
21 | 
22 | checkCudaErrors(cublasGemmEx(
23 |     handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
24 |     (void*)(&alpha), d_B, CUDA_R_32F, n, d_A, CUDA_R_32F, k,
25 |     (void*)(&beta), d_C, CUDA_R_32F, n, compute_type, CUBLAS_GEMM_DEFAULT));
26 | }
27 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_2.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | /**
10 |  * naive 实现
11 |  */
12 | template <int BLOCK>
13 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb,
14 |                       float *c, int ldc) {
15 |   int _m = blockIdx.x * BLOCK + threadIdx.x;
16 |   int _n = blockIdx.y * BLOCK + threadIdx.y;
17 |   if (_m < m and _n < n) {
18 |     float sum = 0.f;
19 |     for (int i = 0; i < k; ++i) {
20 |       sum += a[_m * k + i] * b[i * n + _n];
21 |     }
22 |     c[_m * n + _n] = sum;
23 |   }
24 | }
25 | 
26 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
27 |               float *d_B, int ldb, float *d_C, int ldc) {
28 | 
29 |   constexpr int BLOCK = 16;
30 |   // subm, subn, subk
31 |   dim3 block(BLOCK, BLOCK);
32 |   dim3 grid((m + BLOCK - 1) / BLOCK, (n + BLOCK - 1) / BLOCK);
33 | 
34 |   sgemm<BLOCK><<<grid, block>>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc);
35 | }
36 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_3.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // a = mxk, b = kxn
10 | template <int BLOCK>
11 | __global__ void sgemm(int m, int n, int k, float *a, float *b, float *c) {
12 |   // blockIdx control subpanel matrix
13 | 
14 |   const int tx = threadIdx.x;
15 |   const int ty = threadIdx.y;
16 |   const int bx = blockIdx.x;
17 |   const int by = blockIdx.y;
18 | 
19 |   float *begin_a = a + bx * BLOCK * k;
20 |   float *begin_b = b + by * BLOCK;
21 |   float *end_a = begin_a + k;
22 | 
23 |   float sum = 0.f;
24 | 
25 |   for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a;
26 |        a_ptr += BLOCK, b_ptr += BLOCK * n) {
27 | 
28 |     __shared__ float ashare[BLOCK][BLOCK];
29 |     __shared__ float bshare[BLOCK][BLOCK];
30 | 
31 |     ashare[ty][tx] = a_ptr[ty * k + tx];
32 |     bshare[ty][tx] = b_ptr[ty * n + tx];
33 |     __syncthreads();
34 | 
35 | #pragma unroll
36 |     for (int kk = 0; kk < BLOCK; ++kk) {
37 |       sum += ashare[ty][kk] * bshare[kk][tx];
38 |     }
39 |     __syncthreads();
40 |   }
41 | 
42 |   c[(BLOCK * bx + ty) * n + BLOCK * by + tx] = sum;
43 | }
44 | 
45 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
46 |               float *d_B, int ldb, float *d_C, int ldc) {
47 | 
48 |   constexpr int BLOCK = 16;
49 |   dim3 block(BLOCK, BLOCK);
50 |   dim3 grid((m + BLOCK - 1) / BLOCK, (n + BLOCK - 1) / BLOCK);
51 | 
52 |   sgemm<BLOCK><<<grid, block>>>(m, n, k, d_A, d_B, d_C);
53 | }
54 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_4.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // a = mxk, b = kxn
10 | template <int BLOCK, int STRIDE>
11 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb,
12 |                       float *c, int ldc) {
13 |   // blockIdx control subpanel matrix
14 |   constexpr int STEP = BLOCK * STRIDE;
15 |   const int tx = threadIdx.x;
16 |   const int ty = threadIdx.y;
17 |   const int bx = blockIdx.x;
18 |   const int by = blockIdx.y;
19 | 
20 |   float *begin_a = a + by * STEP * k;
21 |   float *begin_b = b + bx * STEP;
22 |   float *end_a = begin_a + k;
23 | 
24 |   float sum[STRIDE][STRIDE] = {0.f};
25 |   for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a;
26 |        a_ptr += STEP, b_ptr += STEP * n) {
27 |     __shared__ float ashare[STEP][STEP];
28 |     __shared__ float bshare[STEP][STEP];
29 | 
30 |     for (int i = 0; i < STRIDE; ++i) {
31 |       for (int j = 0; j < STRIDE; ++j) {
32 |         ashare[ty * STRIDE + i][tx * STRIDE + j] =
33 |             a_ptr[(ty * STRIDE + i) * k + tx * STRIDE + j];
34 |         bshare[ty * STRIDE + i][tx * STRIDE + j] =
35 |             b_ptr[(ty * STRIDE + i) * n + tx * STRIDE + j];
36 |       }
37 |     }
38 |     __syncthreads();
39 | 
40 | #pragma unroll
41 |     for (int i = 0; i < STRIDE; ++i) {
42 |       for (int j = 0; j < STRIDE; ++j) {
43 |         for (int kk = 0; kk < STEP; ++kk) {
44 |           sum[i][j] +=
45 |               ashare[ty * STRIDE + i][kk] * bshare[kk][tx * STRIDE + j];
46 |         }
47 |       }
48 |     }
49 | 
50 |     __syncthreads();
51 |   }
52 | 
53 |   for (int i = 0; i < STRIDE; ++i) {
54 |     for (int j = 0; j < STRIDE; ++j) {
55 |       c[(STEP * by + ty * STRIDE + i) * n + STEP * bx + tx * STRIDE + j] =
56 |           sum[i][j];
57 |     }
58 |   }
59 | }
60 | 
61 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
62 |               float *d_B, int ldb, float *d_C, int ldc) {
63 | 
64 |   constexpr int BLOCK = 16;
65 |   constexpr int STRIDE = 2; // every thread calc STRIDExSTRIDE result
66 |   dim3 block(BLOCK, BLOCK);
67 |   dim3 grid((m + BLOCK - 1) / BLOCK / STRIDE, (n + BLOCK - 1) / BLOCK / STRIDE);
68 | 
69 |   sgemm<BLOCK, STRIDE><<<grid, block>>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc);
70 | }
71 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_5.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // MY_MMult = [
10 | // 1024 6467.51 7.247925e-05
11 | // 2048 6693.74 1.525879e-04
12 | // 3072 7096.70 2.288818e-04
13 | // 4096 6677.67 4.425049e-04
14 | // ];
15 | /**
16 |  * 和 version4 的区别：
17 |  * 1. 修改了分块尺寸
18 |  * 2. 每个 block 有 8x8 个线程，每个线程计算 4x4 个结果
19 |  */
20 | template <int BLOCK, int STRIDE>
21 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb,
22 |                       float *c, int ldc) {
23 |   // blockIdx control subpanel matrix
24 |   constexpr int STEP = BLOCK * STRIDE;
25 |   const int tx = threadIdx.x * STRIDE;
26 |   const int ty = threadIdx.y * STRIDE;
27 |   const int bx = blockIdx.x * STEP;
28 |   const int by = blockIdx.y * STEP;
29 | 
30 |   float *begin_a = a + by * k;
31 |   float *begin_b = b + bx;
32 |   float *end_a = begin_a + k;
33 | 
34 |   float sum[STRIDE][STRIDE] = {0.f};
35 |   for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a;
36 |        a_ptr += STEP, b_ptr += STEP * n) {
37 |     __shared__ __align__(16 * 1024) float ashare[STEP][STEP];
38 |     __shared__ __align__(16 * 1024) float bshare[STEP][STEP];
39 | 
40 |     for (int i = 0; i < STRIDE; ++i) {
41 |       for (int j = 0; j < STRIDE; ++j) {
42 |         ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j];
43 |         bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j];
44 |       }
45 |     }
46 |     __syncthreads();
47 | 
48 |     for (int i = 0; i < STRIDE; ++i) {
49 |       for (int j = 0; j < STRIDE; ++j) {
50 |         for (int kk = 0; kk < STEP; ++kk) {
51 |           sum[i][j] += ashare[ty + i][kk] * bshare[kk][tx + j];
52 |         }
53 |       }
54 |     }
55 | 
56 |     __syncthreads();
57 |   }
58 | 
59 | #pragma unroll
60 |   for (int i = 0; i < STRIDE; ++i) {
61 |     for (int j = 0; j < STRIDE; ++j) {
62 |       c[(by + ty + i) * n + bx + tx + j] = sum[i][j];
63 |     }
64 |   }
65 | }
66 | 
67 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
68 |               float *d_B, int ldb, float *d_C, int ldc) {
69 | 
70 |   constexpr int BLOCK = 8;
71 |   constexpr int STRIDE = 4; // every thread calc STRIDExSTRIDE result
72 |   dim3 block(BLOCK, BLOCK);
73 |   dim3 grid((m + BLOCK - 1) / BLOCK / STRIDE, (n + BLOCK - 1) / BLOCK / STRIDE);
74 | 
75 |   sgemm<BLOCK, STRIDE><<<grid, block>>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc);
76 | }
77 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_6.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // a = mxk, b = kxn
10 | template <int BLOCK, int STRIDE>
11 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb,
12 |                       float *c, int ldc) {
13 |   // blockIdx control subpanel matrix
14 |   constexpr int STEP = BLOCK * STRIDE;
15 |   const int tx = threadIdx.x * STRIDE;
16 |   const int ty = threadIdx.y * STRIDE;
17 |   const int bx = blockIdx.x * STEP;
18 |   const int by = blockIdx.y * STEP;
19 | 
20 |   float *begin_a = a + by * k;
21 |   float *begin_b = b + bx;
22 |   float *end_a = begin_a + k;
23 | 
24 |   float sum[STRIDE][STRIDE] = {0.f};
25 | 
26 |   __shared__ float ashare[STEP][2 * STEP];
27 |   __shared__ float bshare[2 * STEP][STEP];
28 |   // bigger split
29 |   for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a;
30 |        a_ptr += 2 * STEP, b_ptr += 2 * STEP * n) {
31 | 
32 |     for (int i = 0; i < STRIDE; ++i) {
33 |       for (int j = 0; j < STRIDE; ++j) {
34 |         ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j];
35 |         ashare[ty + i][tx + j + STEP] = a_ptr[(ty + i) * k + tx + j + STEP];
36 | 
37 |         bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j];
38 |         bshare[ty + i + STEP][tx + j] = b_ptr[(ty + i + STEP) * n + tx + j];
39 |       }
40 |     }
41 |     __syncthreads();
42 | 
43 |     for (int i = 0; i < STRIDE; ++i) {
44 |       for (int j = 0; j < STRIDE; ++j) {
45 |         for (int kk = 0; kk < 2 * STEP; ++kk) {
46 |           sum[i][j] += ashare[ty + i][kk] * bshare[kk][tx + j];
47 |         }
48 |       }
49 |     }
50 | 
51 |     __syncthreads();
52 |   }
53 | 
54 | #pragma unroll
55 |   for (int i = 0; i < STRIDE; ++i) {
56 |     for (int j = 0; j < STRIDE; ++j) {
57 |       c[(by + ty + i) * n + bx + tx + j] = sum[i][j];
58 |     }
59 |   }
60 | }
61 | 
62 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
63 |               float *d_B, int ldb, float *d_C, int ldc) {
64 | 
65 |   constexpr int BLOCK = 16;
66 |   constexpr int STRIDE = 2; // every thread calc STRIDExSTRIDE result
67 |   dim3 block(BLOCK, BLOCK);
68 |   dim3 grid((m + BLOCK - 1) / BLOCK / STRIDE, (n + BLOCK - 1) / BLOCK / STRIDE);
69 | 
70 |   sgemm<BLOCK, STRIDE><<<grid, block>>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc);
71 | }
72 | 


--------------------------------------------------------------------------------
/cuda/MMult_cuda_7.cu:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // CUDA runtime
 5 | #include "helper.h"
 6 | #include <cublas_v2.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | // a = mxk, b = kxn
10 | __global__ void sgemm(int m, int n, int k, float *a, int lda, float *b, int ldb,
11 |                       float *c, int ldc) {
12 |   const int tx = (threadIdx.x % 16) * 2;
13 |   const int ty = threadIdx.x / 16 * 2;
14 |   const int bx = blockIdx.x * 64;
15 |   const int by = blockIdx.y * 64;
16 | 
17 |   float *begin_a = a + by * k;
18 |   float *begin_b = b + bx;
19 |   float *end_a = begin_a + k;
20 | 
21 |   __shared__ float ashare[64][64];
22 |   __shared__ float bshare[64][64];
23 |   float sum0[2][2] = {0};
24 |   float sum1[2][2] = {0};
25 |   float sum2[2][2] = {0};
26 |   float sum3[2][2] = {0};
27 | 
28 |   // bigger split
29 |   for (float *a_ptr = begin_a, *b_ptr = begin_b; a_ptr < end_a;
30 |        a_ptr += 64, b_ptr += 64 * n) {
31 | 
32 | #pragma unroll
33 |     for (int i = 0; i < 2; ++i) {
34 |       for (int j = 0; j < 2; ++j) {
35 |         ashare[ty + i][tx + j] = a_ptr[(ty + i) * k + tx + j];
36 |         ashare[ty + i][tx + j + 32] = a_ptr[(ty + i) * k + tx + j + 32];
37 |         ashare[ty + i + 32][tx + j] = a_ptr[(ty + 32 + i) * k + tx + j];
38 |         ashare[ty + i + 32][tx + j + 32] =
39 |             a_ptr[(ty + 32 + i) * k + tx + j + 32];
40 | 
41 |         bshare[ty + i][tx + j] = b_ptr[(ty + i) * n + tx + j];
42 |         bshare[ty + i][tx + j + 32] = b_ptr[(ty + i) * n + tx + j + 32];
43 |         bshare[ty + i + 32][tx + j] = b_ptr[(ty + i + 32) * n + tx + j];
44 |         bshare[ty + i + 32][tx + j + 32] =
45 |             b_ptr[(ty + i + 32) * n + tx + j + 32];
46 |       }
47 |     }
48 |     __syncthreads();
49 | 
50 | #pragma unroll
51 |     for (int i = 0; i < 2; ++i) {
52 |       for (int j = 0; j < 2; ++j) {
53 |         for (int subk = 0; subk < 64; ++subk) {
54 |           sum0[i][j] += ashare[ty + i][subk] * bshare[subk][tx + j];
55 |           sum1[i][j] += ashare[ty + i][subk] * bshare[subk][tx + j + 32];
56 |           sum2[i][j] += ashare[ty + i + 32][subk] * bshare[subk][tx + j];
57 |           sum3[i][j] += ashare[ty + i + 32][subk] * bshare[subk][tx + j + 32];
58 |         }
59 |       }
60 |     }
61 |     __syncthreads();
62 |   }
63 | 
64 | #pragma unroll
65 |   for (int i = 0; i < 2; ++i) {
66 |     for (int j = 0; j < 2; ++j) {
67 |       c[(by + ty + i) * n + bx + tx + j] = sum0[i][j];
68 |       c[(by + ty + i) * n + bx + tx + 32 + j] = sum1[i][j];
69 |       c[(by + ty + i + 32) * n + bx + tx + j] = sum2[i][j];
70 |       c[(by + ty + i + 32) * n + bx + tx + 32 + j] = sum3[i][j];
71 |     }
72 |   }
73 | }
74 | 
75 | void MY_MMult(cublasHandle_t handle, int m, int n, int k, float *d_A, int lda,
76 |               float *d_B, int ldb, float *d_C, int ldc) {
77 | 
78 |   dim3 block(256);
79 |   dim3 grid(m / 64, n / 64);
80 | 
81 |   sgemm<<<grid, block>>>(m, n, k, d_A, lda, d_B, ldb, d_C, ldc);
82 | }
83 | 


--------------------------------------------------------------------------------
/cuda/PlotAll.m:
--------------------------------------------------------------------------------
 1 | %
 2 | % Clear all variables and close all graphs
 3 | %
 4 | 
 5 | clear all
 6 | close all
 7 | 
 8 | %
 9 | % Get max_gflops from /proc/cpuinfo by reading the parameters
10 | % set in file proc_parameters.m
11 | %
12 | 
13 | proc_parameters
14 | 
15 | max_gflops = nflops_per_cycle * nprocessors * GHz_of_processor;
16 | 
17 | %
18 | % Read in the first data set and plot it.
19 | %
20 | 
21 | output_old
22 | 
23 | version_old = version;
24 | 
25 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'bo-.;OLD;' );
26 | last = size( MY_MMult, 1 );
27 | 
28 | hold on
29 | 
30 | axis( [ 0 MY_MMult( last,1 ) 0 max_gflops ] );
31 | 
32 | xlabel( 'm = n = k' );
33 | ylabel( 'GFLOPS/sec.' );
34 | 
35 | %
36 | % Read in second data set and plot it.
37 | %
38 | 
39 | output_new
40 | 
41 | version_new = version
42 | 
43 | title_string = sprintf("OLD = %s, NEW = %s", version_old, version_new);
44 | 
45 | plot( MY_MMult( :,1 ), MY_MMult( :,2 ), 'r-*;NEW;' );
46 | 
47 | title( title_string );
48 | 
49 | filename = sprintf( "compare_%s_%s", version_old, version_new );
50 | 
51 | print( filename, '-dpng' );
52 | 


--------------------------------------------------------------------------------
/cuda/REF_MMult.cpp:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | #define A(i, j) a[(i)*lda + (j)]
 3 | #define B(i, j) b[(i)*ldb + (j)]
 4 | #define C(i, j) c[(i)*ldc + (j)]
 5 | 
 6 | #include <cblas.h>
 7 | /* Routine for computing C = A * B + C */
 8 | 
 9 | void REF_MMult(int m, int n, int k, float *a, int lda, float *b, int ldb,
10 |                float *c, int ldc) {
11 |   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, lda,
12 |               b, ldb, 0.0f, c, ldc);
13 | }
14 | 


--------------------------------------------------------------------------------
/cuda/compare_matrices.cpp:
--------------------------------------------------------------------------------
 1 | #define A(i, j) a[(i)*lda + (j)]
 2 | #define B(i, j) b[(i)*ldb + (j)]
 3 | #define abs(x) ((x) < 0.0 ? -(x) : (x))
 4 | 
 5 | #include <stdio.h>
 6 | 
 7 | float compare_matrices(int m, int n, float *a, int lda, float *b, int ldb) {
 8 |   //    printf("\n---result----\n");
 9 |   //    print_matrix(m, n, a, lda);
10 |   //    printf("\n-------\n");
11 |   //    print_matrix(m, n, b, ldb);
12 |   //    printf("\n-------\n");
13 |   int i, j;
14 |   float max_diff = 0.0, diff;
15 |   int printed = 0;
16 | 
17 |   for (i = 0; i < m; i++) {
18 |     for (j = 0; j < n; j++) {
19 |       diff = abs(A(i, j) - B(i, j));
20 |       max_diff = (diff > max_diff ? diff : max_diff);
21 |       if (0 == printed)
22 |         if (max_diff > 0.5f || max_diff < -0.5f) {
23 |           printf("\n error: i %d  j %d diff %f  got %f  expect %f ", i, j, max_diff, A(i, j), B(i, j));
24 |           printed = 1;
25 |         }
26 |     }
27 |   }
28 | 
29 |   return max_diff;
30 | }
31 | 


--------------------------------------------------------------------------------
/cuda/copy_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #define A(i, j) a[(i)*lda + (j)]
 2 | #define B(i, j) b[(i)*ldb + (j)]
 3 | 
 4 | void copy_matrix(int m, int n, float *a, int lda, float *b, int ldb) {
 5 |   int i, j;
 6 | 
 7 |   for (j = 0; j < n; j++)
 8 |     for (i = 0; i < m; i++)
 9 |       B(i, j) = A(i, j);
10 | }
11 | 


--------------------------------------------------------------------------------
/cuda/dclock.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <time.h>
 3 | 
 4 | static double gtod_ref_time_sec = 0.0;
 5 | 
 6 | /* Adapted from the bl2_clock() routine in the BLIS library */
 7 | 
 8 | double dclock() {
 9 |   double the_time, norm_sec;
10 |   struct timeval tv;
11 | 
12 |   gettimeofday(&tv, NULL);
13 | 
14 |   if (gtod_ref_time_sec == 0.0)
15 |     gtod_ref_time_sec = (double)tv.tv_sec;
16 | 
17 |   norm_sec = (double)tv.tv_sec - gtod_ref_time_sec;
18 | 
19 |   the_time = norm_sec + tv.tv_usec * 1.0e-6;
20 | 
21 |   return the_time;
22 | }
23 | 


--------------------------------------------------------------------------------
/cuda/helper.h:
--------------------------------------------------------------------------------
 1 | #pragma _HELPER_H_
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cublas_v2.h>
 5 | #include <stdio.h>
 6 | 
 7 | template <typename T>
 8 | void check(T result, char const *const func, const char *const file,
 9 |            int const line) {
10 |   if (result) {
11 |     fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
12 |             static_cast<unsigned int>(result), func);
13 |     exit(EXIT_FAILURE);
14 |   }
15 | }
16 | 
17 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
18 | 


--------------------------------------------------------------------------------
/cuda/makefile:
--------------------------------------------------------------------------------
 1 | OLD  := MMult_cuBLAS_1
 2 | #NEW  := MMult_cuBLAS_1
 3 | NEW := MMult_cuda_3
 4 | SMS ?= 70 75 80 86
 5 | 
 6 | #
 7 | # sample makefile
 8 | #
 9 | 
10 | CC         := nvcc 
11 | LINKER     := $(CC)
12 | #CFLAGS     := -O0 -g -Wall
13 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
14 | # CFLAGS     := -std=c++17 -O0 -g -G
15 | CFLAGS     := -std=c++17 -O2
16 | LDFLAGS    := -lm  -lcublas  -lopenblas
17 | 
18 | UTIL       := copy_matrix.o \
19 |               compare_matrices.o \
20 |               random_matrix.o \
21 |               dclock.o \
22 |               REF_MMult.o \
23 |               print_matrix.o
24 | 
25 | TEST_OBJS  := test_MMult.o $(NEW).o 
26 | 
27 | %.o: %.cpp
28 | 	$(CC) $(CFLAGS) $(GENCODE_FLAGS)  -c $< -o $@
29 | 
30 | %.o: %.cu
31 | 	$(CC) $(CFLAGS) $(GENCODE_FLAGS)  -c $< -o $@
32 | 
33 | all: 
34 | 	make clean;
35 | 	make test_MMult.x
36 | 
37 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h
38 | 	$(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \
39 |         $(BLAS_LIB) -o $(TEST_BIN) $@ 
40 | 
41 | run:	
42 | 	make all
43 | 	echo "version = '$(NEW)';" > output_$(NEW).m
44 | 	./test_MMult.x >> output_$(NEW).m
45 | 	cp output_$(OLD).m output_old.m
46 | 	cp output_$(NEW).m output_new.m
47 | 
48 | clean:
49 | 	rm -f *.o *~ core *.x
50 | 
51 | cleanall:
52 | 	rm -f *.o *~ core *.x output*.m *.eps *.png
53 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuBLAS_1.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuBLAS_1';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 10637.93 3.242493e-05 
 6 | 1152 16397.92 3.480911e-05 
 7 | 1280 16559.61 5.626678e-05 
 8 | 1408 13734.41 5.340576e-05 
 9 | 1536 14581.54 5.340576e-05 
10 | 1664 14285.89 4.577637e-05 
11 | 1792 13704.07 5.340576e-05 
12 | 1920 13878.71 5.912781e-05 
13 | 2048 16339.17 1.564026e-04 
14 | 2176 12957.95 1.716614e-04 
15 | 2304 16535.38 1.716614e-04 
16 | 2432 12519.21 8.392334e-05 
17 | 2560 16971.11 1.945496e-04 
18 | 2688 18144.32 1.907349e-04 
19 | 2816 12950.36 2.193451e-04 
20 | 2944 16634.41 2.593994e-04 
21 | 3072 17836.77 2.441406e-04 
22 | 3200 12842.99 2.746582e-04 
23 | 3328 16601.17 3.280640e-04 
24 | 3456 16300.24 3.280640e-04 
25 | 3584 12411.39 1.029968e-04 
26 | 3712 17320.47 3.280640e-04 
27 | 3840 14158.14 3.738403e-04 
28 | 3968 13989.40 3.738403e-04 
29 | 4096 14217.98 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuBLAS_2.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuBLAS_2';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 10768.11 3.242493e-05 
 6 | 1152 16446.89 3.480911e-05 
 7 | 1280 16571.86 5.626678e-05 
 8 | 1408 13898.33 5.340576e-05 
 9 | 1536 14489.48 5.340576e-05 
10 | 1664 14472.10 4.577637e-05 
11 | 1792 13926.61 5.340576e-05 
12 | 1920 13842.92 5.912781e-05 
13 | 2048 16421.74 1.564026e-04 
14 | 2176 13009.90 1.716614e-04 
15 | 2304 16599.08 1.716614e-04 
16 | 2432 12421.43 8.392334e-05 
17 | 2560 16870.40 1.945496e-04 
18 | 2688 17895.05 1.907349e-04 
19 | 2816 12906.06 2.193451e-04 
20 | 2944 16560.19 2.593994e-04 
21 | 3072 17843.88 2.441406e-04 
22 | 3200 12758.92 2.746582e-04 
23 | 3328 16618.32 3.280640e-04 
24 | 3456 16333.63 3.280640e-04 
25 | 3584 12356.31 1.029968e-04 
26 | 3712 17338.03 3.280640e-04 
27 | 3840 14111.53 3.738403e-04 
28 | 3968 13971.58 3.738403e-04 
29 | 4096 14184.42 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_10.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_10';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 10860.91 7.247925e-05 
 6 | 1152 9017.44 8.392334e-05 
 7 | 1280 11176.41 1.068115e-04 
 8 | 1408 13409.62 1.258850e-04 
 9 | 1536 10387.59 1.182556e-04 
10 | 1664 12207.25 1.258850e-04 
11 | 1792 14192.14 1.373291e-04 
12 | 1920 12808.66 1.907349e-04 
13 | 2048 15352.21 1.564026e-04 
14 | 2176 13981.38 1.716614e-04 
15 | 2304 15379.50 1.716614e-04 
16 | 2432 14650.03 1.831055e-04 
17 | 2560 15833.78 1.945496e-04 
18 | 2688 15134.27 1.907349e-04 
19 | 2816 14683.54 2.193451e-04 
20 | 2944 15758.60 2.593994e-04 
21 | 3072 15407.97 2.441406e-04 
22 | 3200 15168.65 2.746582e-04 
23 | 3328 16030.18 3.280640e-04 
24 | 3456 15862.35 3.280640e-04 
25 | 3584 15620.07 2.899170e-04 
26 | 3712 15600.41 3.280640e-04 
27 | 3840 15479.32 3.738403e-04 
28 | 3968 15421.87 3.738403e-04 
29 | 4096 15388.22 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_11.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_11';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 12636.12 7.247925e-05 
 6 | 1152 10658.86 8.392334e-05 
 7 | 1280 13162.70 1.068115e-04 
 8 | 1408 15935.04 1.258850e-04 
 9 | 1536 11895.09 1.182556e-04 
10 | 1664 14023.52 1.258850e-04 
11 | 1792 16268.22 1.373291e-04 
12 | 1920 14974.50 1.907349e-04 
13 | 2048 16748.66 1.564026e-04 
14 | 2176 15306.64 1.716614e-04 
15 | 2304 17145.08 1.716614e-04 
16 | 2432 16403.14 1.831055e-04 
17 | 2560 17659.38 1.945496e-04 
18 | 2688 16749.79 1.907349e-04 
19 | 2816 16429.85 2.193451e-04 
20 | 2944 17211.17 2.593994e-04 
21 | 3072 16926.06 2.441406e-04 
22 | 3200 16721.53 2.746582e-04 
23 | 3328 17413.48 3.280640e-04 
24 | 3456 17003.49 3.280640e-04 
25 | 3584 17123.20 2.899170e-04 
26 | 3712 16784.96 3.280640e-04 
27 | 3840 16988.37 3.738403e-04 
28 | 3968 16687.82 3.738403e-04 
29 | 4096 16831.67 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_12.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_12';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 16257.53 7.247925e-05 
 6 | 1152 20440.02 8.392334e-05 
 7 | 1280 14655.91 1.068115e-04 
 8 | 1408 17732.82 1.258850e-04 
 9 | 1536 20937.86 1.182556e-04 
10 | 1664 16541.26 1.258850e-04 
11 | 1792 19023.14 1.373291e-04 
12 | 1920 21486.99 1.907349e-04 
13 | 2048 18795.48 1.564026e-04 
14 | 2176 20553.09 1.716614e-04 
15 | 2304 22417.96 1.716614e-04 
16 | 2432 20389.30 1.831055e-04 
17 | 2560 21924.48 1.945496e-04 
18 | 2688 20747.71 1.907349e-04 
19 | 2816 22255.11 2.193451e-04 
20 | 2944 20980.56 2.593994e-04 
21 | 3072 20402.22 2.441406e-04 
22 | 3200 21205.12 2.746582e-04 
23 | 3328 20986.45 3.280640e-04 
24 | 3456 21859.21 3.280640e-04 
25 | 3584 21398.98 2.899170e-04 
26 | 3712 20981.10 3.280640e-04 
27 | 3840 21836.18 3.738403e-04 
28 | 3968 21428.58 3.738403e-04 
29 | 4096 21410.87 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_2.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_2';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 449.47 7.247925e-05 
 6 | 1152 484.94 8.392334e-05 
 7 | 1280 473.72 1.068115e-04 
 8 | 1408 483.25 1.258850e-04 
 9 | 1536 477.37 1.182556e-04 
10 | 1664 480.06 1.258850e-04 
11 | 1792 482.28 1.373291e-04 
12 | 1920 474.67 1.907349e-04 
13 | 2048 483.35 1.564026e-04 
14 | 2176 474.56 1.716614e-04 
15 | 2304 477.77 1.716614e-04 
16 | 2432 476.35 1.831055e-04 
17 | 2560 465.45 1.945496e-04 
18 | 2688 474.31 1.907349e-04 
19 | 2816 481.86 2.193451e-04 
20 | 2944 482.28 2.593994e-04 
21 | 3072 476.64 2.441406e-04 
22 | 3200 476.94 2.746582e-04 
23 | 3328 485.21 3.280640e-04 
24 | 3456 484.32 3.280640e-04 
25 | 3584 484.52 2.899170e-04 
26 | 3712 482.32 3.280640e-04 
27 | 3840 471.74 3.738403e-04 
28 | 3968 483.43 3.738403e-04 
29 | 4096 481.02 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_3.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_3';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 2348.83 7.247925e-05 
 6 | 1152 2368.86 8.392334e-05 
 7 | 1280 2696.05 1.068115e-04 
 8 | 1408 2685.68 1.258850e-04 
 9 | 1536 2663.44 1.182556e-04 
10 | 1664 2664.02 1.258850e-04 
11 | 1792 2667.68 1.373291e-04 
12 | 1920 2674.12 1.907349e-04 
13 | 2048 2673.12 1.564026e-04 
14 | 2176 2669.83 1.716614e-04 
15 | 2304 2659.31 1.716614e-04 
16 | 2432 2653.01 1.831055e-04 
17 | 2560 2649.64 1.945496e-04 
18 | 2688 2643.43 1.907349e-04 
19 | 2816 2637.53 2.193451e-04 
20 | 2944 2631.19 2.593994e-04 
21 | 3072 2628.32 2.441406e-04 
22 | 3200 2626.45 2.746582e-04 
23 | 3328 2594.80 3.280640e-04 
24 | 3456 2588.46 3.280640e-04 
25 | 3584 2588.16 2.899170e-04 
26 | 3712 2590.75 3.280640e-04 
27 | 3840 2601.79 3.738403e-04 
28 | 3968 2600.13 3.738403e-04 
29 | 4096 2604.26 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_4.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_4';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 6109.22 7.247925e-05 
 6 | 1152 6208.11 8.392334e-05 
 7 | 1280 6391.93 1.068115e-04 
 8 | 1408 6413.10 1.258850e-04 
 9 | 1536 6490.45 1.182556e-04 
10 | 1664 6592.70 1.258850e-04 
11 | 1792 6491.07 1.373291e-04 
12 | 1920 6541.92 1.907349e-04 
13 | 2048 6522.68 1.564026e-04 
14 | 2176 6503.46 1.716614e-04 
15 | 2304 6550.76 1.716614e-04 
16 | 2432 6531.24 1.831055e-04 
17 | 2560 6503.59 1.945496e-04 
18 | 2688 6515.36 1.907349e-04 
19 | 2816 6509.47 2.193451e-04 
20 | 2944 6453.46 2.593994e-04 
21 | 3072 6495.38 2.441406e-04 
22 | 3200 6483.08 2.746582e-04 
23 | 3328 6460.18 3.280640e-04 
24 | 3456 6496.13 3.280640e-04 
25 | 3584 6501.23 2.899170e-04 
26 | 3712 6524.08 3.280640e-04 
27 | 3840 6499.54 3.738403e-04 
28 | 3968 6487.26 3.738403e-04 
29 | 4096 6505.56 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_5.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_5';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 6476.59 7.247925e-05 
 6 | 1152 6727.65 8.392334e-05 
 7 | 1280 6989.59 1.068115e-04 
 8 | 1408 6370.10 1.258850e-04 
 9 | 1536 6755.63 1.182556e-04 
10 | 1664 7304.07 1.258850e-04 
11 | 1792 7107.59 1.373291e-04 
12 | 1920 7109.28 1.907349e-04 
13 | 2048 6943.20 1.564026e-04 
14 | 2176 7306.77 1.716614e-04 
15 | 2304 7060.76 1.716614e-04 
16 | 2432 7053.52 1.831055e-04 
17 | 2560 7173.36 1.945496e-04 
18 | 2688 7071.53 1.907349e-04 
19 | 2816 6955.60 2.193451e-04 
20 | 2944 6923.04 2.593994e-04 
21 | 3072 6969.85 2.441406e-04 
22 | 3200 6918.04 2.746582e-04 
23 | 3328 6844.63 3.280640e-04 
24 | 3456 6825.10 3.280640e-04 
25 | 3584 6766.01 2.899170e-04 
26 | 3712 6780.47 3.280640e-04 
27 | 3840 6814.50 3.738403e-04 
28 | 3968 6746.66 3.738403e-04 
29 | 4096 6534.27 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_6.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_6';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 3165.02 7.247925e-05 
 6 | 1152 3219.39 8.392334e-05 
 7 | 1280 3311.29 1.068115e-04 
 8 | 1408 3318.39 1.258850e-04 
 9 | 1536 3735.15 1.182556e-04 
10 | 1664 3694.61 1.258850e-04 
11 | 1792 3671.32 1.373291e-04 
12 | 1920 3711.67 1.907349e-04 
13 | 2048 3668.11 1.564026e-04 
14 | 2176 3714.72 1.716614e-04 
15 | 2304 3699.94 1.716614e-04 
16 | 2432 3719.28 1.831055e-04 
17 | 2560 3667.40 1.945496e-04 
18 | 2688 3695.71 1.907349e-04 
19 | 2816 3678.39 2.193451e-04 
20 | 2944 3683.63 2.593994e-04 
21 | 3072 3671.62 2.441406e-04 
22 | 3200 3665.02 2.746582e-04 
23 | 3328 3658.52 3.280640e-04 
24 | 3456 3669.57 3.280640e-04 
25 | 3584 3664.37 2.899170e-04 
26 | 3712 3668.71 3.280640e-04 
27 | 3840 3674.00 3.738403e-04 
28 | 3968 3677.78 3.738403e-04 
29 | 4096 3670.64 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_7.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_7';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 4799.10 7.247925e-05 
 6 | 1152 6046.16 8.392334e-05 
 7 | 1280 6020.74 1.068115e-04 
 8 | 1408 6103.93 1.258850e-04 
 9 | 1536 5485.28 1.182556e-04 
10 | 1664 6441.97 1.258850e-04 
11 | 1792 6729.21 1.373291e-04 
12 | 1920 6952.87 1.907349e-04 
13 | 2048 6714.18 1.564026e-04 
14 | 2176 6628.49 1.716614e-04 
15 | 2304 6876.40 1.716614e-04 
16 | 2432 6800.96 1.831055e-04 
17 | 2560 6763.13 1.945496e-04 
18 | 2688 6786.04 1.907349e-04 
19 | 2816 6795.07 2.193451e-04 
20 | 2944 6847.96 2.593994e-04 
21 | 3072 6692.63 2.441406e-04 
22 | 3200 6743.39 2.746582e-04 
23 | 3328 6839.06 3.280640e-04 
24 | 3456 6744.52 3.280640e-04 
25 | 3584 6679.47 2.899170e-04 
26 | 3712 6643.94 3.280640e-04 
27 | 3840 6731.34 3.738403e-04 
28 | 3968 6725.52 3.738403e-04 
29 | 4096 6730.59 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_8.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_8';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 4820.90 7.247925e-05 
 6 | 1152 4996.40 8.392334e-05 
 7 | 1280 5156.29 1.068115e-04 
 8 | 1408 4870.39 1.258850e-04 
 9 | 1536 5387.88 1.182556e-04 
10 | 1664 5544.25 1.258850e-04 
11 | 1792 5585.79 1.373291e-04 
12 | 1920 5577.91 1.907349e-04 
13 | 2048 5486.57 1.564026e-04 
14 | 2176 5569.79 1.716614e-04 
15 | 2304 5461.78 1.716614e-04 
16 | 2432 5444.76 1.831055e-04 
17 | 2560 5508.09 1.945496e-04 
18 | 2688 5434.42 1.907349e-04 
19 | 2816 5395.18 2.193451e-04 
20 | 2944 5418.13 2.593994e-04 
21 | 3072 5443.66 2.441406e-04 
22 | 3200 5413.37 2.746582e-04 
23 | 3328 5363.66 3.280640e-04 
24 | 3456 5364.71 3.280640e-04 
25 | 3584 5336.84 2.899170e-04 
26 | 3712 5355.17 3.280640e-04 
27 | 3840 5362.73 3.738403e-04 
28 | 3968 5306.01 3.738403e-04 
29 | 4096 5220.77 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_MMult_cuda_9.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_9';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 10787.91 7.247925e-05 
 6 | 1152 9263.87 8.392334e-05 
 7 | 1280 11465.92 1.068115e-04 
 8 | 1408 13740.90 1.258850e-04 
 9 | 1536 10607.32 1.182556e-04 
10 | 1664 12420.91 1.258850e-04 
11 | 1792 14453.16 1.373291e-04 
12 | 1920 13160.41 1.907349e-04 
13 | 2048 14995.85 1.564026e-04 
14 | 2176 13423.66 1.716614e-04 
15 | 2304 15407.11 1.716614e-04 
16 | 2432 14797.16 1.831055e-04 
17 | 2560 15988.39 1.945496e-04 
18 | 2688 15164.42 1.907349e-04 
19 | 2816 14900.56 2.193451e-04 
20 | 2944 15936.94 2.593994e-04 
21 | 3072 15665.25 2.441406e-04 
22 | 3200 15319.37 2.746582e-04 
23 | 3328 16216.18 3.280640e-04 
24 | 3456 15997.54 3.280640e-04 
25 | 3584 15892.93 2.899170e-04 
26 | 3712 15742.18 3.280640e-04 
27 | 3840 15672.13 3.738403e-04 
28 | 3968 15652.81 3.738403e-04 
29 | 4096 15611.22 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_new.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuda_7';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3090" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 4799.10 7.247925e-05 
 6 | 1152 6046.16 8.392334e-05 
 7 | 1280 6020.74 1.068115e-04 
 8 | 1408 6103.93 1.258850e-04 
 9 | 1536 5485.28 1.182556e-04 
10 | 1664 6441.97 1.258850e-04 
11 | 1792 6729.21 1.373291e-04 
12 | 1920 6952.87 1.907349e-04 
13 | 2048 6714.18 1.564026e-04 
14 | 2176 6628.49 1.716614e-04 
15 | 2304 6876.40 1.716614e-04 
16 | 2432 6800.96 1.831055e-04 
17 | 2560 6763.13 1.945496e-04 
18 | 2688 6786.04 1.907349e-04 
19 | 2816 6795.07 2.193451e-04 
20 | 2944 6847.96 2.593994e-04 
21 | 3072 6692.63 2.441406e-04 
22 | 3200 6743.39 2.746582e-04 
23 | 3328 6839.06 3.280640e-04 
24 | 3456 6744.52 3.280640e-04 
25 | 3584 6679.47 2.899170e-04 
26 | 3712 6643.94 3.280640e-04 
27 | 3840 6731.34 3.738403e-04 
28 | 3968 6725.52 3.738403e-04 
29 | 4096 6730.59 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/output_old.m:
--------------------------------------------------------------------------------
 1 | version = 'MMult_cuBLAS_1';
 2 | GPU Device 0: "NVIDIA GeForce RTX 3080" with compute capability 8.6
 3 | 
 4 | MY_MMult = [
 5 | 1024 10637.93 3.242493e-05 
 6 | 1152 16397.92 3.480911e-05 
 7 | 1280 16559.61 5.626678e-05 
 8 | 1408 13734.41 5.340576e-05 
 9 | 1536 14581.54 5.340576e-05 
10 | 1664 14285.89 4.577637e-05 
11 | 1792 13704.07 5.340576e-05 
12 | 1920 13878.71 5.912781e-05 
13 | 2048 16339.17 1.564026e-04 
14 | 2176 12957.95 1.716614e-04 
15 | 2304 16535.38 1.716614e-04 
16 | 2432 12519.21 8.392334e-05 
17 | 2560 16971.11 1.945496e-04 
18 | 2688 18144.32 1.907349e-04 
19 | 2816 12950.36 2.193451e-04 
20 | 2944 16634.41 2.593994e-04 
21 | 3072 17836.77 2.441406e-04 
22 | 3200 12842.99 2.746582e-04 
23 | 3328 16601.17 3.280640e-04 
24 | 3456 16300.24 3.280640e-04 
25 | 3584 12411.39 1.029968e-04 
26 | 3712 17320.47 3.280640e-04 
27 | 3840 14158.14 3.738403e-04 
28 | 3968 13989.40 3.738403e-04 
29 | 4096 14217.98 3.509521e-04 
30 | ];
31 | 


--------------------------------------------------------------------------------
/cuda/parameters.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )"
 3 | The below parameters set this range of values that p takes on 
 4 | */   
 5 | #define PFIRST 1024
 6 | #define PLAST  4096
 7 | #define PINC   128
 8 | 
 9 | /* 
10 | In the test driver, the m, n, and k dimensions are set to the below 
11 | values.  If the value equals "-1" then that dimension is bound to the
12 | index p, given above.
13 | */
14 | 
15 | #define M -1 
16 | #define N -1
17 | #define K -1
18 | 
19 | /* 
20 | In the test driver, each experiment is repeated NREPEATS times and
21 | the best time from these repeats is used to compute the performance
22 | */
23 | 
24 | #define NREPEATS 20
25 | 
26 | /* 
27 | Matrices A, B, and C are stored in two dimensional arrays with
28 | row dimensions that are greater than or equal to the row dimension
29 | of the matrix.  This row dimension of the array is known as the 
30 | "leading dimension" and determines the stride (the number of 
31 | single precision numbers) when one goes from one element in a row
32 | to the next.  Having this number larger than the row dimension of
33 | the matrix tends to adversely affect performance.  LDX equals the
34 | leading dimension of the array that stores matrix X.  If LDX=-1 
35 | then the leading dimension is set to the row dimension of matrix X.
36 | */
37 | 


--------------------------------------------------------------------------------
/cuda/plot.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | def readFile(filename):
 6 |     f = open(filename)
 7 |     sizes = []
 8 |     times = []
 9 |     title = ''
10 |     try:
11 |         title = f.readline()
12 |         # skip 3 line
13 |         f.readline()
14 |         f.readline()
15 |         f.readline()
16 |         while True:
17 |             line = f.readline()
18 |             if line:
19 |                 slices = line.split(" ")
20 |                 if len(slices) <= 2:
21 |                     break;
22 |                 size = int(slices[0])
23 |                 time = float(slices[1])
24 |                 sizes.append(size)
25 |                 times.append(time)
26 |     finally:
27 |         f.close()
28 |     return title, sizes, times
29 | 
if __name__ == '__main__':
    # Script mode: every command-line argument names one result file, and
    # each file becomes one labelled curve on a shared figure.
    plt.xlabel('shape')
    plt.ylabel('gflops')
    for path in sys.argv[1:]:
        label, xs, ys = readFile(path)
        plt.plot(xs, ys, label=label)
    plt.legend()
    plt.show()
41 | 
42 | 


--------------------------------------------------------------------------------
/cuda/print_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define A(i, j) a[(i)*lda + (j)]
 4 | 
 5 | void print_matrix(int m, int n, float *a, int lda) {
 6 |   int i, j;
 7 | 
 8 |   for (i = 0; i < m; i++) {
 9 |     for (j = 0; j < n; j++) {
10 |       printf("%.1f\t", A(i, j));
11 |     }
12 |     printf("\n");
13 |   }
14 |   printf("\n");
15 | }
16 | 


--------------------------------------------------------------------------------
/cuda/proc_parameters.m:
--------------------------------------------------------------------------------
 1 | % Indicate the number of floating point operations that can be executed
 2 | % per clock cycle
 3 | %
 4 | 
 5 | nflops_per_cycle = 4;
 6 | 
 7 | %
 8 | % Indicate the number of processors being used (in case you are using a
 9 | % multicore or SMP)
10 | %
11 | 
12 | nprocessors = 1;
13 | 
14 | %
15 | % Indicate the clock speed of the processor.  On a Linux machine this info
16 | % can be found in the file /proc/cpuinfo
17 | %
18 | % Note: some processors have a "turbo boost" mode, which increases
19 | % the peak clock rate...
20 | %
21 | 
22 | GHz_of_processor = 2.6;
23 | 


--------------------------------------------------------------------------------
/cuda/random_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | #define A(i, j) a[(j)*lda + (i)]
 4 | 
 5 | double drand48();
 6 | void random_matrix(int m, int n, float *a, int lda) {
 7 |   int i, j;
 8 | 
 9 |   for (i = 0; i < m; i++)
10 |     for (j = 0; j < n; j++)
11 | #if 1
12 |       A(i, j) = 2.0 * (float)drand48() - 1.0;
13 | #else
14 |       A(i, j) = (j - i) % 3;
15 | #endif
16 | }
17 | 


--------------------------------------------------------------------------------
/images/aarch64-fp32-peak-vs-int8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/images/aarch64-fp32-peak-vs-int8.png


--------------------------------------------------------------------------------
/images/cublas-vs-MMult_cuda_12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpoisonooo/how-to-optimize-gemm/ad9c7a3a1b50dbc08b410b19ac2b6fb0b9e38105/images/cublas-vs-MMult_cuda_12.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | matplotlib
3 | 


--------------------------------------------------------------------------------
/vulkan/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | run.sh
3 | tmp_kp_shader.comp
4 | tmp_kp_shader.comp.spv
5 | test1
6 | test2
7 | 


--------------------------------------------------------------------------------
/vulkan/MMult_vk_2.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_EXT_control_flow_attributes : enable
 3 | 
 4 | layout (local_size_x = 16, local_size_y = 16) in;
 5 | 
 6 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
 7 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
 8 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
 9 | 
10 | layout (constant_id = 0) const float tensor_size_f = 0;
11 | 
12 | shared float sub_tensor_1[16][16];
13 | shared float sub_tensor_2[16][16];
14 | 
// Tiled matrix multiply: every invocation accumulates one element of
// out_tensor while the workgroup stages 16x16 tiles of both inputs in the
// shared arrays sub_tensor_1 / sub_tensor_2.
// All three buffers are indexed as tensor_size x tensor_size arrays with a
// row stride of tensor_size, so only square problems are addressed correctly.
void main() {
    // Tile edge; equals local_size_x / local_size_y and the shared array extents.
    uint block = 16;
    // The problem size arrives as a float specialization constant.
    uint tensor_size = uint(tensor_size_f);
    // Number of tiles along the reduction axis. Integer division: a remainder
    // (tensor_size not a multiple of 16) is silently dropped.
    uint loop = tensor_size / block;

    uvec3 threadID = gl_LocalInvocationID;

    // NOTE(review): the names read inverted relative to their use below —
    // globalCol is multiplied by tensor_size (so it selects a row), while
    // globalRow is the offset inside a row (so it selects a column).
    uint globalCol = gl_WorkGroupID.y * block +threadID.y;
    uint globalRow = gl_WorkGroupID.x * block + threadID.x;

    float acc = 0.0;
    // [[unroll]] is a hint enabled by GL_EXT_control_flow_attributes.
    [[unroll]] for (uint i = 0u; i < loop; ++i) {
        // Each invocation loads one element of tile i from each input.
        sub_tensor_1[threadID.y][threadID.x] = in_tensor_1[tensor_size * globalCol + i * block + threadID.x];
        sub_tensor_2[threadID.y][threadID.x] = in_tensor_2[tensor_size * (i * block + threadID.y) + globalRow];

        // Wait until every invocation in the workgroup has stored its tile
        // elements before any of them starts reading the tiles.
        // memoryBarrierShared();
        barrier();

#if 1
        // Active variant: scalar dot product over the current pair of tiles.
        for (uint k = 0u; k < block; ++k) {
            acc += sub_tensor_1[threadID.y][k] * sub_tensor_2[k][threadID.x];
        }
#else
        // Disabled variant: same sum, gathered four elements at a time into
        // vec4 operands and reduced with dot().
        for (uint k = 0u; k < block; k+=4) {
            vec4 a;
            a.r = sub_tensor_1[threadID.y][k];
            a.g = sub_tensor_1[threadID.y][k+1];
            a.b = sub_tensor_1[threadID.y][k+2];
            a.a = sub_tensor_1[threadID.y][k+3];

            vec4 b;
            b.r = sub_tensor_2[k][threadID.x];
            b.g = sub_tensor_2[k+1][threadID.x];
            b.b = sub_tensor_2[k+2][threadID.x];
            b.a = sub_tensor_2[k+3][threadID.x];
            
            acc += dot(a, b);
        }
#endif
        // Wait until every invocation has finished reading the tiles before
        // the next iteration overwrites them.
        barrier();
    }

    // Output overwrites the destination element (no accumulation into it).
    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
}
59 | 


--------------------------------------------------------------------------------
/vulkan/MMult_vk_2.cpp:
--------------------------------------------------------------------------------
 1 | #define SPDLOG_ACTIVE_LEVEL 5
 2 | #include "Shader.hpp"
 3 | #include "kompute/Kompute.hpp"
 4 | #include <cassert>
 5 | #include <chrono>
 6 | #include <iostream>
 7 | 
 8 | // MY_MMult = [
 9 | // 64 36.24 0.000000e+00 
10 | // 128 48.75 0.000000e+00 
11 | // 192 51.88 0.000000e+00 
12 | // 256 52.91 0.000000e+00 
13 | // 320 53.30 0.000000e+00 
14 | // 384 53.51 0.000000e+00 
15 | // 448 53.64 0.000000e+00 
16 | // 512 53.72 0.000000e+00 
17 | // ];
18 | float kompute(const std::string &comp, uint32_t m, uint32_t k,
19 |               uint32_t n, float *a, float *b, float *c) {
20 |   constexpr uint32_t local_size = 16;
21 |   kp::Manager mgr;
22 | 
23 |   // Create and initialise Kompute Tensors through manager
24 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
25 |   auto tensorInA = mgr.tensor(a, m * k, sizeof(float), dtype);
26 |   auto tensorInB = mgr.tensor(b, k * n, sizeof(float), dtype);
27 |   auto tensorInC = mgr.tensor(c, m * n, sizeof(float), dtype);
28 | 
29 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB,
30 |                                                      tensorInC};
31 | 
32 |   // Create algorithm based on shader (supports buffers & push/spec constants)
33 |   kp::Workgroup workgroup({m / local_size, n / local_size, 1});
34 | 
35 |   auto algorithm =
36 |       mgr.algorithm(params, compileFile(comp), workgroup, {k * 1.f});
37 | 
38 |     mgr.sequence()->record<kp::OpTensorSyncDevice>(params)->eval();
39 |     // use weired vk timestamps
40 |     auto seq = mgr.sequence(0, 1);
41 |     seq->record<kp::OpAlgoDispatch>(algorithm)->eval();
42 | 
43 |     mgr.sequence()->record<kp::OpTensorSyncLocal>(params)->eval();
44 | 
45 |     auto timestamps = seq->getTimestamps();
46 |     auto computecost = timestamps[1] - timestamps[0];
47 |      memcpy(c, tensorInC->data<float>(), m * n * sizeof(float));
48 |     return computecost/1e6f;
49 | }
50 | 
51 | float MY_MMult(int m, int n, int k, float *a, float *b, float *c) {
52 |   return kompute("MMult_vk_2.comp", static_cast<uint32_t>(m),
53 |                  static_cast<uint32_t>(n), static_cast<uint32_t>(k), a, b, c);
54 | }
55 | 


--------------------------------------------------------------------------------
/vulkan/MMult_vk_3.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (local_size_x = 16, local_size_y = 16) in;
 4 | 
 5 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { vec4 in_tensor_a[]; };
 6 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_b[]; };
 7 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
 8 | 
 9 | layout (constant_id = 0) const float tensor_size_f = 0;
10 | 
// Matrix multiply without shared memory: every invocation produces one
// element of out_tensor.  The first input is read as vec4 (four consecutive
// values of one row at a time); the matching four values of the second input
// are gathered from four consecutive rows and combined with dot().
// All indexing uses the single size taken from the specialization constant.
void main()
{
    uint dim = uint(tensor_size_f);
    // Row stride of in_tensor_a counted in vec4 elements; also the trip count
    // of the reduction loop (integer division drops any remainder).
    uint vecsPerRow = dim / 4;
    uint tile = 16;

    uint x = gl_WorkGroupID.x * tile + gl_LocalInvocationID.x;
    uint y = gl_WorkGroupID.y * tile + gl_LocalInvocationID.y;

    float sum = 0.0;
    for (uint kv = 0u; kv < vecsPerRow; ++kv) {
        // First of the four in_tensor_b rows covered by this step.
        uint bRow = kv * 4;

        vec4 a = in_tensor_a[(y * vecsPerRow) + kv];
        vec4 b = vec4(in_tensor_b[(bRow * dim) + x],
                      in_tensor_b[((bRow + 1) * dim) + x],
                      in_tensor_b[((bRow + 2) * dim) + x],
                      in_tensor_b[((bRow + 3) * dim) + x]);

        sum += dot(a, b);
    }
    out_tensor[(y * dim) + x] = sum;
}


--------------------------------------------------------------------------------
/vulkan/MMult_vk_3.cpp:
--------------------------------------------------------------------------------
 1 | #define SPDLOG_ACTIVE_LEVEL 5
 2 | #include "Shader.hpp"
 3 | #include "kompute/Kompute.hpp"
 4 | #include <cassert>
 5 | #include <chrono>
 6 | #include <iostream>
 7 | 
 8 | // MY_MMult = [
 9 | // 64 20.13 2.861023e-06 
10 | // 128 22.85 5.722046e-06 
11 | // 192 23.48 1.144409e-05 
12 | // 256 23.24 1.716614e-05 
13 | // 320 23.26 2.098083e-05 
14 | // 384 23.29 2.288818e-05 
15 | // 448 23.30 2.861023e-05 
16 | // 512 23.31 3.433228e-05 
17 | // ];
18 | float kompute(const std::string &comp, uint32_t m, uint32_t k,
19 |               uint32_t n, float *a, float *b, float *c) {
20 |   constexpr uint32_t local_size = 16;
21 |   kp::Manager mgr;
22 | 
23 |   // Create and initialise Kompute Tensors through manager
24 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
25 |   auto tensorInA = mgr.tensor(a, m * k, sizeof(float), dtype);
26 |   auto tensorInB = mgr.tensor(b, k * n, sizeof(float), dtype);
27 |   auto tensorInC = mgr.tensor(c, m * n, sizeof(float), dtype);
28 | 
29 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB,
30 |                                                      tensorInC};
31 | 
32 |   // Create algorithm based on shader (supports buffers & push/spec constants)
33 |   kp::Workgroup workgroup({m / local_size, n / local_size, 1});
34 | 
35 |   auto algorithm =
36 |       mgr.algorithm(params, compileFile(comp), workgroup, {k * 1.f});
37 | 
38 |     auto seq = mgr.sequence(0, 3);
39 |     seq->record<kp::OpTensorSyncDevice>(params)
40 |       ->record<kp::OpAlgoDispatch>(algorithm)
41 |       ->record<kp::OpTensorSyncLocal>(params)->eval();
42 | 
43 |     auto timestamps = seq->getTimestamps();
44 |     auto computecost = timestamps[2] - timestamps[1];
45 |      memcpy(c, tensorInC->data<float>(), m * n * sizeof(float));
46 |     return computecost/1e6f;
47 | }
48 | 
49 | float MY_MMult(int m, int n, int k, float *a, float *b, float *c) {
50 |   return kompute("MMult_vk_3.comp", static_cast<uint32_t>(m),
51 |                  static_cast<uint32_t>(n), static_cast<uint32_t>(k), a, b, c);
52 | }
53 | 


--------------------------------------------------------------------------------
/vulkan/README.md:
--------------------------------------------------------------------------------
 1 | # How to Build
 2 | 
 3 | ## Fetch Vulkan SDK
 4 | 
 5 | ```bash
 6 | $  wget https://sdk.lunarg.com/sdk/download/1.3.204.1/linux/vulkansdk-linux-x86_64-1.3.204.1.tar.gz
 7 | $ tar xvf vulkansdk-linux-x86_64-1.3.204.1.tar.gz
 8 | $ export VULKAN_SDK=/path/to/1.3.204.1/x86_64
 9 | ```
10 | 
11 | ## Build and Install `glslangValidator`
12 | 
13 | ```bash
14 | $ git clone https://github.com/KhronosGroup/glslang.git --recursive --depth=1
15 | $ cd glslang
16 | $ ./update_glslang_sources.py
17 | $ mkdir -p build && cd build && cmake -DCMAKE_INSTALL_PREFIX="/path/to/glslang/install" ..
18 | $ make && make install
19 | $ export PATH=/path/to/glslang/install/bin:$PATH
20 | ```
21 | 
22 | ## Build and Install `kompute`
23 | 
24 | ```bash
25 | $ git clone https://github.com/KomputeProject/kompute  --depth=1 --recursive
26 | $ cd kompute
27 | $ mkdir -p build && cd build
28 | $ cmake -DCMAKE_INSTALL_PREFIX="/path/to/kompute/install" ..
29 | $ make && make install
30 | ```
31 | 
32 | ## Build
33 | Now we have `libkompute.a` and `glslangValidator`, edit makefile and compile our GEMM implementation.
34 | ```bash
35 | $ vim makefile
36 | #  update KOMPUTE_BUILD
37 | $ export CPLUS_INCLUDE_PATH=`pwd`
38 | $ make
39 | ...
40 | ```
41 | 
42 | ## Run
43 | On Jetson Nano, enable MAXN power mode first.
44 | 
45 | ```bash
46 | $ sudo jetson_clocks
47 | ...
48 | $ ./test_MMult.x
49 | ...
50 | ```
51 | 


--------------------------------------------------------------------------------
/vulkan/REF_MMult.cpp:
--------------------------------------------------------------------------------
 1 | /* Create macros so that the matrices are stored in row-major order */
 2 | 
 3 | #if 0
 4 | #include <cblas.h>
 5 | /* Routine for computing C = A * B + C */
 6 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) {
 7 |   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, k,
 8 |               b, n, 0.0f, c, n);
 9 | }
10 | 
11 | #else
12 | 
13 | #define A(i, j) a[(i)*k + (j)]
14 | #define B(i, j) b[(i)*n + (j)]
15 | #define C(i, j) c[(i)*n + (j)]
16 | /* Routine for computing C = A * B + C */
17 | 
18 | void REF_MMult(int m, int n, int k, float *a, float *b, float *c) {
19 |   int i, j, p;
20 | 
21 |   for (i = 0; i < m; i++) {
22 |     for (j = 0; j < n; j++) {
23 |       for (p = 0; p < k; p++) {
24 |         C(i, j) = C(i, j) + A(i, p) * B(p, j);
25 |       }
26 |     }
27 |   }
28 | }
29 | #endif


--------------------------------------------------------------------------------
/vulkan/Shader.hpp:
--------------------------------------------------------------------------------
 1 | // SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <iostream>
 6 | #include <vector>
 7 | #include <fstream>
 8 | 
 9 | /**
10 |  * Compile a single glslang source from string value. This is only meant
11 |  * to be used for testing as it's non threadsafe, and it had to be removed
12 |  * from the glslang dependency and now can only run the CLI directly due to 
13 |  * license issues: see https://github.com/KomputeProject/kompute/pull/235
14 |  *
15 |  * @param source An individual raw glsl shader in string format
16 |  * @return The compiled SPIR-V binary in unsigned int32 format
17 |  */
18 | static
19 | std::vector<uint32_t>
20 | compileSource(
21 |   const std::string& source)
22 | {
23 |     std::ofstream fileOut("tmp_kp_shader.comp");
24 | 	fileOut << source;
25 | 	fileOut.close();
26 |     if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str()))
27 |         throw std::runtime_error("Error running glslangValidator command");
28 |     std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary);
29 |     std::vector<char> buffer;
30 |     buffer.insert(buffer.begin(), std::istreambuf_iterator<char>(fileStream), {});
31 |     return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
32 | }
33 | 
/**
 * Compile a GLSL shader file by shelling out to the glslangValidator CLI.
 * The result is written to tmp_kp_shader.comp.spv in the current directory
 * and read back, so concurrent calls would race on that file.
 *
 * The filename is inserted into the shell command as-is (no quoting); only
 * pass trusted names without shell metacharacters.
 *
 * @param filename An individual raw glsl shader filename
 * @return The compiled SPIR-V binary in unsigned int32 format
 * @throws std::runtime_error if glslangValidator fails, or if the produced
 *         file is missing, empty or not a whole number of 32-bit words
 */
static
std::vector<uint32_t>
compileFile(const std::string& filename)
{
    // Built as a std::string: a fixed 256-byte buffer overflowed for long names.
    const std::string cmd =
        "glslangValidator -V " + filename + " -o tmp_kp_shader.comp.spv";
    if (system(cmd.c_str()))
        throw std::runtime_error("Error running glslangValidator command");

    // Open positioned at the end so the size is known before reading.
    std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary | std::ios::ate);
    if (!fileStream)
        throw std::runtime_error("Error opening tmp_kp_shader.comp.spv");

    const std::streamoff byteCount = fileStream.tellg();
    if (byteCount <= 0 ||
        byteCount % static_cast<std::streamoff>(sizeof(uint32_t)) != 0)
        throw std::runtime_error("Invalid SPIR-V size in tmp_kp_shader.comp.spv");

    // Read straight into word-typed storage instead of reinterpreting a char
    // buffer as a uint32_t range.
    std::vector<uint32_t> words(static_cast<size_t>(byteCount) / sizeof(uint32_t));
    fileStream.seekg(0, std::ios::beg);
    if (!fileStream.read(reinterpret_cast<char*>(words.data()), byteCount))
        throw std::runtime_error("Error reading tmp_kp_shader.comp.spv");
    return words;
}
51 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/.gitignore:
--------------------------------------------------------------------------------
 1 | gflops_fmla
 2 | 
 3 | gmem_latency
 4 | gmem_bandwidth
 5 | gmem_banchmark
 6 | 
 7 | smem_bandwidth
 8 | smem_latency
 9 | 
10 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/build.sh:
--------------------------------------------------------------------------------
#! /bin/bash
#
# Build one benchmark program against a Kompute build tree.
#
# Usage: ./build.sh NAME        (compiles NAME.cpp and links it into ./NAME)
#
# KOMPUTE_BUILD may be set in the environment to point at the Kompute build
# directory; when unset it falls back to the original hard-coded location.

# Stop at the first failing command so a compile error never goes on to link
# a stale object file.
set -e

NAME=${1}
if [ -z "${NAME}" ]; then
  echo "usage: $0 NAME   (builds NAME.cpp into ./NAME)" >&2
  exit 1
fi

KOMPUTE_BUILD="${KOMPUTE_BUILD:-/home/khj/kompute/build}"

g++ -g -O0 -std=c++17 -c "${NAME}.cpp"
g++ -o "${NAME}" "${NAME}.o"  "${KOMPUTE_BUILD}/src/libkompute.a" "${KOMPUTE_BUILD}/src/kompute_fmt/libfmt.a" "${KOMPUTE_BUILD}/src/kompute_spdlog/libspdlog.a" -L/usr/local/lib -lvulkan -lpthread
8 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/gflops_fmla.cpp:
--------------------------------------------------------------------------------
 1 | #include "../Shader.hpp"
 2 | #include "../kompute/Kompute.hpp"
 3 | #include <iostream>
 4 | #include <cassert>
 5 | #include "types.h"
 6 | 
 7 | // gflops_fmla: 184.358588 
 8 | constexpr uint32_t COUNT = 16384;
 9 | constexpr uint32_t BLOCK = 256;
10 | 
11 | constexpr float LOOP = 3000000.0;
12 | // Runs shader `filename` once over COUNT floats; returns timestamps[3] - timestamps[0].
13 | uint64_t kompute(const std::string &filename) {
14 |   // One workgroup per BLOCK elements (the gflops_fmla shaders declare local_size_x = 256).
15 |   kp::Manager mgr;
16 |   kp::Workgroup workgroup({COUNT / BLOCK, 1, 1});
17 |   
18 |   AlignVector data1(COUNT, 1.0f);
19 |   AlignVector data2(COUNT, 0.0f);
20 |   AlignVector data3(COUNT, 0.0f);
21 | 
22 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
23 |   auto tensorIn1 = mgr.tensor(data1.data(), data1.size(), sizeof(float), dtype);
24 |   auto tensorIn2 = mgr.tensor(data2.data(), data2.size(), sizeof(float), dtype);
25 |   auto tensorOut = mgr.tensor(data3.data(), data3.size(), sizeof(float), dtype);
26 |   // LOOP is handed to the algorithm; the shaders read it as constant_id 0 (`loopf`).
27 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorIn1, tensorIn2, tensorOut};
28 |   auto algorithm = mgr.algorithm(params, compileFile(filename), workgroup, {LOOP});
29 |   auto seq = mgr.sequence(0, 3);
30 | 
31 |   seq->record<kp::OpTensorSyncDevice>(params)
32 |       ->record<kp::OpAlgoDispatch>(algorithm)
33 |       ->record<kp::OpTensorSyncLocal>(params)
34 |       ->eval();
35 |   // Print the first ten outputs as a quick sanity check of the run.
36 |   const float* pResult = static_cast<float*>(tensorOut->rawData());
37 |   for (int  i =0;  i< 10; ++i) {
38 |     fprintf(stdout, "%f ", pResult[i]);
39 |   }
40 |   // Three recorded operations are expected to yield four timestamps (ops + 1, as the sibling benchmarks assert).
41 |   auto timestamps = seq->getTimestamps();
42 |   assert(timestamps.size() == 4);
43 |   return (timestamps[3] - timestamps[0]);
44 | }
44 | 
45 | int main() {
46 |   // Shader 1 performs load + FMA loop + store, shader 2 only load + store;
47 |   // subtracting the second run isolates the time spent in the FMA loop.
48 |   const uint64_t rw_compute_cost = kompute("gflops_fmla_1.comp");
49 |   const uint64_t rw_cost = kompute("gflops_fmla_2.comp");
50 |   if (rw_compute_cost <= rw_cost) {
51 |     // The unsigned difference would wrap around (or be zero) and print a bogus rate.
52 |     fprintf(stderr, "gflops_fmla: baseline run (%llu) is not faster than the compute run (%llu)\n",
53 |             static_cast<unsigned long long>(rw_cost), static_cast<unsigned long long>(rw_compute_cost));
54 |     return 1;
55 |   }
56 |   // The shader executes 10 multiply-add statements per loop iteration and element.
57 |   fprintf(stdout, "gflops_fmla: %lf \n",  LOOP * COUNT * 10.0/ (rw_compute_cost - rw_cost));
58 | 
59 |   return 0;
60 | }
52 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/gflops_fmla_1.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (local_size_x = 256) in;
 4 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
 5 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
 6 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
 7 | 
 8 | layout (constant_id = 0) const float loopf = 0;
 9 | 
10 | void main() {  // one invocation per element: runs `loop` iterations of 10 chained multiply-adds
11 |     float a = in_tensor_1[gl_GlobalInvocationID.x];
12 |     float b = in_tensor_2[gl_GlobalInvocationID.x];
13 |     float c = 1.0;
14 |     int loop = int(loopf);  // iteration count arrives as a float specialization constant
15 | 
16 |     for (int i = 0; i < loop; ++i) {
17 |         c = a * c + b;  // every statement consumes the previous value of c
18 |         c = a * c + b;
19 |         c = a * c + b;
20 |         c = a * c + b;
21 |         c = a * c + b;
22 |         // second group of five: 10 multiply-adds per iteration in total
23 |         c = a * c + b;
24 |         c = a * c + b;
25 |         c = a * c + b;
26 |         c = a * c + b;
27 |         c = a * c + b;
28 |     }
29 |     out_tensor[gl_GlobalInvocationID.x] = c;  // the accumulated value is written to the output buffer
30 | }


--------------------------------------------------------------------------------
/vulkan/benchmark/gflops_fmla_2.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (local_size_x = 256) in;
 4 | layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
 5 | layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
 6 | layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
 7 | 
 8 | layout (constant_id = 0) const float loopf = 0;
 9 | 
10 | void main() {  // baseline kernel: the same two loads and one store as gflops_fmla_1.comp, without the loop
11 |     float a = in_tensor_1[gl_GlobalInvocationID.x];
12 |     float b = in_tensor_2[gl_GlobalInvocationID.x];
13 |     // loopf is declared in this shader but never read by main()
14 |     out_tensor[gl_GlobalInvocationID.x] = a+b;
15 | }


--------------------------------------------------------------------------------
/vulkan/benchmark/gmem_bandwidth.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (local_size_x = 256) in;
 4 | 
 5 | // The input tensors bind index is relative to index in parameter passed
 6 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
 7 | layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
 8 | layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
 9 | 
10 | void main() {  // copies one float per invocation from in_a to out_a
11 |     uint index = gl_GlobalInvocationID.x;
12 |     out_a[index] = in_a[index];  // in_b (binding 1) is declared but not accessed
13 | }
14 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/gmem_bandwidth.cpp:
--------------------------------------------------------------------------------
 1 | #include "Shader.hpp"
 2 | #include "kompute/Kompute.hpp"
 3 | #include <iostream>
 4 | #include <cassert>
 5 | #include "types.h"
 6 | 
 7 | // Uploads a 256 MiB buffer, runs `shader` over it, downloads it, and prints the time and rate of each step.
 8 | void kompute(const std::string &shader) {
 9 |   kp::Manager mgr;
10 | 
11 |   constexpr uint32_t MB = 256;
12 |   constexpr uint32_t SIZE = MB * 1024 * 1024;
13 |   constexpr uint32_t COUNT = SIZE/ sizeof(float); //  cannot exceed `vulkaninfo | grep maxComputeWorkGroupCount`
14 |   static_assert(COUNT <= 2147483647, "element count out of range");  // constant check, so done at compile time
15 |   AlignVector data(COUNT, 3.14f);
16 | 
17 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
18 |   auto tensorIn1 = mgr.tensor(data.data(), data.size(), sizeof(float), dtype);
19 |   auto tensorIn2 = mgr.tensor(data.data(), data.size(), sizeof(float), dtype);
20 |   auto tensorOut = mgr.tensor(data.data(), data.size(), sizeof(float), dtype);
21 | 
22 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorIn1, tensorIn2, tensorOut};
23 |   kp::Workgroup workgroup({COUNT / 256, 1, 1});
24 | 
25 |   auto algorithm = mgr.algorithm(params, compileFile(shader), workgroup);
26 | 
27 |   auto seq = mgr.sequence(0, 3);
28 | 
29 |   seq->record<kp::OpTensorSyncDevice>(params)
30 |       ->record<kp::OpAlgoDispatch>(algorithm)
31 |       ->record<kp::OpTensorSyncLocal>(params)
32 |       ->eval();
33 |   // `i + 1 < size()` stays correct for an empty vector, where `size() - 1` would wrap.
34 |   auto timestamps = seq->getTimestamps();
35 |   for (size_t i = 0; i + 1 < timestamps.size(); ++i) {
36 |     const uint64_t cost = timestamps[i+1] - timestamps[i];
37 |     // Bytes per nanosecond equals GB/s; assumes the timestamps are nanoseconds -- TODO confirm.
38 |     fprintf(stdout, "time cost %llu  %0.4f GB/s \n", static_cast<unsigned long long>(cost), SIZE / static_cast<double>(cost));
39 |   }
40 |   // auto h2d = (timestamps[1] - timestamps[0]) / 1e9f;
41 |   // auto d2d = (timestamps[2] - timestamps[1]) / 1e9f;
42 |   // auto d2h = (timestamps[3] - timestamps[2]) / 1e9f;
43 | 
44 |   // fprintf(stdout, "h2d: %f MB/s, \nd2d %f MB/s, \nd2h: %f MB/s \n",  MB/h2d, MB/d2d,  MB/d2h);
45 | }
45 | 
46 | // Entry point: measures the gmem_bandwidth.comp copy shader once.
47 | int main() {
48 |   const std::string shader_file("gmem_bandwidth.comp");
49 |   kompute(shader_file);
50 |   return 0;
51 | }
49 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/sampler_bandwidth.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (local_size_x = 64) in;
 4 | 
 5 | // The input tensors bind index is relative to index in parameter passed
 6 | layout (binding = 0) uniform sampler2D in_a;
 7 | layout (constant_id = 0) const float tensor_size_f = 0;
 8 | 
 9 | shared vec4 sub_tensor_1[64];
10 | 
11 | void main() {
12 |     // Fetches one texel per invocation and stores it in workgroup-shared memory, `loop` times.
13 |     uint index = gl_GlobalInvocationID.x;
14 |     // sub_tensor_1 has one slot per invocation of a workgroup (64), so it is indexed
15 |     // with the workgroup-local id; the global id exceeds 63 in every workgroup but the first.
16 |     uint slot = gl_LocalInvocationID.x;
17 |     uint loop = uint(tensor_size_f);
18 |     for (uint x = 0; x < loop; ++x) {
19 | 
20 |         ivec2 ipos =  ivec2(index, 0);
21 |         sub_tensor_1[slot] = texelFetch(in_a, ipos, 0);
22 |         barrier();
23 |     }
24 | }


--------------------------------------------------------------------------------
/vulkan/benchmark/smem_bandwidth.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #pragma use_vulkan_memory_model
 3 | 
 4 | layout (local_size_x = 256) in;
 5 | 
 6 | // The input tensors bind index is relative to index in parameter passed
 7 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
 8 | layout (constant_id = 0) const float tensor_size_f = 0;
 9 | 
10 | shared float sub_tensor_1[256];
11 | 
12 | void main() {
13 |     // Copies one float per invocation from the storage buffer into shared memory, `loop` times.
14 |     uint index = gl_GlobalInvocationID.x;
15 |     // sub_tensor_1 has one slot per invocation of a workgroup (256), so it is indexed
16 |     // with the workgroup-local id; the global id exceeds 255 in every workgroup but the first.
17 |     uint slot = gl_LocalInvocationID.x;
18 |     uint loop = uint(tensor_size_f);
19 |     for (uint x = 0; x < loop; ++x) {
20 |         sub_tensor_1[slot] = in_a[index];
21 |         barrier();
22 |     }
23 | }


--------------------------------------------------------------------------------
/vulkan/benchmark/smem_bandwidth.cpp:
--------------------------------------------------------------------------------
 1 | #include "Shader.hpp"
 2 | #include "kompute/Kompute.hpp"
 3 | #include <iostream>
 4 | #include <cassert>
 5 | #include "types.h"
 6 | 
 7 | // Runs `shader` for LOOP iterations over a 32 KiB buffer and prints the rate
 8 | // derived from the dispatch interval timestamps[2] - timestamps[1].
 9 | void kompute(const std::string &shader) {
10 |   kp::Manager mgr;
11 | 
12 |   constexpr uint32_t SIZE_IN_BYTES = 32768;
13 |   constexpr uint32_t BLOCK = 256;
14 |   constexpr uint32_t COUNT = SIZE_IN_BYTES / sizeof(float);
15 |   AlignVector data(COUNT, 3.14f);
16 |  
17 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
18 |   auto tensorIn = mgr.tensor(data.data(), data.size(), sizeof(float), dtype);
19 | 
20 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorIn};
21 |   // One workgroup per BLOCK *elements*. Dividing the byte count by BLOCK would
22 |   // dispatch four times more invocations than the tensor has floats, so
23 |   // in_a[gl_GlobalInvocationID.x] would run past the end of the buffer and the
24 |   // kernel would move more than the SIZE_IN_BYTES assumed by the rate below.
25 |   kp::Workgroup workgroup({COUNT / BLOCK, 1, 1});
26 |   constexpr float LOOP = 1000000.f;
27 |   auto algorithm = mgr.algorithm(params, compileFile(shader), workgroup, {LOOP});
28 | 
29 |   auto seq = mgr.sequence(0, 2);
30 | 
31 |   seq->record<kp::OpTensorSyncDevice>(params)
32 |       ->record<kp::OpAlgoDispatch>(algorithm)
33 |       ->eval();
34 | 
35 |   auto timestamps = seq->getTimestamps();
36 |   assert(timestamps.size() == 3);
37 |   auto gmem2smem = (timestamps[2] - timestamps[1]);
38 | 
39 |   // Seconds per loop iteration; assumes the timestamps are nanoseconds -- TODO confirm.
40 |   const float sec = gmem2smem * 1.0 / LOOP / 1e9f;
41 |   fprintf(stdout, "***** %s bandwidth %0.3f GB/s \n", shader.c_str(),SIZE_IN_BYTES / 1024. / 1024. / 1024. / sec);
42 | }
35 | 
36 | // Entry point: measures each shared-memory / sampler shader in turn.
37 | int main() {
38 |   // Rates recorded by the author: smem_bandwidth.comp 10.665 GB/s,
39 |   // smem_bandwidth1.comp 18.663 GB/s, sampler_bandwidth.comp 35.502 GB/s.
40 |   const char *const shader_files[] = {"smem_bandwidth.comp", "smem_bandwidth1.comp",
41 |                                       "sampler_bandwidth.comp"};
42 |   for (const char *shader_file : shader_files) kompute(shader_file);
43 | }
44 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/smem_bandwidth1.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #pragma use_vulkan_memory_model
 3 | 
 4 | layout (local_size_x = 32) in;
 5 | 
 6 | // The input tensors bind index is relative to index in parameter passed
 7 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
 8 | layout (constant_id = 0) const float tensor_size_f = 0;
 9 | 
10 | shared vec4 sub_tensor_1[64][2];
11 | 
12 | void main() {
13 |     // Each invocation gathers eight floats (stride 32) into two vec4 values and
14 |     // stores them in shared memory, `loop` times.
15 |     uint index = gl_GlobalInvocationID.x;
16 |     // sub_tensor_1 is a per-workgroup array, so its row is selected with the
17 |     // workgroup-local id; the global id leaves the array once several workgroups run.
18 |     uint slot = gl_LocalInvocationID.x;
19 |     uint loop = uint(tensor_size_f);
20 |     for (uint x = 0; x < loop; ++x) {
21 |         vec4 val0;
22 |         val0.r = in_a[index];
23 |         val0.g = in_a[index  + 32];
24 |         val0.b = in_a[index + 64];
25 |         val0.a = in_a[index + 96];
26 |         sub_tensor_1[slot][0] = val0;
27 | 
28 |         vec4 val1;
29 |         val1.r = in_a[index + 128];
30 |         val1.g = in_a[index  + 160];
31 |         val1.b = in_a[index + 192];
32 |         val1.a = in_a[index + 224];
33 |         sub_tensor_1[slot][1] = val1;
34 |         barrier();
35 |     }
36 | }


--------------------------------------------------------------------------------
/vulkan/benchmark/smem_latency.cpp:
--------------------------------------------------------------------------------
 1 | #include "Shader.hpp"
 2 | #include "kompute/Kompute.hpp"
 3 | #include <iostream>
 4 | #include <cassert>
 5 | #include "types.h"
 6 | 
 7 | // gmem2smem 80.194374 ns ~  72 cycle  0.899 GHz
 8 | // Compiles the GLSL source text `shader`, runs it on a single workgroup and
 9 | // prints the average duration of one loop iteration in ns and in GPU cycles.
10 | void kompute(const std::string &shader) {
11 |   kp::Manager mgr;
12 | 
13 |   constexpr uint32_t SIZE = 128; // 128B
14 |   constexpr uint32_t COUNT = SIZE/ sizeof(float); //  cannot exceed `vulkaninfo | grep maxComputeWorkGroupCount`
15 |   AlignVector data(COUNT, 3.14f);
16 | 
17 |   auto dtype = kp::Tensor::TensorDataTypes::eFloat;
18 |   auto tensorIn = mgr.tensor(data.data(), data.size(), sizeof(float), dtype);
19 | 
20 |   std::vector<std::shared_ptr<kp::Tensor>> params = {tensorIn};
21 |   kp::Workgroup workgroup({1, 1, 1});
22 |   constexpr float LOOP = 10000000.f;
23 |   auto algorithm = mgr.algorithm(params, compileSource(shader), workgroup, {LOOP});
24 | 
25 |   auto seq = mgr.sequence(0, 2);
26 | 
27 |   seq->record<kp::OpTensorSyncDevice>(params)
28 |       ->record<kp::OpAlgoDispatch>(algorithm)
29 |       ->eval();
30 | 
31 |   auto timestamps = seq->getTimestamps();
32 |   assert(timestamps.size() == 3);
33 |   auto gmem2smem = (timestamps[2] - timestamps[1]);
34 | 
35 |   // Dispatch time divided by the iteration count; the timestamps are treated as nanoseconds.
36 |   const float ns = gmem2smem / LOOP;
37 |   // 921 MHz expressed in GHz. MHz -> GHz is a decimal conversion (divide by 1000, not 1024).
38 |   constexpr float GHz = 921/ 1000.f; // jetson nano max_frequency.
39 |   const int cycle = static_cast<int>(ns * GHz);
40 |   fprintf(stdout, "***** gmem2smem %f ns ~ %d cycle  %0.3f GHz \n",  ns, cycle, GHz);
41 | }
38 | 
39 | // Entry point: times a shader that copies one float per invocation into shared memory.
40 | int main() {
41 |   const std::string shader_source = R"(
42 |         #version 450
43 | 
44 |         layout (local_size_x = 32) in;
45 | 
46 |         // The input tensors bind index is relative to index in parameter passed
47 |         layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
48 |         layout (constant_id = 0) const float tensor_size_f = 0;
49 | 
50 |         shared float sub_tensor_1[32];
51 | 
52 |         void main() {
53 |             uint index = gl_GlobalInvocationID.x;
54 |             uint loop = uint(tensor_size_f);
55 |             for (uint x = 0; x < loop; ++x) {
56 |               sub_tensor_1[index] = in_a[index];
57 |               barrier();
58 |             }
59 |         }
60 |     )";
61 | 
62 |   kompute(shader_source);
63 | }
64 | 


--------------------------------------------------------------------------------
/vulkan/benchmark/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstddef>
 3 | #include <cstdlib>
 4 | #include <limits>
 5 | #include <new>
 6 | #include <vector>
 3 | 
 4 | // Stateless allocator that returns 64-byte aligned storage; used as the
 5 | // allocator of AlignVector.
 6 | template <class T>
 7 | struct AlignAllocator {
 8 |     typedef T value_type;
 9 | 
10 |     AlignAllocator() = default;
11 |     template <class U>
12 |     constexpr AlignAllocator(const AlignAllocator<U>&) noexcept {}
13 | 
14 |     // Returns storage for n objects of type T, aligned to 64 bytes.
15 |     // Throws std::bad_alloc if the byte count overflows or the allocation fails.
16 |     [[nodiscard]] T* allocate(std::size_t n) {
17 |         constexpr std::size_t kAlignment = 64;
18 |         constexpr std::size_t kMax = std::numeric_limits<std::size_t>::max();
19 |         // Leave headroom for the round-up below so it cannot overflow either.
20 |         if (n > (kMax - (kAlignment - 1)) / sizeof(T))
21 |             throw std::bad_alloc();
22 | 
23 |         // std::aligned_alloc specifies the size as a multiple of the alignment,
24 |         // so round the request up; a zero-size request becomes one aligned unit
25 |         // so that a null result always means failure.
26 |         std::size_t bytes = (n * sizeof(T) + kAlignment - 1) / kAlignment * kAlignment;
27 |         if (bytes == 0)
28 |             bytes = kAlignment;
29 | 
30 |         if (void* p = std::aligned_alloc(kAlignment, bytes)) {
31 |             return static_cast<T*>(p);
32 |         }
33 | 
34 |         throw std::bad_alloc();
35 |     }
36 | 
37 |     // Releases storage obtained from allocate(); the element count is not needed.
38 |     void deallocate(T* p, std::size_t /*n*/) noexcept { std::free(p); }
39 | };
25 | 
26 | template <class Lhs, class Rhs>  // the allocator holds no data members, so any two instances compare equal
27 | bool operator==(const AlignAllocator<Lhs>& /*lhs*/, const AlignAllocator<Rhs>& /*rhs*/) {
28 |     const bool interchangeable = true;
29 |     return interchangeable;
30 | }
30 | 
31 | template <class Lhs, class Rhs>  // counterpart of operator==: two instances never differ
32 | bool operator!=(const AlignAllocator<Lhs>& /*lhs*/, const AlignAllocator<Rhs>& /*rhs*/) {
33 |     const bool distinct = false;
34 |     return distinct;
35 | }
35 | 
36 | using AlignVector = std::vector<float, AlignAllocator<float> >;
37 | 


--------------------------------------------------------------------------------
/vulkan/compare_matrices.cpp:
--------------------------------------------------------------------------------
 1 | // Element-wise comparison of two matrices (no `abs` macro: it shadowed the standard function name).
 2 | 
 3 | #include <stdio.h>
 4 | 
 5 | // Returns the largest absolute element-wise difference between the m x n
 6 | // matrices a and b (both indexed as [i * n + j]).  The first element whose
 7 | // difference pushes the running maximum above 0.5 is reported on stdout.
 8 | float compare_matrices(int m, int n, float *a, float *b) {
 9 |   float max_diff = 0.0f;
10 |   int printed = 0;
11 | 
12 |   for (int i = 0; i < m; i++) {
13 |     for (int j = 0; j < n; j++) {
14 |       const float got = a[i * n + j];
15 |       const float expect = b[i * n + j];
16 | 
17 |       float diff = got - expect;
18 |       if (diff < 0.0f) {
19 |         diff = -diff;
20 |       }
21 |       if (diff > max_diff) {
22 |         max_diff = diff;
23 |       }
24 | 
25 |       // max_diff is never negative, so only the upper bound needs checking.
26 |       if (0 == printed && max_diff > 0.5f) {
27 |         fprintf(stdout, "error: i %d  j %d diff %f  got %f  expect %f \n", i,
28 |                 j, max_diff, got, expect);
29 |         printed = 1;
30 |       }
31 |     }
32 |   }
33 | 
34 |   return max_diff;
35 | }
34 | 


--------------------------------------------------------------------------------
/vulkan/copy_matrix.cpp:
--------------------------------------------------------------------------------
 1 | // Copies the m x n matrix a into b; both are indexed as [i * n + j].
 2 | void copy_matrix(int m, int n, float *a, float *b) {
 3 |   // Visit the elements in storage order (row by row).  With the column index
 4 |   // in the outer loop every inner step would jump n floats through memory.
 5 |   for (int i = 0; i < m; i++) {
 6 |     for (int j = 0; j < n; j++) {
 7 |       // Widen before multiplying so large matrices do not overflow int.
 8 |       const long offset = (long)i * n + j;
 9 |       b[offset] = a[offset];
10 |     }
11 |   }
12 | }
16 | 


--------------------------------------------------------------------------------
/vulkan/dclock.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <time.h>
 3 | 
 4 | // Whole-second reading taken on the first dclock() call; 0.0 means "not set yet".
 5 | static double gtod_ref_time_sec = 0.0;
 6 | 
 7 | /* Adapted from the bl2_clock() routine in the BLIS library */
 8 | 
 9 | // Returns the wall-clock time in seconds, measured from the whole second of
10 | // the first call, using gettimeofday().
11 | double dclock() {
12 |   struct timeval now;
13 |   gettimeofday(&now, NULL);
14 | 
15 |   const double whole_sec = (double)now.tv_sec;
16 |   if (gtod_ref_time_sec == 0.0) {
17 |     gtod_ref_time_sec = whole_sec;
18 |   }
19 | 
20 |   const double since_ref = whole_sec - gtod_ref_time_sec;
21 |   return since_ref + now.tv_usec * 1.0e-6;
22 | }
23 | 


--------------------------------------------------------------------------------
/vulkan/fmt/locale.h:
--------------------------------------------------------------------------------
 1 | // Formatting library for C++ - std::locale support
 2 | //
 3 | // Copyright (c) 2012 - present, Victor Zverovich
 4 | // All rights reserved.
 5 | //
 6 | // For the license information refer to format.h.
 7 | 
 8 | #ifndef FMT_LOCALE_H_
 9 | #define FMT_LOCALE_H_
10 | 
11 | #include <locale>
12 | 
13 | #include "format.h"
14 | 
15 | FMT_BEGIN_NAMESPACE
16 | // Overloads of the formatting entry points that take a std::locale as an extra argument.
17 | namespace detail {
18 | template <typename Char>
19 | std::basic_string<Char> vformat(  // formats into a memory buffer with locale_ref(loc), then returns it as a string
20 |     const std::locale& loc, basic_string_view<Char> format_str,
21 |     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
22 |   basic_memory_buffer<Char> buffer;
23 |   detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
24 |   return fmt::to_string(buffer);
25 | }
26 | }  // namespace detail
27 | // Pre-packed-argument overload: converts format_str to a string view and delegates to detail::vformat.
28 | template <typename S, typename Char = char_t<S>>
29 | inline std::basic_string<Char> vformat(
30 |     const std::locale& loc, const S& format_str,
31 |     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
32 |   return detail::vformat(loc, to_string_view(format_str), args);
33 | }
34 | // Variadic overload: packs args with make_args_checked and delegates to detail::vformat.
35 | template <typename S, typename... Args, typename Char = char_t<S>>
36 | inline std::basic_string<Char> format(const std::locale& loc,
37 |                                       const S& format_str, Args&&... args) {
38 |   return detail::vformat(loc, to_string_view(format_str),
39 |                          fmt::make_args_checked<Args...>(format_str, args...));
40 | }
41 | // Output-iterator overload: formats through a buffer obtained for `out` and returns detail::get_iterator(buf).
42 | template <typename S, typename OutputIt, typename... Args,
43 |           typename Char = char_t<S>,
44 |           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
45 | inline OutputIt vformat_to(
46 |     OutputIt out, const std::locale& loc, const S& format_str,
47 |     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
48 |   decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
49 |   vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc));
50 |   return detail::get_iterator(buf);
51 | }
52 | // Variadic output-iterator overload; enabled only when is_output_iterator<OutputIt, char_t<S>> holds.
53 | template <typename OutputIt, typename S, typename... Args,
54 |           bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
55 | inline auto format_to(OutputIt out, const std::locale& loc,
56 |                       const S& format_str, Args&&... args) ->
57 |     typename std::enable_if<enable, OutputIt>::type {
58 |   const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
59 |   return vformat_to(out, loc, to_string_view(format_str), vargs);
60 | }
61 | 
62 | FMT_END_NAMESPACE
63 | 
64 | #endif  // FMT_LOCALE_H_
65 | 


--------------------------------------------------------------------------------
/vulkan/fmt/posix.h:
--------------------------------------------------------------------------------
1 | #include "os.h"
2 | #warning "fmt/posix.h is deprecated; use fmt/os.h instead"
3 | 


--------------------------------------------------------------------------------
/vulkan/makefile:
--------------------------------------------------------------------------------
 1 | OLD  := MMult_vk_naive
 2 | NEW := MMult_vk_2
 3 | KOMPUTE_BUILD := /home/khj/kompute/build
 4 | 
 5 | CC         := g++
 6 | LINKER     := $(CC)
 7 | # CFLAGS     := -std=c++17 -O0 -g -Wall
 8 | CFLAGS     := -std=c++17 -O2 -g
 9 | LDFLAGS    := -lm $(KOMPUTE_BUILD)/src/libkompute.a $(KOMPUTE_BUILD)/src/kompute_fmt/libfmt.a $(KOMPUTE_BUILD)/src/kompute_spdlog/libspdlog.a `pkg-config --libs vulkan` -lpthread
10 | 
11 | UTIL       := copy_matrix.o \
12 |               compare_matrices.o \
13 |               random_matrix.o \
14 |               dclock.o \
15 |               REF_MMult.o \
16 |               print_matrix.o
17 | 
18 | TEST_OBJS  := test_MMult.o $(NEW).o 
19 | .PHONY: all run clean cleanall  # action targets, not files
20 | %.o: %.cpp
21 | 	$(CC) $(CFLAGS) $(GENCODE_FLAGS)  -c $< -o $@
22 | # Always rebuild from scratch; $(MAKE) passes flags and the jobserver on to the sub-make.
23 | all: 
24 | 	$(MAKE) clean
25 | 	$(MAKE) test_MMult.x
26 | # Link the test driver; the output name is the target itself.
27 | test_MMult.x: $(TEST_OBJS) $(UTIL) parameters.h
28 | 	$(LINKER) $(TEST_OBJS) $(UTIL) $(LDFLAGS) \
29 |         $(BLAS_LIB) -o $@
30 | # Build, run, and store the results as output_$(NEW).m plus the old/new copies used for comparison.
31 | run:	
32 | 	$(MAKE) all
33 | 	echo "version = '$(NEW)';" > output_$(NEW).m
34 | 	./test_MMult.x >> output_$(NEW).m
35 | 	cp output_$(OLD).m output_old.m
36 | 	cp output_$(NEW).m output_new.m
37 | 
38 | clean:
39 | 	rm -f *.o *~ core *.x
40 | 
41 | cleanall:
42 | 	rm -f *.o *~ core *.x output*.m *.eps *.png
43 | 


--------------------------------------------------------------------------------
/vulkan/parameters.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 | In the test driver, there is a loop "for ( p=PFIRST; p<= PLAST; p+= PINC )"
 3 | The below parameters set this range of values that p takes on 
 4 | */   
 5 | #define PFIRST 64
 6 | #define PLAST  512
 7 | #define PINC   64
 8 | 
 9 | /* 
10 | In the test driver, the m, n, and k dimensions are set to the below 
11 | values.  If the value equals "-1" then that dimension is bound to the
12 | index p, given above.
13 | */
14 | 
15 | #define M -1
16 | #define N -1
17 | #define K -1
18 | 
19 | /* 
20 | In the test driver, each experiment is repeated NREPEATS times and
21 | the best time from these repeats is used to compute the performance
22 | */
23 | 
24 | #define NREPEATS 1
25 | 
26 | /* 
27 | Matrices A, B, and C are stored in two dimensional arrays with
28 | row dimensions that are greater than or equal to the row dimension
29 | of the matrix.  This row dimension of the array is known as the 
30 | "leading dimension" and determines the stride (the number of 
31 | double precision numbers) when one goes from one element in a row
32 | to the next.  Having this number larger than the row dimension of
33 | the matrix tends to adversely affect performance.  LDX equals the
34 | leading dimension of the array that stores matrix X.  If LDX=-1 
35 | then the leading dimension is set to the row dimension of matrix X.
36 | */
37 | 
38 | #if 0
39 | #define LDA 1000
40 | #define LDB 1000
41 | #define LDC 1000
42 | #else
43 | #define LDA -1 
44 | #define LDB -1 
45 | #define LDC -1 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/vulkan/plot.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | def readFile(filename):
 6 |     f = open(filename)
 7 |     sizes = []
 8 |     times = []
 9 |     title = ''
10 |     try:
11 |         title = f.readline()
12 |         # skip 3 line
13 |         f.readline()
14 |         f.readline()
15 |         f.readline()
16 |         while True:
17 |             line = f.readline()
18 |             if line:
19 |                 slices = line.split(" ")
20 |                 if len(slices) <= 2:
21 |                     break;
22 |                 size = int(slices[0])
23 |                 time = float(slices[1])
24 |                 sizes.append(size)
25 |                 times.append(time)
26 |     finally:
27 |         f.close()
28 |     return title, sizes, times
29 | 
30 | if __name__ == '__main__':
31 |     plt.xlabel('shape')
32 |     plt.ylabel('gflops')
33 |     l = len(sys.argv)
34 |     for i,item in enumerate(sys.argv):
35 |         if i == 0:
36 |             continue
37 |         t,x,y = readFile(item)
38 |         plt.plot(x,y,label=t)
39 |     plt.legend()
40 |     plt.show()
41 | 
42 | 


--------------------------------------------------------------------------------
/vulkan/print_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define A(i, j) a[(i)*lda + (j)]
 4 | 
 5 | // Prints the m x n matrix a (row stride lda) to stdout: one row per line,
 6 | // values tab-terminated with one decimal, followed by an empty line.
 7 | void print_matrix(int m, int n, float *a, int lda) {
 8 |   for (int row = 0; row < m; ++row) {
 9 |     int col = 0;
10 |     while (col < n) {
11 |       printf("%.1f\t", a[row * lda + col]);
12 |       ++col;
13 |     }
14 |     printf("\n");
15 |   }
16 |   printf("\n");
17 | }
16 | 


--------------------------------------------------------------------------------
/vulkan/random_matrix.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | // Fills the m x n matrix a (indexed as [i * n + j]) with values computed as
 4 | // 2 * drand48() - 1, drawing one number per element in row order.
 5 | void random_matrix(int m, int n, float *a) {
 6 |   double drand48();
 7 | 
 8 |   for (int row = 0; row < m; ++row) {
 9 |     for (int col = 0; col < n; ++col) {
10 |       const float sample = (float)drand48();
11 |       a[row * n + col] = 2.0 * sample - 1.0;
12 |     }
13 |   }
14 | }
21 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/async_logger-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/async_logger.h>
 8 | #endif
 9 | 
10 | #include <spdlog/sinks/sink.h>
11 | #include <spdlog/details/thread_pool.h>
12 | 
13 | #include <memory>
14 | #include <string>
15 | // Sink-list constructor: delegates to the iterator-range constructor with the list's begin()/end().
16 | SPDLOG_INLINE spdlog::async_logger::async_logger(
17 |     std::string logger_name, sinks_init_list sinks_list, std::weak_ptr<details::thread_pool> tp, async_overflow_policy overflow_policy)
18 |     : async_logger(std::move(logger_name), sinks_list.begin(), sinks_list.end(), std::move(tp), overflow_policy)
19 | {}
20 | // Single-sink constructor: wraps the sink in a one-element list and delegates to the sink-list constructor.
21 | SPDLOG_INLINE spdlog::async_logger::async_logger(
22 |     std::string logger_name, sink_ptr single_sink, std::weak_ptr<details::thread_pool> tp, async_overflow_policy overflow_policy)
23 |     : async_logger(std::move(logger_name), {std::move(single_sink)}, std::move(tp), overflow_policy)
24 | {}
25 | 
26 | // Posts the log message to the thread pool; calls throw_spdlog_ex when the pool has already been destroyed.
27 | SPDLOG_INLINE void spdlog::async_logger::sink_it_(const details::log_msg &msg)
28 | {
29 |     if (auto pool_ptr = thread_pool_.lock())
30 |     {
31 |         pool_ptr->post_log(shared_from_this(), msg, overflow_policy_);
32 |     }
33 |     else
34 |     {
35 |         throw_spdlog_ex("async log: thread pool doesn't exist anymore");
36 |     }
37 | }
38 | 
39 | // Posts a flush request to the thread pool; calls throw_spdlog_ex when the pool has already been destroyed.
40 | SPDLOG_INLINE void spdlog::async_logger::flush_()
41 | {
42 |     if (auto pool_ptr = thread_pool_.lock())
43 |     {
44 |         pool_ptr->post_flush(shared_from_this(), overflow_policy_);
45 |     }
46 |     else
47 |     {
48 |         throw_spdlog_ex("async flush: thread pool doesn't exist anymore");
49 |     }
50 | }
51 | 
52 | //
53 | // backend functions - called from the thread pool to do the actual job
54 | // backend_sink_it_: hands msg to every sink that accepts its level, then flushes if should_flush_(msg).
55 | SPDLOG_INLINE void spdlog::async_logger::backend_sink_it_(const details::log_msg &msg)
56 | {
57 |     for (auto &sink : sinks_)
58 |     {
59 |         if (sink->should_log(msg.level))
60 |         {
61 |             SPDLOG_TRY
62 |             {
63 |                 sink->log(msg);
64 |             }
65 |             SPDLOG_LOGGER_CATCH()
66 |         }
67 |     }
68 | 
69 |     if (should_flush_(msg))
70 |     {
71 |         backend_flush_();
72 |     }
73 | }
74 | // Flushes every sink; each flush() call is wrapped in SPDLOG_TRY / SPDLOG_LOGGER_CATCH individually.
75 | SPDLOG_INLINE void spdlog::async_logger::backend_flush_()
76 | {
77 |     for (auto &sink : sinks_)
78 |     {
79 |         SPDLOG_TRY
80 |         {
81 |             sink->flush();
82 |         }
83 |         SPDLOG_LOGGER_CATCH()
84 |     }
85 | }
86 | // Returns a copy-constructed async_logger whose name is replaced by new_name.
87 | SPDLOG_INLINE std::shared_ptr<spdlog::logger> spdlog::async_logger::clone(std::string new_name)
88 | {
89 |     auto cloned = std::make_shared<spdlog::async_logger>(*this);
90 |     cloned->name_ = std::move(new_name);
91 |     return cloned;
92 | }
93 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/async_logger.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | // Fast asynchronous logger.
 7 | // Uses pre allocated queue.
 8 | // Creates a single back thread to pop messages from the queue and log them.
 9 | //
10 | // Upon each log write the logger:
11 | //    1. Checks if its log level is enough to log the message
12 | //    2. Push a new copy of the message to a queue (or block the caller until
13 | //    space is available in the queue)
14 | // Upon destruction, logs all remaining messages in the queue before
15 | // destructing..
16 | 
17 | #include <spdlog/logger.h>
18 | 
19 | namespace spdlog {
20 | 
21 | // Async overflow policy - block by default.
22 | enum class async_overflow_policy
23 | {
24 |     block,         // Block until message can be enqueued
25 |     overrun_oldest // Discard oldest message in the queue if full when trying to
26 |                    // add new item.
27 | };
28 | // Forward declaration: this header only stores a weak_ptr to the pool.
29 | namespace details {
30 | class thread_pool;
31 | }
32 | // Logger whose sink_it_() / flush_() overrides hand the work to a details::thread_pool.
33 | class SPDLOG_API async_logger final : public std::enable_shared_from_this<async_logger>, public logger
34 | {
35 |     friend class details::thread_pool;
36 |     // NOTE(review): the friend declaration presumably lets the pool call the protected backend_* functions.
37 | public:
38 |     template<typename It>
39 |     async_logger(std::string logger_name, It begin, It end, std::weak_ptr<details::thread_pool> tp,
40 |         async_overflow_policy overflow_policy = async_overflow_policy::block)
41 |         : logger(std::move(logger_name), begin, end)
42 |         , thread_pool_(std::move(tp))
43 |         , overflow_policy_(overflow_policy)
44 |     {}
45 |     // Same as above, taking the sinks as an initializer list.
46 |     async_logger(std::string logger_name, sinks_init_list sinks_list, std::weak_ptr<details::thread_pool> tp,
47 |         async_overflow_policy overflow_policy = async_overflow_policy::block);
48 |     // Same as above, taking a single sink.
49 |     async_logger(std::string logger_name, sink_ptr single_sink, std::weak_ptr<details::thread_pool> tp,
50 |         async_overflow_policy overflow_policy = async_overflow_policy::block);
51 |     // Overrides logger::clone; takes the name for the returned logger.
52 |     std::shared_ptr<logger> clone(std::string new_name) override;
53 | 
54 | protected:
55 |     void sink_it_(const details::log_msg &msg) override;
56 |     void flush_() override;
57 |     void backend_sink_it_(const details::log_msg &incoming_log_msg);
58 |     void backend_flush_();
59 | 
60 | private:
61 |     std::weak_ptr<details::thread_pool> thread_pool_;
62 |     async_overflow_policy overflow_policy_;
63 | };
64 | } // namespace spdlog
65 | 
66 | #ifdef SPDLOG_HEADER_ONLY
67 | #include "async_logger-inl.h"
68 | #endif
69 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/cfg/argv.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | #include <spdlog/cfg/helpers.h>
 6 | #include <spdlog/details/registry.h>
 7 | 
 8 | //
 9 | // Init log levels using each argv entry that starts with "SPDLOG_LEVEL="
10 | //
11 | // set all loggers to debug level:
12 | // example.exe "SPDLOG_LEVEL=debug"
13 | 
14 | // set logger1 to trace level
15 | // example.exe "SPDLOG_LEVEL=logger1=trace"
16 | 
17 | // turn off all logging except for logger1 and logger2:
18 | // example.exe "SPDLOG_LEVEL=off,logger1=debug,logger2=info"
19 | 
20 | namespace spdlog {
21 | namespace cfg {
22 | 
23 | // search for SPDLOG_LEVEL= in the args and use it to init the levels
24 | inline void load_argv_levels(int argc, const char **argv)
25 | {
26 |     const std::string prefix{"SPDLOG_LEVEL="};
27 |     for (int idx = 1; idx < argc; ++idx) // index 0 is skipped
28 |     {
29 |         const std::string current{argv[idx]};
30 |         if (current.compare(0, prefix.size(), prefix) != 0)
31 |         {
32 |             continue; // argument does not start with "SPDLOG_LEVEL="
33 |         }
34 |         helpers::load_levels(current.substr(prefix.size())); // text after the prefix
35 |     }
36 | }
37 | 
38 | inline void load_argv_levels(int argc, char **argv)
39 | {
40 |     load_argv_levels(argc, const_cast<const char **>(argv)); // only adds const, then reuses the overload above
41 | }
42 | 
43 | } // namespace cfg
44 | } // namespace spdlog
45 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/cfg/env.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | #include <spdlog/cfg/helpers.h>
 6 | #include <spdlog/details/registry.h>
 7 | #include <spdlog/details/os.h>
 8 | 
 9 | //
10 | // Init log levels from the SPDLOG_LEVEL environment variable
11 | // Inspired from Rust's "env_logger" crate (https://crates.io/crates/env_logger).
12 | // Note - fallback to "info" level on unrecognized levels
13 | //
14 | // Examples:
15 | //
16 | // set global level to debug:
17 | // export SPDLOG_LEVEL=debug
18 | //
19 | // turn off all logging except for logger1:
20 | // export SPDLOG_LEVEL="off,logger1=debug"
21 | //
22 | 
23 | // turn off all logging except for logger1 and logger2:
24 | // export SPDLOG_LEVEL="off,logger1=debug,logger2=info"
25 | 
26 | namespace spdlog {
27 | namespace cfg {
28 | inline void load_env_levels()
29 | {
30 |     const auto level_spec = details::os::getenv("SPDLOG_LEVEL");
31 |     if (level_spec.empty() == false) // an empty lookup result leaves all levels untouched
32 |     {
33 |         helpers::load_levels(level_spec);
34 |     }
35 | }
36 | 
37 | } // namespace cfg
38 | } // namespace spdlog
39 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/cfg/helpers.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/common.h>
 7 | #include <unordered_map>
 8 | 
 9 | namespace spdlog {
10 | namespace cfg {
11 | namespace helpers {
12 | //
13 | // Init levels from given string
14 | //
15 | // Examples:
16 | //
17 | // set global level to debug: "debug"
18 | // turn off all logging except for logger1: "off,logger1=debug"
19 | // turn off all logging except for logger1 and logger2: "off,logger1=debug,logger2=info"
20 | //
21 | SPDLOG_API void load_levels(const std::string &txt);
22 | } // namespace helpers
23 | 
24 | } // namespace cfg
25 | } // namespace spdlog
26 | 
27 | #ifdef SPDLOG_HEADER_ONLY
28 | #include "helpers-inl.h"
29 | #endif // SPDLOG_HEADER_ONLY
30 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/common-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/common.h>
 8 | #endif
 9 | 
10 | namespace spdlog {
11 | namespace level {
12 | static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES; // level names, indexed by level_enum value
13 | 
14 | static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES; // short level names, indexed by level_enum value
15 | 
16 | SPDLOG_INLINE string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT
17 | {
18 |     return level_string_views[l]; // non-const reference into the table; l is not range-checked
19 | }
20 | 
21 | SPDLOG_INLINE const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT
22 | {
23 |     return short_level_names[l]; // pointer to the table entry; l is not range-checked
24 | }
25 | 
26 | SPDLOG_INLINE spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT
27 | {
28 |     // a name's position in level_string_views doubles as its level_enum value
29 |     const size_t known_levels = sizeof(level_string_views) / sizeof(level_string_views[0]);
30 |     for (size_t idx = 0; idx < known_levels; ++idx)
31 |     {
32 |         if (level_string_views[idx] == name)
33 |         {
34 |             return static_cast<level::level_enum>(idx);
35 |         }
36 |     }
37 |     // accept the abbreviated spellings "warn" and "err" as well
38 |     if (name == "warn")
39 |     {
40 |         return level::warn;
41 |     }
42 |     if (name == "err")
43 |     {
44 |         return level::err;
45 |     }
46 |     return level::off; // no match: callers receive level::off
47 | }
48 | } // namespace level
49 | 
50 | SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg)
51 |     : msg_(std::move(msg)) // the by-value argument is moved into the stored message
52 | {}
53 | 
54 | SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno)
55 | {
56 |     memory_buf_t outbuf;
57 |     fmt::format_system_error(outbuf, last_errno, msg); // fmt writes a message built from msg and last_errno into outbuf
58 |     msg_ = fmt::to_string(outbuf);                     // the formatted text becomes what() output
59 | }
60 | 
61 | SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT
62 | {
63 |     return msg_.c_str(); // points into msg_, so it is only valid while this exception object lives
64 | }
65 | 
66 | SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno)
67 | {
68 |     SPDLOG_THROW(spdlog_ex(msg, last_errno)); // SPDLOG_THROW is defined outside this file
69 | }
70 | 
71 | SPDLOG_INLINE void throw_spdlog_ex(std::string msg)
72 | {
73 |     SPDLOG_THROW(spdlog_ex(std::move(msg))); // msg is moved into the exception; SPDLOG_THROW is defined outside this file
74 | }
75 | 
76 | } // namespace spdlog
77 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/backtracer-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/details/backtracer.h>
 8 | #endif
 9 | namespace spdlog {
10 | namespace details {
11 | SPDLOG_INLINE backtracer::backtracer(const backtracer &other)
12 | {
13 |     std::lock_guard<std::mutex> lock(other.mutex_); // hold the source's mutex while its queue is copied
14 |     enabled_ = other.enabled();
15 |     messages_ = other.messages_;
16 | }
17 | 
18 | SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT
19 | {
20 |     std::lock_guard<std::mutex> lock(other.mutex_); // only the source is locked; this object is still being constructed
21 |     enabled_ = other.enabled();                     // other's flag is read here, not reset
22 |     messages_ = std::move(other.messages_);
23 | }
24 | 
25 | SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other)
26 | {
27 |     std::lock_guard<std::mutex> lock(mutex_); // other is a by-value parameter, so only *this is locked
28 |     enabled_ = other.enabled();
29 |     messages_ = std::move(other.messages_);
30 |     return *this;
31 | }
32 | 
33 | SPDLOG_INLINE void backtracer::enable(size_t size)
34 | {
35 |     std::lock_guard<std::mutex> lock{mutex_};
36 |     enabled_.store(true, std::memory_order_relaxed);
37 |     messages_ = circular_q<log_msg_buffer>{size}; // the previous queue is replaced by one constructed from size
38 | }
39 | 
40 | SPDLOG_INLINE void backtracer::disable()
41 | {
42 |     std::lock_guard<std::mutex> lock{mutex_};
43 |     enabled_.store(false, std::memory_order_relaxed); // only the flag changes; messages_ is left as-is
44 | }
45 | 
46 | SPDLOG_INLINE bool backtracer::enabled() const
47 | {
48 |     return enabled_.load(std::memory_order_relaxed); // atomic read; mutex_ is not taken
49 | }
50 | 
51 | SPDLOG_INLINE void backtracer::push_back(const log_msg &msg)
52 | {
53 |     std::lock_guard<std::mutex> lock{mutex_}; // enabled_ is not consulted in this function
54 |     messages_.push_back(log_msg_buffer{msg}); // stored as a log_msg_buffer built from msg
55 | }
56 | 
57 | // pop all items in the q and apply the given fun on each of them.
58 | SPDLOG_INLINE void backtracer::foreach_pop(std::function<void(const details::log_msg &)> fun)
59 | {
60 |     // mutex_ stays locked for the whole drain, including every call to fun
61 |     std::lock_guard<std::mutex> guard{mutex_};
62 |     for (; !messages_.empty(); messages_.pop_front())
63 |     {
64 |         const details::log_msg &current = messages_.front();
65 |         fun(current);
66 |     }
67 | }
68 | } // namespace details
69 | } // namespace spdlog
70 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/backtracer.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/log_msg_buffer.h>
 7 | #include <spdlog/details/circular_q.h>
 8 | 
 9 | #include <atomic>
10 | #include <mutex>
11 | #include <functional>
12 | 
13 | // Store log messages in circular buffer.
14 | // Useful for keeping debug data available in case an error/warning happens.
15 | 
16 | namespace spdlog {
17 | namespace details {
18 | class SPDLOG_API backtracer
19 | {
20 |     mutable std::mutex mutex_;            // mutable so the copy constructor can lock a const source
21 |     std::atomic<bool> enabled_{false};    // starts out disabled
22 |     circular_q<log_msg_buffer> messages_; // stored messages; accessed with a mutex held
23 | 
24 | public:
25 |     backtracer() = default;
26 |     backtracer(const backtracer &other);
27 | 
28 |     backtracer(backtracer &&other) SPDLOG_NOEXCEPT;
29 |     backtracer &operator=(backtracer other); // by-value parameter serves both copy and move assignment
30 | 
31 |     void enable(size_t size); // sets the flag and installs a new queue built from size
32 |     void disable();
33 |     bool enabled() const;
34 |     void push_back(const log_msg &msg);
35 | 
36 |     // pop all items in the q and apply the given fun on each of them.
37 |     void foreach_pop(std::function<void(const details::log_msg &)> fun);
38 | };
39 | 
40 | } // namespace details
41 | } // namespace spdlog
42 | 
43 | #ifdef SPDLOG_HEADER_ONLY
44 | #include "backtracer-inl.h"
45 | #endif


--------------------------------------------------------------------------------
/vulkan/spdlog/details/console_globals.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/null_mutex.h>
 7 | #include <mutex>
 8 | 
 9 | namespace spdlog {
10 | namespace details {
11 | 
12 | struct console_mutex
13 | {
14 |     using mutex_t = std::mutex;
15 |     static mutex_t &mutex()
16 |     {
17 |         static mutex_t s_mutex; // function-local static: every caller gets the same std::mutex
18 |         return s_mutex;
19 |     }
20 | };
21 | 
22 | struct console_nullmutex
23 | {
24 |     using mutex_t = null_mutex;
25 |     static mutex_t &mutex()
26 |     {
27 |         static mutex_t s_mutex; // same shape as console_mutex, but with the null_mutex type
28 |         return s_mutex;
29 |     }
30 | };
31 | } // namespace details
32 | } // namespace spdlog
33 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/file_helper.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/common.h>
 7 | #include <tuple>
 8 | 
 9 | namespace spdlog {
10 | namespace details {
11 | 
12 | // Helper class for file sinks.
13 | // When failing to open a file, retry several times(5) with a delay interval(10 ms).
14 | // Throw spdlog_ex exception on errors.
15 | 
16 | class SPDLOG_API file_helper
17 | {
18 | public:
19 |     explicit file_helper() = default;
20 | 
21 |     file_helper(const file_helper &) = delete; // non-copyable
22 |     file_helper &operator=(const file_helper &) = delete;
23 |     ~file_helper();
24 | 
25 |     void open(const filename_t &fname, bool truncate = false);
26 |     void reopen(bool truncate);
27 |     void flush();
28 |     void close();
29 |     void write(const memory_buf_t &buf);
30 |     size_t size() const;
31 |     const filename_t &filename() const;
32 | 
33 |     //
34 |     // return file path and its extension:
35 |     //
36 |     // "mylog.txt" => ("mylog", ".txt")
37 |     // "mylog" => ("mylog", "")
38 |     // "mylog." => ("mylog.", "")
39 |     // "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
40 |     //
41 |     // the starting dot in filenames is ignored (hidden files):
42 |     //
43 |     // ".mylog" => (".mylog", "")
44 |     // "my_folder/.mylog" => ("my_folder/.mylog", "")
45 |     // "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
46 |     static std::tuple<filename_t, filename_t> split_by_extension(const filename_t &fname);
47 | 
48 | private:
49 |     const int open_tries_ = 5;     // the "(5)" retry count from the class comment
50 |     const int open_interval_ = 10; // the "(10 ms)" delay from the class comment
51 |     std::FILE *fd_{nullptr};
52 |     filename_t filename_;
53 | };
54 | } // namespace details
55 | } // namespace spdlog
56 | 
57 | #ifdef SPDLOG_HEADER_ONLY
58 | #include "file_helper-inl.h"
59 | #endif
60 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/log_msg-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/details/log_msg.h>
 8 | #endif
 9 | 
10 | #include <spdlog/details/os.h>
11 | 
12 | namespace spdlog {
13 | namespace details {
14 | 
15 | SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time, spdlog::source_loc loc, string_view_t a_logger_name,
16 |     spdlog::level::level_enum lvl, spdlog::string_view_t msg)
17 |     : logger_name(a_logger_name) // the view is stored as-is; the text is not copied
18 |     , level(lvl)
19 |     , time(log_time)
20 | #ifndef SPDLOG_NO_THREAD_ID
21 |     , thread_id(os::thread_id()) // omitted when SPDLOG_NO_THREAD_ID is defined; the member then keeps its default
22 | #endif
23 |     , source(loc)
24 |     , payload(msg) // also stored as a view only
25 | {}
26 | 
27 | SPDLOG_INLINE log_msg::log_msg(
28 |     spdlog::source_loc loc, string_view_t a_logger_name, spdlog::level::level_enum lvl, spdlog::string_view_t msg)
29 |     : log_msg(os::now(), loc, a_logger_name, lvl, msg) // delegates, using os::now() as the timestamp
30 | {}
31 | 
32 | SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name, spdlog::level::level_enum lvl, spdlog::string_view_t msg)
33 |     : log_msg(os::now(), source_loc{}, a_logger_name, lvl, msg) // delegates with os::now() and a default source_loc
34 | {}
35 | 
36 | } // namespace details
37 | } // namespace spdlog
38 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/log_msg.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/common.h>
 7 | #include <string>
 8 | 
 9 | namespace spdlog {
10 | namespace details {
11 | struct SPDLOG_API log_msg
12 | {
13 |     log_msg() = default;
14 |     log_msg(log_clock::time_point log_time, source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg);
15 |     log_msg(source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg);
16 |     log_msg(string_view_t logger_name, level::level_enum lvl, string_view_t msg);
17 |     log_msg(const log_msg &other) = default;
18 | 
19 |     string_view_t logger_name; // a view: the struct does not hold the characters itself
20 |     level::level_enum level{level::off};
21 |     log_clock::time_point time;
22 |     size_t thread_id{0};
23 | 
24 |     // wrapping the formatted text with color (updated by pattern_formatter).
25 |     mutable size_t color_range_start{0}; // mutable so it can be set through a const log_msg &
26 |     mutable size_t color_range_end{0};
27 | 
28 |     source_loc source;
29 |     string_view_t payload; // a view as well
30 | };
31 | } // namespace details
32 | } // namespace spdlog
33 | 
34 | #ifdef SPDLOG_HEADER_ONLY
35 | #include "log_msg-inl.h"
36 | #endif
37 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/log_msg_buffer-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/details/log_msg_buffer.h>
 8 | #endif
 9 | 
10 | namespace spdlog {
11 | namespace details {
12 | 
13 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg &orig_msg)
14 |     : log_msg{orig_msg}
15 | {
16 |     buffer.append(logger_name.begin(), logger_name.end()); // logger name goes in first...
17 |     buffer.append(payload.begin(), payload.end());         // ...and the payload right after it
18 |     update_string_views();                                 // re-point both views at the owned buffer
19 | }
20 | 
21 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg_buffer &other)
22 |     : log_msg{other} // base copy: the views still refer to other's text at this point
23 | {
24 |     buffer.append(logger_name.begin(), logger_name.end());
25 |     buffer.append(payload.begin(), payload.end());
26 |     update_string_views(); // switch the views over to this object's own buffer
27 | }
28 | 
29 | SPDLOG_INLINE log_msg_buffer::log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT : log_msg{other}, buffer{std::move(other.buffer)}
30 | {
31 |     update_string_views(); // views copied from other are rebased onto this object's buffer
32 | }
33 | 
34 | SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(const log_msg_buffer &other)
35 | {
36 |     if (this == &other) return *this; // self-assignment: clear() below would drop the only copy of the text
37 |     log_msg::operator=(other);
38 |     buffer.clear();
39 |     buffer.append(other.buffer.data(), other.buffer.data() + other.buffer.size());
40 |     update_string_views(); // views copied from other must point into this object's buffer
41 |     return *this;
42 | }
43 | SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT
44 | {
45 |     log_msg::operator=(other); // other is an lvalue expression here, so the base part is copy-assigned
46 |     buffer = std::move(other.buffer);
47 |     update_string_views(); // rebase the copied views onto the buffer just taken over
48 |     return *this;
49 | }
50 | 
51 | SPDLOG_INLINE void log_msg_buffer::update_string_views()
52 | {
53 |     logger_name = string_view_t{buffer.data(), logger_name.size()}; // same length, new data pointer
54 |     payload = string_view_t{buffer.data() + logger_name.size(), payload.size()}; // payload starts right after the name
55 | }
56 | 
57 | } // namespace details
58 | } // namespace spdlog
59 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/log_msg_buffer.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/log_msg.h>
 7 | 
 8 | namespace spdlog {
 9 | namespace details {
10 | 
11 | // Extend log_msg with internal buffer to store its payload.
12 | // This is needed since log_msg holds string_views that point to stack data.
13 | 
14 | class SPDLOG_API log_msg_buffer : public log_msg
15 | {
16 |     memory_buf_t buffer;        // owned storage: logger name followed by payload
17 |     void update_string_views(); // points the inherited views at buffer
18 | 
19 | public:
20 |     log_msg_buffer() = default;
21 |     explicit log_msg_buffer(const log_msg &orig_msg); // copies the text referenced by orig_msg's views
22 |     log_msg_buffer(const log_msg_buffer &other);
23 |     log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
24 |     log_msg_buffer &operator=(const log_msg_buffer &other);
25 |     log_msg_buffer &operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
26 | };
27 | 
28 | } // namespace details
29 | } // namespace spdlog
30 | 
31 | #ifdef SPDLOG_HEADER_ONLY
32 | #include "log_msg_buffer-inl.h"
33 | #endif
34 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/null_mutex.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <atomic>
 7 | #include <utility>
 8 | // null, no cost dummy "mutex" and dummy "atomic" int
 9 | 
10 | namespace spdlog {
11 | namespace details {
12 | struct null_mutex
13 | {
14 |     void lock() const {}   // does nothing
15 |     void unlock() const {} // does nothing
16 |     bool try_lock() const
17 |     {
18 |         return true; // always reports success
19 |     }
20 | };
21 | 
22 | struct null_atomic_int
23 | {
24 |     int value{0}; // zero by default, so load() on a default-constructed object reads a defined value
25 |     null_atomic_int() = default;
26 | 
27 |     explicit null_atomic_int(int new_value)
28 |         : value(new_value)
29 |     {}
30 | 
31 |     int load(std::memory_order = std::memory_order_relaxed) const // the memory order argument is ignored
32 |     {
33 |         return value;
34 |     }
35 | 
36 |     void store(int new_value, std::memory_order = std::memory_order_relaxed) // plain assignment
37 |     {
38 |         value = new_value;
39 |     }
40 | 
41 |     int exchange(int new_value, std::memory_order = std::memory_order_relaxed)
42 |     {
43 |         std::swap(new_value, value);
44 |         return new_value; // return value before the call
45 |     }
46 | };
47 | 
48 | } // namespace details
49 | } // namespace spdlog
50 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/periodic_worker-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/details/periodic_worker.h>
 8 | #endif
 9 | 
10 | namespace spdlog {
11 | namespace details {
12 | 
13 | SPDLOG_INLINE periodic_worker::periodic_worker(const std::function<void()> &callback_fun, std::chrono::seconds interval)
14 | {
15 |     active_ = (interval > std::chrono::seconds::zero());
16 |     if (!active_)
17 |     {
18 |         return; // zero or negative interval: no thread is started
19 |     }
20 | 
21 |     worker_thread_ = std::thread([this, callback_fun, interval]() { // callback_fun and interval are captured by copy
22 |         for (;;)
23 |         {
24 |             std::unique_lock<std::mutex> lock(this->mutex_);
25 |             if (this->cv_.wait_for(lock, interval, [this] { return !this->active_; }))
26 |             {
27 |                 return; // active_ == false, so exit this thread
28 |             }
29 |             callback_fun(); // the wait timed out with active_ still true; lock is still held during the call
30 |         }
31 |     });
32 | }
33 | 
34 | // stop the worker thread and join it
35 | SPDLOG_INLINE periodic_worker::~periodic_worker()
36 | {
37 |     if (worker_thread_.joinable()) // not joinable when the constructor started no thread
38 |     {
39 |         {
40 |             std::lock_guard<std::mutex> lock(mutex_);
41 |             active_ = false; // written under mutex_, the same mutex the worker's wait predicate runs under
42 |         }
43 |         cv_.notify_one(); // signalled after the lock above has been released
44 |         worker_thread_.join();
45 |     }
46 | }
47 | 
48 | } // namespace details
49 | } // namespace spdlog
50 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/periodic_worker.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | // periodic worker thread - periodically executes the given callback function.
 7 | //
 8 | // RAII over the owned thread:
 9 | //    creates the thread on construction.
10 | //    stops and joins the thread on destruction (if the thread is executing a callback, wait for it to finish first).
11 | 
12 | #include <chrono>
13 | #include <condition_variable>
14 | #include <functional>
15 | #include <mutex>
16 | #include <thread>
17 | namespace spdlog {
18 | namespace details {
19 | 
20 | class SPDLOG_API periodic_worker
21 | {
22 | public:
23 |     periodic_worker(const std::function<void()> &callback_fun, std::chrono::seconds interval);
24 |     periodic_worker(const periodic_worker &) = delete; // non-copyable
25 |     periodic_worker &operator=(const periodic_worker &) = delete;
26 |     // stop the worker thread and join it
27 |     ~periodic_worker();
28 | 
29 | private:
30 |     bool active_;               // plain bool; the worker reads it and the destructor clears it under mutex_
31 |     std::thread worker_thread_; // left default-constructed when interval is not positive
32 |     std::mutex mutex_;
33 |     std::condition_variable cv_; // used to wake the worker before its interval elapses
34 | };
35 | } // namespace details
36 | } // namespace spdlog
37 | 
38 | #ifdef SPDLOG_HEADER_ONLY
39 | #include "periodic_worker-inl.h"
40 | #endif
41 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/details/synchronous_factory.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include "registry.h"
 7 | 
 8 | namespace spdlog {
 9 | 
10 | // Default logger factory - creates synchronous loggers
11 | class logger;
12 | 
13 | struct synchronous_factory
14 | {
15 |     template<typename Sink, typename... SinkArgs>
16 |     static std::shared_ptr<spdlog::logger> create(std::string logger_name, SinkArgs &&...args)
17 |     {
18 |         auto sink = std::make_shared<Sink>(std::forward<SinkArgs>(args)...); // Sink is constructed from the forwarded args
19 |         auto new_logger = std::make_shared<spdlog::logger>(std::move(logger_name), std::move(sink));
20 |         details::registry::instance().initialize_logger(new_logger); // hand the logger to the registry before returning it
21 |         return new_logger;
22 |     }
23 | };
24 | } // namespace spdlog


--------------------------------------------------------------------------------
/vulkan/spdlog/details/windows_include.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifndef NOMINMAX
 4 | #define NOMINMAX // prevent windows redefining min/max
 5 | #endif
 6 | 
 7 | #ifndef WIN32_LEAN_AND_MEAN
 8 | #define WIN32_LEAN_AND_MEAN
 9 | #endif
10 | 
11 | #include <windows.h>
12 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/fmt/bundled/LICENSE.rst:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 - present, Victor Zverovich
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
22 | --- Optional exception to the license ---
23 | 
24 | As an exception, if, as a result of your compiling your source code, portions
25 | of this Software are embedded into a machine-executable object form of such
26 | source code, you may redistribute such embedded portions in such object form
27 | without including the above copyright and permission notices.
28 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/fmt/bundled/posix.h:
--------------------------------------------------------------------------------
1 | #include "os.h"
2 | #warning "fmt/posix.h is deprecated; use fmt/os.h instead"
3 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/fmt/chrono.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Copyright(c) 2016 Gabi Melman.
 3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 4 | //
 5 | 
 6 | #pragma once
 7 | //
 8 | // include bundled or external copy of fmtlib's chrono support
 9 | //
10 | 
11 | #if !defined(SPDLOG_FMT_EXTERNAL)
12 | #ifdef SPDLOG_HEADER_ONLY
13 | #ifndef FMT_HEADER_ONLY
14 | #define FMT_HEADER_ONLY
15 | #endif
16 | #endif
17 | #include <spdlog/fmt/bundled/chrono.h>
18 | #else
19 | #include <fmt/chrono.h>
20 | #endif
21 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/fmt/fmt.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Copyright(c) 2016-2018 Gabi Melman.
 3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 4 | //
 5 | 
 6 | #pragma once
 7 | 
 8 | //
 9 | // Include a bundled header-only copy of fmtlib or an external one.
10 | // By default spdlog includes its own copy.
11 | //
12 | 
13 | #if !defined(SPDLOG_FMT_EXTERNAL)
14 | #if !defined(SPDLOG_COMPILED_LIB) && !defined(FMT_HEADER_ONLY)
15 | #define FMT_HEADER_ONLY
16 | #endif
17 | #ifndef FMT_USE_WINDOWS_H
18 | #define FMT_USE_WINDOWS_H 0
19 | #endif
20 | // enable the 'n' flag for backward compatibility with fmt 6.x
21 | #define FMT_DEPRECATED_N_SPECIFIER
22 | #include <spdlog/fmt/bundled/core.h>
23 | #include <spdlog/fmt/bundled/format.h>
24 | #else // SPDLOG_FMT_EXTERNAL is defined - use external fmtlib
25 | #include <fmt/core.h>
26 | #include <fmt/format.h>
27 | #endif


--------------------------------------------------------------------------------
/vulkan/spdlog/fmt/ostr.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Copyright(c) 2016 Gabi Melman.
 3 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 4 | //
 5 | 
 6 | #pragma once
 7 | //
 8 | // include bundled or external copy of fmtlib's ostream support
 9 | //
10 | 
11 | #if !defined(SPDLOG_FMT_EXTERNAL)
12 | #ifdef SPDLOG_HEADER_ONLY
13 | #ifndef FMT_HEADER_ONLY
14 | #define FMT_HEADER_ONLY
15 | #endif
16 | #endif
17 | #include <spdlog/fmt/bundled/ostream.h>
18 | #else
19 | #include <fmt/ostream.h>
20 | #endif
21 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/formatter.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/fmt/fmt.h>
 7 | #include <spdlog/details/log_msg.h>
 8 | 
 9 | namespace spdlog {
10 | 
11 | class formatter
12 | {
13 | public:
14 |     virtual ~formatter() = default; // virtual: instances are held through std::unique_ptr<formatter>
15 |     virtual void format(const details::log_msg &msg, memory_buf_t &dest) = 0; // output goes into dest
16 |     virtual std::unique_ptr<formatter> clone() const = 0;
17 | };
18 | } // namespace spdlog
19 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/fwd.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | namespace spdlog {
 7 | class logger;
 8 | class formatter;
 9 | 
10 | namespace sinks {
11 | class sink;
12 | }
13 | 
14 | } // namespace spdlog
15 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/base_sink-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/sinks/base_sink.h>
 8 | #endif
 9 | 
10 | #include <spdlog/common.h>
11 | #include <spdlog/pattern_formatter.h>
12 | 
13 | #include <memory>
14 | 
15 | template<typename Mutex>
16 | SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::base_sink()
17 |     : formatter_{details::make_unique<spdlog::pattern_formatter>()} // starts with a default-constructed pattern_formatter
18 | {}
19 | 
20 | template<typename Mutex>
21 | SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::base_sink(std::unique_ptr<spdlog::formatter> formatter)
22 |     : formatter_{std::move(formatter)} // takes ownership of the supplied formatter
23 | {}
24 | 
25 | template<typename Mutex>
26 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::log(const details::log_msg &msg)
27 | { // public entry point: hold mutex_ while the concrete sink's sink_it_() handles msg
28 |     std::lock_guard<Mutex> guard{mutex_};
29 |     this->sink_it_(msg);
30 | }
31 | 
32 | template<typename Mutex>
33 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::flush()
34 | { // public entry point: hold mutex_ while the concrete sink's flush_() runs
35 |     std::lock_guard<Mutex> guard{mutex_};
36 |     this->flush_();
37 | }
38 | 
39 | template<typename Mutex>
40 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::set_pattern(const std::string &pattern)
41 | { // locked wrapper around the overridable set_pattern_()
42 |     std::lock_guard<Mutex> guard{mutex_};
43 |     this->set_pattern_(pattern);
44 | }
45 | 
46 | template<typename Mutex>
47 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::set_formatter(std::unique_ptr<spdlog::formatter> sink_formatter)
48 | { // locked wrapper around the overridable set_formatter_(); ownership is passed straight through
49 |     std::lock_guard<Mutex> guard{mutex_};
50 |     this->set_formatter_(std::move(sink_formatter));
51 | }
52 | 
53 | template<typename Mutex>
54 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::set_pattern_(const std::string &pattern)
55 | { // takes no lock itself: wrap the pattern in a pattern_formatter and install it via set_formatter_()
56 |     this->set_formatter_(details::make_unique<spdlog::pattern_formatter>(pattern));
57 | }
58 | 
59 | template<typename Mutex>
60 | void SPDLOG_INLINE spdlog::sinks::base_sink<Mutex>::set_formatter_(std::unique_ptr<spdlog::formatter> sink_formatter)
61 | { // takes no lock itself: replace the stored formatter with the one handed in
62 |     this->formatter_ = std::move(sink_formatter);
63 | }
64 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/base_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | //
 6 | // Base sink templated over a mutex (either dummy or real).
 7 | // Concrete implementations should override the sink_it_() and flush_() methods.
 8 | // Locking is taken care of in this class - no locking is needed by the
 9 | // implementers.
10 | //
11 | 
12 | #include <spdlog/common.h>
13 | #include <spdlog/details/log_msg.h>
14 | #include <spdlog/sinks/sink.h>
15 | 
16 | namespace spdlog {
17 | namespace sinks {
18 | template<typename Mutex>
19 | class base_sink : public sink
20 | {
21 | public:
22 |     base_sink(); // installs a default-constructed pattern_formatter
23 |     explicit base_sink(std::unique_ptr<spdlog::formatter> formatter); // takes ownership of the given formatter
24 |     ~base_sink() override = default;
25 | 
26 |     base_sink(const base_sink &) = delete; // sinks are neither copyable nor movable
27 |     base_sink(base_sink &&) = delete;
28 | 
29 |     base_sink &operator=(const base_sink &) = delete;
30 |     base_sink &operator=(base_sink &&) = delete;
31 | 
32 |     void log(const details::log_msg &msg) final; // locks mutex_, then calls sink_it_()
33 |     void flush() final; // locks mutex_, then calls flush_()
34 |     void set_pattern(const std::string &pattern) final; // locks mutex_, then calls set_pattern_()
35 |     void set_formatter(std::unique_ptr<spdlog::formatter> sink_formatter) final; // locks mutex_, then calls set_formatter_()
36 | 
37 | protected:
38 |     // formatter used by derived sinks to render messages
39 |     std::unique_ptr<spdlog::formatter> formatter_;
40 |     Mutex mutex_; // taken by the public final methods above before the protected hooks run
41 | 
42 |     virtual void sink_it_(const details::log_msg &msg) = 0; // hook: handle one message (called with mutex_ held by log())
43 |     virtual void flush_() = 0; // hook: flush the target (called with mutex_ held by flush())
44 |     virtual void set_pattern_(const std::string &pattern); // default: build a pattern_formatter and pass it to set_formatter_()
45 |     virtual void set_formatter_(std::unique_ptr<spdlog::formatter> sink_formatter); // default: replace formatter_
46 | };
47 | } // namespace sinks
48 | } // namespace spdlog
49 | 
50 | #ifdef SPDLOG_HEADER_ONLY
51 | #include "base_sink-inl.h"
52 | #endif
53 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/basic_file_sink-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/sinks/basic_file_sink.h>
 8 | #endif
 9 | 
10 | #include <spdlog/common.h>
11 | #include <spdlog/details/os.h>
12 | 
13 | namespace spdlog {
14 | namespace sinks {
15 | 
16 | template<typename Mutex>
17 | SPDLOG_INLINE basic_file_sink<Mutex>::basic_file_sink(const filename_t &filename, bool truncate)
18 | { // the file is opened during construction; both arguments go to file_helper::open unchanged
19 |     this->file_helper_.open(filename, truncate);
20 | }
21 | 
22 | template<typename Mutex>
23 | SPDLOG_INLINE const filename_t &basic_file_sink<Mutex>::filename() const
24 | { // forwards the reference returned by the underlying file_helper
25 |     return this->file_helper_.filename();
26 | }
27 | 
28 | template<typename Mutex>
29 | SPDLOG_INLINE void basic_file_sink<Mutex>::sink_it_(const details::log_msg &msg)
30 | { // render msg with the sink's formatter, then pass the rendered buffer to file_helper_
31 |     memory_buf_t rendered;
32 |     this->formatter_->format(msg, rendered);
33 |     this->file_helper_.write(rendered);
34 | }
35 | 
36 | template<typename Mutex>
37 | SPDLOG_INLINE void basic_file_sink<Mutex>::flush_()
38 | { // flushing is delegated entirely to file_helper_
39 |     this->file_helper_.flush();
40 | }
41 | 
42 | } // namespace sinks
43 | } // namespace spdlog
44 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/basic_file_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/file_helper.h>
 7 | #include <spdlog/details/null_mutex.h>
 8 | #include <spdlog/sinks/base_sink.h>
 9 | #include <spdlog/details/synchronous_factory.h>
10 | 
11 | #include <mutex>
12 | #include <string>
13 | 
14 | namespace spdlog {
15 | namespace sinks {
16 | /*
17 |  * Trivial file sink with single file as target
18 |  */
19 | template<typename Mutex>
20 | class basic_file_sink final : public base_sink<Mutex>
21 | {
22 | public:
23 |     explicit basic_file_sink(const filename_t &filename, bool truncate = false); // opens the file via file_helper_.open(filename, truncate)
24 |     const filename_t &filename() const; // name as reported by file_helper_
25 | 
26 | protected:
27 |     void sink_it_(const details::log_msg &msg) override; // formats msg and writes the result through file_helper_
28 |     void flush_() override; // forwards to file_helper_.flush()
29 | 
30 | private:
31 |     details::file_helper file_helper_; // performs the actual open / write / flush
32 | };
33 | 
34 | using basic_file_sink_mt = basic_file_sink<std::mutex>;
35 | using basic_file_sink_st = basic_file_sink<details::null_mutex>;
36 | 
37 | } // namespace sinks
38 | 
39 | //
40 | // factory functions
41 | //
42 | template<typename Factory = spdlog::synchronous_factory> // Factory::create<Sink>(name, args...) builds the logger
43 | inline std::shared_ptr<logger> basic_logger_mt(const std::string &logger_name, const filename_t &filename, bool truncate = false)
44 | {
45 |     return Factory::template create<sinks::basic_file_sink_mt>(logger_name, filename, truncate); // sink variant guarded by std::mutex
46 | }
47 | 
48 | template<typename Factory = spdlog::synchronous_factory> // Factory::create<Sink>(name, args...) builds the logger
49 | inline std::shared_ptr<logger> basic_logger_st(const std::string &logger_name, const filename_t &filename, bool truncate = false)
50 | {
51 |     return Factory::template create<sinks::basic_file_sink_st>(logger_name, filename, truncate); // sink variant using details::null_mutex
52 | }
53 | 
54 | } // namespace spdlog
55 | 
56 | #ifdef SPDLOG_HEADER_ONLY
57 | #include "basic_file_sink-inl.h"
58 | #endif


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/dist_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include "base_sink.h"
 7 | #include <spdlog/details/log_msg.h>
 8 | #include <spdlog/details/null_mutex.h>
 9 | #include <spdlog/pattern_formatter.h>
10 | 
11 | #include <algorithm>
12 | #include <memory>
13 | #include <mutex>
14 | #include <vector>
15 | 
16 | // Distribution sink (mux). Stores a vector of sinks which get called when log
17 | // is called
18 | 
19 | namespace spdlog {
20 | namespace sinks {
21 | 
22 | template<typename Mutex>
23 | class dist_sink : public base_sink<Mutex> // forwards every record, flush and formatter change to a list of child sinks
24 | {
25 | public:
26 |     dist_sink() = default;
27 |     explicit dist_sink(std::vector<std::shared_ptr<sink>> sinks)
28 |         : sinks_(std::move(sinks)) // the by-value argument is already our private copy: move it instead of copying again
29 |     {}
30 | 
31 |     dist_sink(const dist_sink &) = delete;
32 |     dist_sink &operator=(const dist_sink &) = delete;
33 | 
34 |     void add_sink(std::shared_ptr<sink> sink) // appends a child sink while holding mutex_
35 |     {
36 |         std::lock_guard<Mutex> lock(base_sink<Mutex>::mutex_);
37 |         sinks_.push_back(std::move(sink)); // move: no extra shared_ptr reference-count increment/decrement
38 |     }
39 | 
40 |     void remove_sink(std::shared_ptr<sink> sink) // removes every entry that compares equal to `sink`
41 |     {
42 |         std::lock_guard<Mutex> lock(base_sink<Mutex>::mutex_);
43 |         sinks_.erase(std::remove(sinks_.begin(), sinks_.end(), sink), sinks_.end());
44 |     }
45 | 
46 |     void set_sinks(std::vector<std::shared_ptr<sink>> sinks) // replaces the whole child list while holding mutex_
47 |     {
48 |         std::lock_guard<Mutex> lock(base_sink<Mutex>::mutex_);
49 |         sinks_ = std::move(sinks);
50 |     }
51 | 
52 |     std::vector<std::shared_ptr<sink>> &sinks() // NOTE: returns the list without taking mutex_; the caller must synchronize
53 |     {
54 |         return sinks_;
55 |     }
56 | 
57 | protected:
58 |     void sink_it_(const details::log_msg &msg) override // called by base_sink::log() with mutex_ held
59 |     {
60 |         for (auto &sink : sinks_)
61 |         {
62 |             if (sink->should_log(msg.level)) // each child applies its own level filter
63 |             {
64 |                 sink->log(msg);
65 |             }
66 |         }
67 |     }
68 | 
69 |     void flush_() override // called by base_sink::flush() with mutex_ held
70 |     {
71 |         for (auto &sink : sinks_)
72 |         {
73 |             sink->flush();
74 |         }
75 |     }
76 | 
77 |     void set_pattern_(const std::string &pattern) override
78 |     {
79 |         set_formatter_(details::make_unique<spdlog::pattern_formatter>(pattern));
80 |     }
81 | 
82 |     void set_formatter_(std::unique_ptr<spdlog::formatter> sink_formatter) override
83 |     {
84 |         base_sink<Mutex>::formatter_ = std::move(sink_formatter);
85 |         for (auto &sink : sinks_)
86 |         {
87 |             sink->set_formatter(base_sink<Mutex>::formatter_->clone()); // every child receives its own clone
88 |         }
89 |     }
90 |     std::vector<std::shared_ptr<sink>> sinks_; // child sinks, in insertion order
91 | };
92 | 
93 | using dist_sink_mt = dist_sink<std::mutex>;
94 | using dist_sink_st = dist_sink<details::null_mutex>;
95 | 
96 | } // namespace sinks
97 | } // namespace spdlog
98 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/msvc_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2016 Alexander Dalshov.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #if defined(_WIN32)
 7 | 
 8 | #include <spdlog/details/null_mutex.h>
 9 | #include <spdlog/sinks/base_sink.h>
10 | 
11 | #include <mutex>
12 | #include <string>
13 | 
14 | 
15 | // Avoid including windows.h (https://stackoverflow.com/a/30741042)
16 | extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA(const char *lpOutputString);
17 | 
18 | namespace spdlog {
19 | namespace sinks {
20 | /*
21 |  * MSVC sink (logging using OutputDebugStringA)
22 |  */
23 | template<typename Mutex>
24 | class msvc_sink : public base_sink<Mutex> // sends each formatted record to OutputDebugStringA
25 | {
26 | public:
27 |     msvc_sink() = default;
28 | protected:
29 |     void sink_it_(const details::log_msg &msg) override
30 |     {
31 |         memory_buf_t formatted;
32 |         base_sink<Mutex>::formatter_->format(msg, formatted);
33 |         formatted.push_back('\0'); // OutputDebugStringA needs a NUL-terminated string; terminate in place
34 |         OutputDebugStringA(formatted.data()); // no temporary std::string is built any more
35 |     }
36 | 
37 |     void flush_() override {} // nothing is buffered by this sink
38 | };
39 | 
40 | using msvc_sink_mt = msvc_sink<std::mutex>;
41 | using msvc_sink_st = msvc_sink<details::null_mutex>;
42 | 
43 | using windebug_sink_mt = msvc_sink_mt;
44 | using windebug_sink_st = msvc_sink_st;
45 | 
46 | } // namespace sinks
47 | } // namespace spdlog
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/null_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/null_mutex.h>
 7 | #include <spdlog/sinks/base_sink.h>
 8 | #include <spdlog/details/synchronous_factory.h>
 9 | 
10 | #include <mutex>
11 | 
12 | namespace spdlog {
13 | namespace sinks {
14 | 
15 | template<typename Mutex>
16 | class null_sink : public base_sink<Mutex> // sink whose hooks do nothing
17 | {
18 | protected:
19 |     void sink_it_(const details::log_msg &) override {} // the message is ignored
20 |     void flush_() override {} // nothing to flush
21 | };
22 | 
23 | using null_sink_mt = null_sink<details::null_mutex>;
24 | using null_sink_st = null_sink<details::null_mutex>;
25 | 
26 | } // namespace sinks
27 | 
28 | template<typename Factory = spdlog::synchronous_factory>
29 | inline std::shared_ptr<logger> null_logger_mt(const std::string &logger_name)
30 | { // create the logger through Factory, then switch its level to off before handing it out
31 |     auto silenced = Factory::template create<sinks::null_sink_mt>(logger_name);
32 |     silenced->set_level(level::off);
33 |     return silenced;
34 | }
35 | 
36 | template<typename Factory = spdlog::synchronous_factory>
37 | inline std::shared_ptr<logger> null_logger_st(const std::string &logger_name)
38 | { // create the logger through Factory, then switch its level to off before handing it out
39 |     auto silenced = Factory::template create<sinks::null_sink_st>(logger_name);
40 |     silenced->set_level(level::off);
41 |     return silenced;
42 | }
43 | 
44 | } // namespace spdlog
45 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/ostream_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/null_mutex.h>
 7 | #include <spdlog/sinks/base_sink.h>
 8 | 
 9 | #include <mutex>
10 | #include <ostream>
11 | 
12 | namespace spdlog {
13 | namespace sinks {
14 | template<typename Mutex>
15 | class ostream_sink final : public base_sink<Mutex> // writes formatted records to a std::ostream held by reference (not owned)
16 | {
17 | public:
18 |     explicit ostream_sink(std::ostream &os, bool force_flush = false)
19 |         : ostream_(os)
20 |         , force_flush_(force_flush) // when true, the stream is flushed after every record
21 |     {}
22 |     ostream_sink(const ostream_sink &) = delete;
23 |     ostream_sink &operator=(const ostream_sink &) = delete;
24 | 
25 | protected:
26 |     void sink_it_(const details::log_msg &msg) override
27 |     {
28 |         memory_buf_t rendered;
29 |         this->formatter_->format(msg, rendered);
30 |         const auto n_bytes = static_cast<std::streamsize>(rendered.size());
31 |         ostream_.write(rendered.data(), n_bytes);
32 |         if (!force_flush_)
33 |             return; // no per-record flush requested
34 |         ostream_.flush();
35 |     }
36 | 
37 |     void flush_() override
38 |     { // explicit flush request from base_sink::flush()
39 |         ostream_.flush();
40 |     }
41 | 
42 |     std::ostream &ostream_;
43 |     bool force_flush_;
44 | };
45 | 
46 | using ostream_sink_mt = ostream_sink<std::mutex>;
47 | using ostream_sink_st = ostream_sink<details::null_mutex>;
48 | 
49 | } // namespace sinks
50 | } // namespace spdlog
51 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/ringbuffer_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include "spdlog/sinks/base_sink.h"
 7 | #include "spdlog/details/circular_q.h"
 8 | #include "spdlog/details/log_msg_buffer.h"
 9 | #include "spdlog/details/null_mutex.h"
10 | 
11 | #include <mutex>
12 | #include <string>
13 | #include <vector>
14 | 
15 | namespace spdlog {
16 | namespace sinks {
17 | /*
18 |  * Ring buffer sink
19 |  */
20 | template<typename Mutex>
21 | class ringbuffer_sink final : public base_sink<Mutex> // stores records in a details::circular_q built with n_items
22 | {
23 | public:
24 |     explicit ringbuffer_sink(size_t n_items)
25 |         : q_{n_items}
26 |     {}
27 | 
28 |     std::vector<details::log_msg_buffer> last_raw(size_t lim = 0) // copies of the last `lim` queued records; 0 means all of them
29 |     {
30 |         std::lock_guard<Mutex> guard(this->mutex_);
31 |         const size_t stored = q_.size();
32 |         const size_t wanted = (lim == 0 || lim > stored) ? stored : lim;
33 |         std::vector<details::log_msg_buffer> out;
34 |         out.reserve(wanted);
35 |         for (size_t idx = stored - wanted; idx != stored; ++idx)
36 |         {
37 |             out.push_back(q_.at(idx));
38 |         }
39 |         return out;
40 |     }
41 | 
42 |     std::vector<std::string> last_formatted(size_t lim = 0) // same selection as last_raw(), rendered with the sink's formatter
43 |     {
44 |         std::lock_guard<Mutex> guard(this->mutex_);
45 |         const size_t stored = q_.size();
46 |         const size_t wanted = (lim == 0 || lim > stored) ? stored : lim;
47 |         std::vector<std::string> out;
48 |         out.reserve(wanted);
49 |         for (size_t idx = stored - wanted; idx != stored; ++idx)
50 |         {
51 |             memory_buf_t rendered;
52 |             this->formatter_->format(q_.at(idx), rendered);
53 |             out.push_back(fmt::to_string(rendered));
54 |         }
55 |         return out;
56 |     }
57 | 
58 | protected:
59 |     void sink_it_(const details::log_msg &msg) override
60 |     { // store a log_msg_buffer built from msg; formatting is deferred to last_formatted()
61 |         q_.push_back(details::log_msg_buffer{msg});
62 |     }
63 |     void flush_() override {}
64 | 
65 | private:
66 |     details::circular_q<details::log_msg_buffer> q_;
67 | };
68 | 
69 | using ringbuffer_sink_mt = ringbuffer_sink<std::mutex>;
70 | using ringbuffer_sink_st = ringbuffer_sink<details::null_mutex>;
71 | 
72 | } // namespace sinks
73 | 
74 | } // namespace spdlog
75 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/rotating_file_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/sinks/base_sink.h>
 7 | #include <spdlog/details/file_helper.h>
 8 | #include <spdlog/details/null_mutex.h>
 9 | #include <spdlog/details/synchronous_factory.h>
10 | 
11 | #include <chrono>
12 | #include <mutex>
13 | #include <string>
14 | 
15 | namespace spdlog {
16 | namespace sinks {
17 | 
18 | //
19 | // Rotating file sink based on size
20 | //
21 | template<typename Mutex>
22 | class rotating_file_sink final : public base_sink<Mutex>
23 | {
24 | public:
25 |     rotating_file_sink(filename_t base_filename, std::size_t max_size, std::size_t max_files, bool rotate_on_open = false);
26 |     static filename_t calc_filename(const filename_t &filename, std::size_t index); // presumably builds the indexed name (log.N.txt) - defined in rotating_file_sink-inl.h
27 |     filename_t filename();
28 | 
29 | protected:
30 |     void sink_it_(const details::log_msg &msg) override;
31 |     void flush_() override;
32 | 
33 | private:
34 |     // Rotate files:
35 |     // log.txt -> log.1.txt
36 |     // log.1.txt -> log.2.txt
37 |     // log.2.txt -> log.3.txt
38 |     // log.3.txt -> delete
39 |     void rotate_();
40 | 
41 |     // Delete the target if it exists, and rename the src file to target.
42 |     // Returns true on success, false otherwise.
43 |     bool rename_file_(const filename_t &src_filename, const filename_t &target_filename);
44 | 
45 |     filename_t base_filename_;
46 |     std::size_t max_size_;
47 |     std::size_t max_files_;
48 |     std::size_t current_size_;
49 |     details::file_helper file_helper_;
50 | };
51 | 
52 | using rotating_file_sink_mt = rotating_file_sink<std::mutex>;
53 | using rotating_file_sink_st = rotating_file_sink<details::null_mutex>;
54 | 
55 | } // namespace sinks
56 | 
57 | //
58 | // factory functions
59 | //
60 | 
61 | template<typename Factory = spdlog::synchronous_factory> // Factory::create<Sink>(name, args...) builds the logger
62 | inline std::shared_ptr<logger> rotating_logger_mt(
63 |     const std::string &logger_name, const filename_t &filename, size_t max_file_size, size_t max_files, bool rotate_on_open = false)
64 | {
65 |     return Factory::template create<sinks::rotating_file_sink_mt>(logger_name, filename, max_file_size, max_files, rotate_on_open); // std::mutex variant
66 | }
67 | 
68 | template<typename Factory = spdlog::synchronous_factory> // Factory::create<Sink>(name, args...) builds the logger
69 | inline std::shared_ptr<logger> rotating_logger_st(
70 |     const std::string &logger_name, const filename_t &filename, size_t max_file_size, size_t max_files, bool rotate_on_open = false)
71 | {
72 |     return Factory::template create<sinks::rotating_file_sink_st>(logger_name, filename, max_file_size, max_files, rotate_on_open); // null_mutex variant
73 | }
74 | } // namespace spdlog
75 | 
76 | #ifdef SPDLOG_HEADER_ONLY
77 | #include "rotating_file_sink-inl.h"
78 | #endif
79 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/sink-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/sinks/sink.h>
 8 | #endif
 9 | 
10 | #include <spdlog/common.h>
11 | 
12 | SPDLOG_INLINE bool spdlog::sinks::sink::should_log(spdlog::level::level_enum msg_level) const
13 | {
14 |     return msg_level >= level_.load(std::memory_order_relaxed); // true when msg_level is at or above the sink's current level
15 | }
16 | 
17 | SPDLOG_INLINE void spdlog::sinks::sink::set_level(level::level_enum log_level)
18 | {
19 |     level_.store(log_level, std::memory_order_relaxed); // relaxed store: only the level value itself is published
20 | }
21 | 
22 | SPDLOG_INLINE spdlog::level::level_enum spdlog::sinks::sink::level() const
23 | {
24 |     return static_cast<spdlog::level::level_enum>(level_.load(std::memory_order_relaxed)); // convert the stored value back to the enum
25 | }
26 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/log_msg.h>
 7 | #include <spdlog/formatter.h>
 8 | 
 9 | namespace spdlog {
10 | 
11 | namespace sinks {
12 | class SPDLOG_API sink // abstract log destination with its own level threshold
13 | {
14 | public:
15 |     virtual ~sink() = default;
16 |     virtual void log(const details::log_msg &msg) = 0; // handle one message
17 |     virtual void flush() = 0; // flush whatever the implementation buffers
18 |     virtual void set_pattern(const std::string &pattern) = 0; // change formatting by pattern string
19 |     virtual void set_formatter(std::unique_ptr<spdlog::formatter> sink_formatter) = 0; // change formatting by handing over a formatter
20 | 
21 |     void set_level(level::level_enum log_level); // stores the new threshold in level_
22 |     level::level_enum level() const; // current threshold
23 |     bool should_log(level::level_enum msg_level) const; // msg_level >= current threshold
24 | 
25 | protected:
26 |     // sink log level - default is all
27 |     level_t level_{level::trace};
28 | };
29 | 
30 | } // namespace sinks
31 | } // namespace spdlog
32 | 
33 | #ifdef SPDLOG_HEADER_ONLY
34 | #include "sink-inl.h"
35 | #endif
36 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/stdout_color_sinks-inl.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef SPDLOG_HEADER_ONLY
 7 | #include <spdlog/sinks/stdout_color_sinks.h>
 8 | #endif
 9 | 
10 | #include <spdlog/logger.h>
11 | #include <spdlog/common.h>
12 | 
13 | namespace spdlog {
14 | 
15 | template<typename Factory> // the default Factory argument is given on the declaration in stdout_color_sinks.h
16 | SPDLOG_INLINE std::shared_ptr<logger> stdout_color_mt(const std::string &logger_name, color_mode mode)
17 | {
18 |     return Factory::template create<sinks::stdout_color_sink_mt>(logger_name, mode); // mode is forwarded to the sink constructor
19 | }
20 | 
21 | template<typename Factory> // the default Factory argument is given on the declaration in stdout_color_sinks.h
22 | SPDLOG_INLINE std::shared_ptr<logger> stdout_color_st(const std::string &logger_name, color_mode mode)
23 | {
24 |     return Factory::template create<sinks::stdout_color_sink_st>(logger_name, mode); // mode is forwarded to the sink constructor
25 | }
26 | 
27 | template<typename Factory> // the default Factory argument is given on the declaration in stdout_color_sinks.h
28 | SPDLOG_INLINE std::shared_ptr<logger> stderr_color_mt(const std::string &logger_name, color_mode mode)
29 | {
30 |     return Factory::template create<sinks::stderr_color_sink_mt>(logger_name, mode); // mode is forwarded to the sink constructor
31 | }
32 | 
33 | template<typename Factory> // the default Factory argument is given on the declaration in stdout_color_sinks.h
34 | SPDLOG_INLINE std::shared_ptr<logger> stderr_color_st(const std::string &logger_name, color_mode mode)
35 | {
36 |     return Factory::template create<sinks::stderr_color_sink_st>(logger_name, mode); // mode is forwarded to the sink constructor
37 | }
38 | } // namespace spdlog


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/stdout_color_sinks.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifdef _WIN32
 7 | #include <spdlog/sinks/wincolor_sink.h>
 8 | #else
 9 | #include <spdlog/sinks/ansicolor_sink.h>
10 | #endif
11 | 
12 | #include <spdlog/details/synchronous_factory.h>
13 | 
14 | namespace spdlog {
15 | namespace sinks {
16 | #ifdef _WIN32
17 | using stdout_color_sink_mt = wincolor_stdout_sink_mt;
18 | using stdout_color_sink_st = wincolor_stdout_sink_st;
19 | using stderr_color_sink_mt = wincolor_stderr_sink_mt;
20 | using stderr_color_sink_st = wincolor_stderr_sink_st;
21 | #else
22 | using stdout_color_sink_mt = ansicolor_stdout_sink_mt;
23 | using stdout_color_sink_st = ansicolor_stdout_sink_st;
24 | using stderr_color_sink_mt = ansicolor_stderr_sink_mt;
25 | using stderr_color_sink_st = ansicolor_stderr_sink_st;
26 | #endif
27 | } // namespace sinks
28 | 
29 | template<typename Factory = spdlog::synchronous_factory>
30 | std::shared_ptr<logger> stdout_color_mt(const std::string &logger_name, color_mode mode = color_mode::automatic);
31 | 
32 | template<typename Factory = spdlog::synchronous_factory>
33 | std::shared_ptr<logger> stdout_color_st(const std::string &logger_name, color_mode mode = color_mode::automatic);
34 | 
35 | template<typename Factory = spdlog::synchronous_factory>
36 | std::shared_ptr<logger> stderr_color_mt(const std::string &logger_name, color_mode mode = color_mode::automatic);
37 | 
38 | template<typename Factory = spdlog::synchronous_factory>
39 | std::shared_ptr<logger> stderr_color_st(const std::string &logger_name, color_mode mode = color_mode::automatic);
40 | 
41 | } // namespace spdlog
42 | 
43 | #ifdef SPDLOG_HEADER_ONLY
44 | #include "stdout_color_sinks-inl.h"
45 | #endif
46 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/stdout_sinks.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/console_globals.h>
 7 | #include <spdlog/details/synchronous_factory.h>
 8 | #include <spdlog/sinks/sink.h>
 9 | #include <cstdio>
10 | 
11 | #ifdef _WIN32
12 | #include <spdlog/details/windows_include.h>
13 | #endif
14 | 
15 | namespace spdlog {
16 | 
17 | namespace sinks {
18 | 
19 | template<typename ConsoleMutex>
20 | class stdout_sink_base : public sink // sink over a FILE*; implements the sink interface directly instead of via base_sink
21 | {
22 | public:
23 |     using mutex_t = typename ConsoleMutex::mutex_t;
24 |     explicit stdout_sink_base(FILE *file);
25 |     ~stdout_sink_base() override = default;
26 | 
27 |     stdout_sink_base(const stdout_sink_base &other) = delete; // neither copyable nor movable
28 |     stdout_sink_base(stdout_sink_base &&other) = delete;
29 | 
30 |     stdout_sink_base &operator=(const stdout_sink_base &other) = delete;
31 |     stdout_sink_base &operator=(stdout_sink_base &&other) = delete;
32 | 
33 |     void log(const details::log_msg &msg) override;
34 |     void flush() override;
35 |     void set_pattern(const std::string &pattern) override;
36 | 
37 |     void set_formatter(std::unique_ptr<spdlog::formatter> sink_formatter) override;
38 | 
39 | protected:
40 |     mutex_t &mutex_; // held by reference: the mutex object lives outside this sink
41 |     FILE *file_;
42 |     std::unique_ptr<spdlog::formatter> formatter_;
43 | #ifdef _WIN32
44 |     HANDLE handle_;    
45 | #endif // _WIN32
46 | };
47 | 
48 | template<typename ConsoleMutex>
49 | class stdout_sink : public stdout_sink_base<ConsoleMutex>
50 | {
51 | public:
52 |     stdout_sink(); // defined in stdout_sinks-inl.h (not shown here); presumably binds the base to stdout - confirm there
53 | };
54 | 
55 | template<typename ConsoleMutex>
56 | class stderr_sink : public stdout_sink_base<ConsoleMutex>
57 | {
58 | public:
59 |     stderr_sink(); // defined in stdout_sinks-inl.h (not shown here); presumably binds the base to stderr - confirm there
60 | };
61 | 
62 | using stdout_sink_mt = stdout_sink<details::console_mutex>;
63 | using stdout_sink_st = stdout_sink<details::console_nullmutex>;
64 | 
65 | using stderr_sink_mt = stderr_sink<details::console_mutex>;
66 | using stderr_sink_st = stderr_sink<details::console_nullmutex>;
67 | 
68 | } // namespace sinks
69 | 
70 | // factory methods
71 | template<typename Factory = spdlog::synchronous_factory>
72 | std::shared_ptr<logger> stdout_logger_mt(const std::string &logger_name);
73 | 
74 | template<typename Factory = spdlog::synchronous_factory>
75 | std::shared_ptr<logger> stdout_logger_st(const std::string &logger_name);
76 | 
77 | template<typename Factory = spdlog::synchronous_factory>
78 | std::shared_ptr<logger> stderr_logger_mt(const std::string &logger_name);
79 | 
80 | template<typename Factory = spdlog::synchronous_factory>
81 | std::shared_ptr<logger> stderr_logger_st(const std::string &logger_name);
82 | 
83 | } // namespace spdlog
84 | 
85 | #ifdef SPDLOG_HEADER_ONLY
86 | #include "stdout_sinks-inl.h"
87 | #endif
88 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/sinks/tcp_sink.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/common.h>
 7 | #include <spdlog/sinks/base_sink.h>
 8 | #include <spdlog/details/null_mutex.h>
 9 | #ifdef _WIN32
10 | #include <spdlog/details/tcp_client-windows.h>
11 | #else
12 | #include <spdlog/details/tcp_client.h>
13 | #endif
14 | 
15 | #include <mutex>
16 | #include <string>
17 | #include <chrono>
18 | #include <functional>
19 | 
20 | #pragma once
21 | 
22 | // Simple tcp client sink
23 | // Connects to remote address and send the formatted log.
24 | // Will attempt to reconnect if connection drops.
25 | // If more complicated behaviour is needed (i.e get responses), you can inherit it and override the sink_it_ method.
26 | 
27 | namespace spdlog {
28 | namespace sinks {
29 | 
30 | struct tcp_sink_config // connection settings consumed by tcp_sink
31 | {
32 |     std::string server_host;
33 |     int server_port;
34 |     bool lazy_connect = false; // if true connect on first log call instead of on construction
35 | 
36 |     tcp_sink_config(std::string host, int port)
37 |         : server_host(std::move(host))
38 |         , server_port(port)
39 |     {}
40 | };
41 | 
42 | template<typename Mutex>
43 | class tcp_sink : public spdlog::sinks::base_sink<Mutex>
44 | {
45 | public:
46 |     // Connects to the configured host/port (or throws on failure) unless lazy_connect is set.
47 |     // The host may be a hostname or an ip address.
48 |     explicit tcp_sink(tcp_sink_config sink_config)
49 |         : config_(std::move(sink_config))
50 |     {
51 |         if (config_.lazy_connect)
52 |         {
53 |             return; // the connection is made by the first sink_it_() call instead
54 |         }
55 |         client_.connect(config_.server_host, config_.server_port);
56 |     }
57 | 
58 |     ~tcp_sink() override = default;
59 | 
60 | protected:
61 |     void sink_it_(const spdlog::details::log_msg &msg) override
62 |     { // format first, (re)connect when the client reports no connection, then send the bytes
63 |         spdlog::memory_buf_t payload;
64 |         this->formatter_->format(msg, payload);
65 |         if (!client_.is_connected())
66 |         {
67 |             client_.connect(config_.server_host, config_.server_port);
68 |         }
69 |         client_.send(payload.data(), payload.size());
70 |     }
71 | 
72 |     void flush_() override {}
73 |     tcp_sink_config config_;
74 |     details::tcp_client client_;
75 | };
76 | 
77 | using tcp_sink_mt = tcp_sink<std::mutex>;
78 | using tcp_sink_st = tcp_sink<spdlog::details::null_mutex>;
79 | 
80 | } // namespace sinks
81 | } // namespace spdlog
82 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/stopwatch.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/fmt/fmt.h>
 7 | 
 8 | // Stopwatch support for spdlog  (using std::chrono::steady_clock).
 9 | // Displays elapsed seconds since construction as double.
10 | //
11 | // Usage:
12 | //
13 | // spdlog::stopwatch sw;
14 | // ...
15 | // spdlog::debug("Elapsed: {} seconds", sw);    =>  "Elapsed: 0.005116733 seconds"
16 | // spdlog::info("Elapsed: {:.6} seconds", sw);  =>  "Elapsed: 0.005163 seconds"
17 | //
18 | //
19 | // If other units are needed (e.g. millis instead of double), include "fmt/chrono.h" and use "duration_cast<..>(sw.elapsed())":
20 | //
21 | // #include <spdlog/fmt/chrono.h>
22 | //..
23 | // using std::chrono::duration_cast;
24 | // using std::chrono::milliseconds;
25 | // spdlog::info("Elapsed {}", duration_cast<milliseconds>(sw.elapsed())); => "Elapsed 5ms"
26 | 
27 | namespace spdlog {
28 | class stopwatch // measures the time elapsed since construction or since the last reset()
29 | {
30 |     using clock = std::chrono::steady_clock;
31 |     std::chrono::time_point<clock> start_tp_; // reference point that elapsed() measures from
32 | 
33 | public:
34 |     stopwatch() // starts timing immediately
35 |         : start_tp_{clock::now()}
36 |     {}
37 | 
38 |     std::chrono::duration<double> elapsed() const // time since start_tp_ in seconds, as a double-based duration
39 |     {
40 |         return std::chrono::duration<double>(clock::now() - start_tp_);
41 |     }
42 | 
43 |     void reset() // restart the measurement from the current time
44 |     {
45 |         start_tp_ = clock ::now();
46 |     }
47 | };
48 | } // namespace spdlog
49 | 
50 | // Support for fmt formatting  (e.g. "{:012.9}" or just "{}")
51 | namespace fmt {
52 | template<>
53 | struct formatter<spdlog::stopwatch> : formatter<double> // inherits from the double formatter, so double format specs apply
54 | {
55 |     template<typename FormatContext>
56 |     auto format(const spdlog::stopwatch &sw, FormatContext &ctx) -> decltype(ctx.out()) // formats sw.elapsed() as a double number of seconds
57 |     {
58 |         return formatter<double>::format(sw.elapsed().count(), ctx);
59 |     }
60 | };
61 | } // namespace fmt
62 | 


--------------------------------------------------------------------------------
/vulkan/spdlog/version.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #define SPDLOG_VER_MAJOR 1
 7 | #define SPDLOG_VER_MINOR 8
 8 | #define SPDLOG_VER_PATCH 1
 9 | 
10 | #define SPDLOG_VERSION (SPDLOG_VER_MAJOR * 10000 + SPDLOG_VER_MINOR * 100 + SPDLOG_VER_PATCH)
11 | 


--------------------------------------------------------------------------------
/vulkan/test_MMult.cpp:
--------------------------------------------------------------------------------
 1 | #include "parameters.h"
 2 | #include <cstdlib>
 3 | #include <cstring>
 4 | #include <iostream>
 5 | #include <tuple>
 6 | #include <vector>
 7 | #define SPDLOG_ACTIVE_LEVEL 6
 8 | 
 9 | void REF_MMult(int, int, int, float *, float *, float *);
10 | float MY_MMult(int, int, int, float *, float *, float *);
11 | void copy_matrix(int, int, float *, float *);
12 | void random_matrix(int, int, float *);
13 | float compare_matrices(int, int, float *, float *);
14 | 
15 | double dclock();
16 | 
17 | /* Benchmark driver: checks MY_MMult against REF_MMult for each size p and prints GFLOPS. */
18 | int main() {
19 |   int p, m, n, k;
20 |   double diff;
21 |   float *a, *b, *cref, *cold;
22 |   std::vector<std::tuple<int, double, double>> results;
23 | 
24 |   /* Allocate a 64-byte aligned buffer of at least `bytes` bytes.
25 |      std::aligned_alloc expects the size to be a multiple of the alignment,
26 |      so the request is rounded up (and is never zero). Exits on failure. */
27 |   auto alloc_floats = [](size_t bytes) -> float * {
28 |     constexpr size_t alignment = 64;
29 |     const size_t padded = (bytes / alignment + 1) * alignment;
30 |     void *ptr = std::aligned_alloc(alignment, padded);
31 |     if (ptr == nullptr) {
32 |       fprintf(stderr, "failed to allocate %zu bytes\n", padded);
33 |       exit(-1);
34 |     }
35 |     return static_cast<float *>(ptr);
36 |   };
37 | 
38 |   for (p = PFIRST; p <= PLAST; p += PINC) {
39 |     m = (M == -1 ? p : M);
40 |     n = (N == -1 ? p : N);
41 |     k = (K == -1 ? p : K);
42 | 
43 |     /* Allocate space for the matrices */
44 |     /* Note: I create an extra column in A to make sure that
45 |        prefetching beyond the matrix does not cause a segfault */
46 |     /* Sizes are byte counts, computed in size_t to avoid int overflow */
47 |     const size_t mem_size_A = static_cast<size_t>(m) * (k + 1) * sizeof(float);
48 |     const size_t mem_size_B = static_cast<size_t>(k) * n * sizeof(float);
49 |     const size_t mem_size_C = static_cast<size_t>(m) * n * sizeof(float);
50 |     /* Already in bytes: do not multiply by sizeof(float) again here */
51 |     a = alloc_floats(mem_size_A);
52 |     b = alloc_floats(mem_size_B);
53 |     cold = alloc_floats(mem_size_C);
54 |     cref = alloc_floats(mem_size_C);
55 | 
56 |     /* Generate random matrices A, B; zero both result matrices */
57 |     random_matrix(m, k, a);
58 |     random_matrix(k, n, b);
59 |     std::memset(cold, 0, mem_size_C);
60 |     std::memset(cref, 0, mem_size_C);
61 | 
62 |     /* Run the reference implementation so the answers can be compared */
63 |     REF_MMult(m, n, k, a, b, cref);
64 | 
65 |     float msecTotal = 0.0f;
66 |     for (int rep = 0; rep < NREPEATS; rep++) {
67 |       /* Time your implementation (return value is summed as milliseconds) */
68 |       msecTotal += MY_MMult(m, n, k, a, b, cold);
69 |     }
70 | 
71 |     diff = compare_matrices(m, n, cold, cref);
72 |     if (diff > 0.5f || diff < -0.5f) {
73 |       fprintf(stdout, "%d diff too big: %le\n", p, diff);
74 |       exit(-1);
75 |     }
76 | 
77 |     // Compute the performance (printed after the loop)
78 |     float msecPerMatrixMul = msecTotal / NREPEATS;
79 |     double flopsPerMatrixMul = 2.0 * m * k * n;
80 |     double gflops =
81 |         (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
82 | 
83 |     results.emplace_back(p, gflops, diff);
84 | 
85 |     std::free(a);
86 |     std::free(b);
87 |     std::free(cold);
88 |     std::free(cref);
89 |   }
90 | 
91 |   fprintf(stdout, "MY_MMult = [\n");
92 |   for (auto &item : results) {
93 |     fprintf(stdout, "%d %.2f %le \n", std::get<0>(item), std::get<1>(item),
94 |             std::get<2>(item));
95 |   }
96 |   fprintf(stdout, "];\n");
97 |   return 0;
98 | }
99 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vk_sdk_platform.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // File: vk_sdk_platform.h
 3 | //
 4 | /*
 5 |  * Copyright (c) 2015-2016 The Khronos Group Inc.
 6 |  * Copyright (c) 2015-2016 Valve Corporation
 7 |  * Copyright (c) 2015-2016 LunarG, Inc.
 8 |  *
 9 |  * Licensed under the Apache License, Version 2.0 (the "License");
10 |  * you may not use this file except in compliance with the License.
11 |  * You may obtain a copy of the License at
12 |  *
13 |  *     http://www.apache.org/licenses/LICENSE-2.0
14 |  *
15 |  * Unless required by applicable law or agreed to in writing, software
16 |  * distributed under the License is distributed on an "AS IS" BASIS,
17 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 |  * See the License for the specific language governing permissions and
19 |  * limitations under the License.
20 |  */
21 | 
22 | #ifndef VK_SDK_PLATFORM_H
23 | #define VK_SDK_PLATFORM_H
24 | 
25 | #if defined(_WIN32)
26 | #define NOMINMAX
27 | #ifndef __cplusplus
28 | #undef inline
29 | #define inline __inline
30 | #endif // __cplusplus
31 | 
32 | #if (defined(_MSC_VER) && _MSC_VER < 1900 /*vs2015*/)
33 | // C99:
34 | // Microsoft didn't implement C99 in Visual Studio; but started adding it with
35 | // VS2013.  However, VS2013 still didn't have snprintf().  The following is a
36 | // work-around (Note: The _CRT_SECURE_NO_WARNINGS macro must be set in the
37 | // "CMakeLists.txt" file).
38 | // NOTE: This is fixed in Visual Studio 2015.
39 | #define snprintf _snprintf
40 | #endif
41 | 
42 | #define strdup _strdup
43 | 
44 | #endif // _WIN32
45 | 
46 | // Check for noexcept support using clang, with fallback to Windows or GCC version numbers
47 | #ifndef NOEXCEPT
48 | #if defined(__clang__)
49 | #if __has_feature(cxx_noexcept)
50 | #define HAS_NOEXCEPT
51 | #endif
52 | #else
53 | #if defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC__ * 10 + __GNUC_MINOR__ >= 46
54 | #define HAS_NOEXCEPT
55 | #else
56 | #if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 190023026 && defined(_HAS_EXCEPTIONS) && _HAS_EXCEPTIONS
57 | #define HAS_NOEXCEPT
58 | #endif
59 | #endif
60 | #endif
61 | 
62 | #ifdef HAS_NOEXCEPT
63 | #define NOEXCEPT noexcept
64 | #else
65 | #define NOEXCEPT
66 | #endif
67 | #endif
68 | 
69 | #endif  // VK_SDK_PLATFORM_H
70 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_H_
 2 | #define VULKAN_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | #include "vk_platform.h"
11 | #include "vulkan_core.h"
12 | 
13 | #ifdef VK_USE_PLATFORM_ANDROID_KHR
14 | #include "vulkan_android.h"
15 | #endif
16 | 
17 | #ifdef VK_USE_PLATFORM_FUCHSIA
18 | #include <zircon/types.h>
19 | #include "vulkan_fuchsia.h"
20 | #endif
21 | 
22 | #ifdef VK_USE_PLATFORM_IOS_MVK
23 | #include "vulkan_ios.h"
24 | #endif
25 | 
26 | 
27 | #ifdef VK_USE_PLATFORM_MACOS_MVK
28 | #include "vulkan_macos.h"
29 | #endif
30 | 
31 | #ifdef VK_USE_PLATFORM_METAL_EXT
32 | #include "vulkan_metal.h"
33 | #endif
34 | 
35 | #ifdef VK_USE_PLATFORM_VI_NN
36 | #include "vulkan_vi.h"
37 | #endif
38 | 
39 | 
40 | #ifdef VK_USE_PLATFORM_WAYLAND_KHR
41 | #include <wayland-client.h>
42 | #include "vulkan_wayland.h"
43 | #endif
44 | 
45 | 
46 | #ifdef VK_USE_PLATFORM_WIN32_KHR
47 | #include <windows.h>
48 | #include "vulkan_win32.h"
49 | #endif
50 | 
51 | 
52 | #ifdef VK_USE_PLATFORM_XCB_KHR
53 | #include <xcb/xcb.h>
54 | #include "vulkan_xcb.h"
55 | #endif
56 | 
57 | 
58 | #ifdef VK_USE_PLATFORM_XLIB_KHR
59 | #include <X11/Xlib.h>
60 | #include "vulkan_xlib.h"
61 | #endif
62 | 
63 | 
64 | #ifdef VK_USE_PLATFORM_DIRECTFB_EXT
65 | #include <directfb.h>
66 | #include "vulkan_directfb.h"
67 | #endif
68 | 
69 | 
70 | #ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
71 | #include <X11/Xlib.h>
72 | #include <X11/extensions/Xrandr.h>
73 | #include "vulkan_xlib_xrandr.h"
74 | #endif
75 | 
76 | 
77 | #ifdef VK_USE_PLATFORM_GGP
78 | #include <ggp_c/vulkan_types.h>
79 | #include "vulkan_ggp.h"
80 | #endif
81 | 
82 | 
83 | #ifdef VK_ENABLE_BETA_EXTENSIONS
84 | #include "vulkan_beta.h"
85 | #endif
86 | 
87 | #endif // VULKAN_H_
88 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_directfb.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_DIRECTFB_H_
 2 | #define VULKAN_DIRECTFB_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_EXT_directfb_surface 1
23 | #define VK_EXT_DIRECTFB_SURFACE_SPEC_VERSION 1
24 | #define VK_EXT_DIRECTFB_SURFACE_EXTENSION_NAME "VK_EXT_directfb_surface"
25 | typedef VkFlags VkDirectFBSurfaceCreateFlagsEXT;
26 | typedef struct VkDirectFBSurfaceCreateInfoEXT { // passed as pCreateInfo to vkCreateDirectFBSurfaceEXT
27 |     VkStructureType                    sType;
28 |     const void*                        pNext;
29 |     VkDirectFBSurfaceCreateFlagsEXT    flags; // VkFlags typedef declared just above
30 |     IDirectFB*                         dfb; // DirectFB interface pointer; the type is not declared in this header
31 |     IDirectFBSurface*                  surface; // DirectFB surface interface pointer; the type is not declared in this header
32 | } VkDirectFBSurfaceCreateInfoEXT;
33 | 
34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateDirectFBSurfaceEXT)(VkInstance instance, const VkDirectFBSurfaceCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceDirectFBPresentationSupportEXT)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, IDirectFB* dfb);
36 | 
37 | #ifndef VK_NO_PROTOTYPES
38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateDirectFBSurfaceEXT(
39 |     VkInstance                                  instance,
40 |     const VkDirectFBSurfaceCreateInfoEXT*       pCreateInfo,
41 |     const VkAllocationCallbacks*                pAllocator,
42 |     VkSurfaceKHR*                               pSurface);
43 | 
44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceDirectFBPresentationSupportEXT(
45 |     VkPhysicalDevice                            physicalDevice,
46 |     uint32_t                                    queueFamilyIndex,
47 |     IDirectFB*                                  dfb);
48 | #endif
49 | 
50 | #ifdef __cplusplus
51 | }
52 | #endif
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_fuchsia.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_FUCHSIA_H_
 2 | #define VULKAN_FUCHSIA_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_FUCHSIA_imagepipe_surface 1
23 | #define VK_FUCHSIA_IMAGEPIPE_SURFACE_SPEC_VERSION 1
24 | #define VK_FUCHSIA_IMAGEPIPE_SURFACE_EXTENSION_NAME "VK_FUCHSIA_imagepipe_surface"
25 | typedef VkFlags VkImagePipeSurfaceCreateFlagsFUCHSIA;
26 | typedef struct VkImagePipeSurfaceCreateInfoFUCHSIA { // passed as pCreateInfo to vkCreateImagePipeSurfaceFUCHSIA
27 |     VkStructureType                         sType;
28 |     const void*                             pNext;
29 |     VkImagePipeSurfaceCreateFlagsFUCHSIA    flags; // VkFlags typedef declared just above
30 |     zx_handle_t                             imagePipeHandle; // NOTE(review): presumably the handle of the image pipe to present to - confirm against the extension spec
31 | } VkImagePipeSurfaceCreateInfoFUCHSIA;
32 | 
33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateImagePipeSurfaceFUCHSIA)(VkInstance instance, const VkImagePipeSurfaceCreateInfoFUCHSIA* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
34 | 
35 | #ifndef VK_NO_PROTOTYPES
36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateImagePipeSurfaceFUCHSIA(
37 |     VkInstance                                  instance,
38 |     const VkImagePipeSurfaceCreateInfoFUCHSIA*  pCreateInfo,
39 |     const VkAllocationCallbacks*                pAllocator,
40 |     VkSurfaceKHR*                               pSurface);
41 | #endif
42 | 
43 | #ifdef __cplusplus
44 | }
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_ggp.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_GGP_H_
 2 | #define VULKAN_GGP_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_GGP_stream_descriptor_surface 1
23 | #define VK_GGP_STREAM_DESCRIPTOR_SURFACE_SPEC_VERSION 1
24 | #define VK_GGP_STREAM_DESCRIPTOR_SURFACE_EXTENSION_NAME "VK_GGP_stream_descriptor_surface"
25 | typedef VkFlags VkStreamDescriptorSurfaceCreateFlagsGGP;
26 | typedef struct VkStreamDescriptorSurfaceCreateInfoGGP { // passed as pCreateInfo to vkCreateStreamDescriptorSurfaceGGP
27 |     VkStructureType                            sType;
28 |     const void*                                pNext;
29 |     VkStreamDescriptorSurfaceCreateFlagsGGP    flags; // VkFlags typedef declared just above
30 |     GgpStreamDescriptor                        streamDescriptor; // GGP stream descriptor; the type is not declared in this header
31 | } VkStreamDescriptorSurfaceCreateInfoGGP;
32 | 
33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateStreamDescriptorSurfaceGGP)(VkInstance instance, const VkStreamDescriptorSurfaceCreateInfoGGP* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
34 | 
35 | #ifndef VK_NO_PROTOTYPES
36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateStreamDescriptorSurfaceGGP(
37 |     VkInstance                                  instance,
38 |     const VkStreamDescriptorSurfaceCreateInfoGGP* pCreateInfo,
39 |     const VkAllocationCallbacks*                pAllocator,
40 |     VkSurfaceKHR*                               pSurface);
41 | #endif
42 | 
43 | 
44 | #define VK_GGP_frame_token 1
45 | #define VK_GGP_FRAME_TOKEN_SPEC_VERSION   1
46 | #define VK_GGP_FRAME_TOKEN_EXTENSION_NAME "VK_GGP_frame_token"
47 | typedef struct VkPresentFrameTokenGGP { // NOTE(review): no function in this header takes this struct; presumably chained through another struct's pNext - confirm against the VK_GGP_frame_token spec
48 |     VkStructureType    sType;
49 |     const void*        pNext;
50 |     GgpFrameToken      frameToken; // GGP frame token value; the type is not declared in this header
51 | } VkPresentFrameTokenGGP;
52 | 
53 | 
54 | #ifdef __cplusplus
55 | }
56 | #endif
57 | 
58 | #endif
59 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_ios.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_IOS_H_
 2 | #define VULKAN_IOS_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_MVK_ios_surface 1
23 | #define VK_MVK_IOS_SURFACE_SPEC_VERSION   3
24 | #define VK_MVK_IOS_SURFACE_EXTENSION_NAME "VK_MVK_ios_surface"
25 | typedef VkFlags VkIOSSurfaceCreateFlagsMVK;
26 | typedef struct VkIOSSurfaceCreateInfoMVK { // passed as pCreateInfo to vkCreateIOSSurfaceMVK
27 |     VkStructureType               sType;
28 |     const void*                   pNext;
29 |     VkIOSSurfaceCreateFlagsMVK    flags; // VkFlags typedef declared just above
30 |     const void*                   pView; // untyped view pointer; NOTE(review): the expected object type is not visible here - see the VK_MVK_ios_surface spec
31 | } VkIOSSurfaceCreateInfoMVK;
32 | 
33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateIOSSurfaceMVK)(VkInstance instance, const VkIOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
34 | 
35 | #ifndef VK_NO_PROTOTYPES
36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateIOSSurfaceMVK(
37 |     VkInstance                                  instance,
38 |     const VkIOSSurfaceCreateInfoMVK*            pCreateInfo,
39 |     const VkAllocationCallbacks*                pAllocator,
40 |     VkSurfaceKHR*                               pSurface);
41 | #endif
42 | 
43 | #ifdef __cplusplus
44 | }
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_macos.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_MACOS_H_
 2 | #define VULKAN_MACOS_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_MVK_macos_surface 1
23 | #define VK_MVK_MACOS_SURFACE_SPEC_VERSION 3
24 | #define VK_MVK_MACOS_SURFACE_EXTENSION_NAME "VK_MVK_macos_surface"
25 | typedef VkFlags VkMacOSSurfaceCreateFlagsMVK;
26 | typedef struct VkMacOSSurfaceCreateInfoMVK { // passed as pCreateInfo to vkCreateMacOSSurfaceMVK
27 |     VkStructureType                 sType;
28 |     const void*                     pNext;
29 |     VkMacOSSurfaceCreateFlagsMVK    flags; // VkFlags typedef declared just above
30 |     const void*                     pView; // untyped view pointer; NOTE(review): the expected object type is not visible here - see the VK_MVK_macos_surface spec
31 | } VkMacOSSurfaceCreateInfoMVK;
32 | 
33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateMacOSSurfaceMVK)(VkInstance instance, const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
34 | 
35 | #ifndef VK_NO_PROTOTYPES
36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateMacOSSurfaceMVK(
37 |     VkInstance                                  instance,
38 |     const VkMacOSSurfaceCreateInfoMVK*          pCreateInfo,
39 |     const VkAllocationCallbacks*                pAllocator,
40 |     VkSurfaceKHR*                               pSurface);
41 | #endif
42 | 
43 | #ifdef __cplusplus
44 | }
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_metal.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_METAL_H_
 2 | #define VULKAN_METAL_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_EXT_metal_surface 1
23 | 
24 | #ifdef __OBJC__
25 | @class CAMetalLayer;
26 | #else
27 | typedef void CAMetalLayer;
28 | #endif
29 | 
30 | #define VK_EXT_METAL_SURFACE_SPEC_VERSION 1
31 | #define VK_EXT_METAL_SURFACE_EXTENSION_NAME "VK_EXT_metal_surface"
32 | typedef VkFlags VkMetalSurfaceCreateFlagsEXT;
33 | typedef struct VkMetalSurfaceCreateInfoEXT { // passed as pCreateInfo to vkCreateMetalSurfaceEXT
34 |     VkStructureType                 sType;
35 |     const void*                     pNext;
36 |     VkMetalSurfaceCreateFlagsEXT    flags; // VkFlags typedef declared just above
37 |     const CAMetalLayer*             pLayer; // CAMetalLayer is an Objective-C class under __OBJC__, otherwise a typedef of void (declared above)
38 | } VkMetalSurfaceCreateInfoEXT;
39 | 
40 | typedef VkResult (VKAPI_PTR *PFN_vkCreateMetalSurfaceEXT)(VkInstance instance, const VkMetalSurfaceCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
41 | 
42 | #ifndef VK_NO_PROTOTYPES
43 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateMetalSurfaceEXT(
44 |     VkInstance                                  instance,
45 |     const VkMetalSurfaceCreateInfoEXT*          pCreateInfo,
46 |     const VkAllocationCallbacks*                pAllocator,
47 |     VkSurfaceKHR*                               pSurface);
48 | #endif
49 | 
50 | #ifdef __cplusplus
51 | }
52 | #endif
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_vi.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_VI_H_
 2 | #define VULKAN_VI_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_NN_vi_surface 1
23 | #define VK_NN_VI_SURFACE_SPEC_VERSION     1
24 | #define VK_NN_VI_SURFACE_EXTENSION_NAME   "VK_NN_vi_surface"
25 | typedef VkFlags VkViSurfaceCreateFlagsNN;
26 | typedef struct VkViSurfaceCreateInfoNN { // passed as pCreateInfo to vkCreateViSurfaceNN
27 |     VkStructureType             sType;
28 |     const void*                 pNext;
29 |     VkViSurfaceCreateFlagsNN    flags; // VkFlags typedef declared just above
30 |     void*                       window; // untyped window pointer; NOTE(review): the concrete type is not visible here - see the VK_NN_vi_surface spec
31 | } VkViSurfaceCreateInfoNN;
32 | 
33 | typedef VkResult (VKAPI_PTR *PFN_vkCreateViSurfaceNN)(VkInstance instance, const VkViSurfaceCreateInfoNN* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
34 | 
35 | #ifndef VK_NO_PROTOTYPES
36 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateViSurfaceNN(
37 |     VkInstance                                  instance,
38 |     const VkViSurfaceCreateInfoNN*              pCreateInfo,
39 |     const VkAllocationCallbacks*                pAllocator,
40 |     VkSurfaceKHR*                               pSurface);
41 | #endif
42 | 
43 | #ifdef __cplusplus
44 | }
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_wayland.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_WAYLAND_H_
 2 | #define VULKAN_WAYLAND_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_KHR_wayland_surface 1
23 | #define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 6
24 | #define VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME "VK_KHR_wayland_surface"
25 | typedef VkFlags VkWaylandSurfaceCreateFlagsKHR;
26 | typedef struct VkWaylandSurfaceCreateInfoKHR { // passed as pCreateInfo to vkCreateWaylandSurfaceKHR
27 |     VkStructureType                   sType;
28 |     const void*                       pNext;
29 |     VkWaylandSurfaceCreateFlagsKHR    flags; // VkFlags typedef declared just above
30 |     struct wl_display*                display; // Wayland display; same type as the display argument of vkGetPhysicalDeviceWaylandPresentationSupportKHR
31 |     struct wl_surface*                surface; // Wayland surface; NOTE(review): presumably the surface the VkSurfaceKHR wraps - confirm against the spec
32 | } VkWaylandSurfaceCreateInfoKHR;
33 | 
34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateWaylandSurfaceKHR)(VkInstance instance, const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceWaylandPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, struct wl_display* display);
36 | 
37 | #ifndef VK_NO_PROTOTYPES
38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateWaylandSurfaceKHR(
39 |     VkInstance                                  instance,
40 |     const VkWaylandSurfaceCreateInfoKHR*        pCreateInfo,
41 |     const VkAllocationCallbacks*                pAllocator,
42 |     VkSurfaceKHR*                               pSurface);
43 | 
44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceWaylandPresentationSupportKHR(
45 |     VkPhysicalDevice                            physicalDevice,
46 |     uint32_t                                    queueFamilyIndex,
47 |     struct wl_display*                          display);
48 | #endif
49 | 
50 | #ifdef __cplusplus
51 | }
52 | #endif
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_xcb.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_XCB_H_
 2 | #define VULKAN_XCB_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_KHR_xcb_surface 1
23 | #define VK_KHR_XCB_SURFACE_SPEC_VERSION   6
24 | #define VK_KHR_XCB_SURFACE_EXTENSION_NAME "VK_KHR_xcb_surface"
25 | typedef VkFlags VkXcbSurfaceCreateFlagsKHR;
26 | typedef struct VkXcbSurfaceCreateInfoKHR { // passed as pCreateInfo to vkCreateXcbSurfaceKHR
27 |     VkStructureType               sType;
28 |     const void*                   pNext;
29 |     VkXcbSurfaceCreateFlagsKHR    flags; // VkFlags typedef declared just above
30 |     xcb_connection_t*             connection; // XCB connection; same type as the connection argument of vkGetPhysicalDeviceXcbPresentationSupportKHR
31 |     xcb_window_t                  window; // XCB window id; NOTE(review): presumably the window the VkSurfaceKHR wraps - confirm against the spec
32 | } VkXcbSurfaceCreateInfoKHR;
33 | 
34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateXcbSurfaceKHR)(VkInstance instance, const VkXcbSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXcbPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, xcb_connection_t* connection, xcb_visualid_t visual_id);
36 | 
37 | #ifndef VK_NO_PROTOTYPES
38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateXcbSurfaceKHR(
39 |     VkInstance                                  instance,
40 |     const VkXcbSurfaceCreateInfoKHR*            pCreateInfo,
41 |     const VkAllocationCallbacks*                pAllocator,
42 |     VkSurfaceKHR*                               pSurface);
43 | 
44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXcbPresentationSupportKHR(
45 |     VkPhysicalDevice                            physicalDevice,
46 |     uint32_t                                    queueFamilyIndex,
47 |     xcb_connection_t*                           connection,
48 |     xcb_visualid_t                              visual_id);
49 | #endif
50 | 
51 | #ifdef __cplusplus
52 | }
53 | #endif
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_xlib.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_XLIB_H_
 2 | #define VULKAN_XLIB_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_KHR_xlib_surface 1
23 | #define VK_KHR_XLIB_SURFACE_SPEC_VERSION  6
24 | #define VK_KHR_XLIB_SURFACE_EXTENSION_NAME "VK_KHR_xlib_surface"
25 | typedef VkFlags VkXlibSurfaceCreateFlagsKHR;
26 | typedef struct VkXlibSurfaceCreateInfoKHR { // passed as pCreateInfo to vkCreateXlibSurfaceKHR
27 |     VkStructureType                sType;
28 |     const void*                    pNext;
29 |     VkXlibSurfaceCreateFlagsKHR    flags; // VkFlags typedef declared just above
30 |     Display*                       dpy; // Xlib display; same type as the dpy argument of vkGetPhysicalDeviceXlibPresentationSupportKHR
31 |     Window                         window; // Xlib window; NOTE(review): presumably the window the VkSurfaceKHR wraps - confirm against the spec
32 | } VkXlibSurfaceCreateInfoKHR;
33 | 
34 | typedef VkResult (VKAPI_PTR *PFN_vkCreateXlibSurfaceKHR)(VkInstance instance, const VkXlibSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
35 | typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXlibPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, Display* dpy, VisualID visualID);
36 | 
37 | #ifndef VK_NO_PROTOTYPES
38 | VKAPI_ATTR VkResult VKAPI_CALL vkCreateXlibSurfaceKHR(
39 |     VkInstance                                  instance,
40 |     const VkXlibSurfaceCreateInfoKHR*           pCreateInfo,
41 |     const VkAllocationCallbacks*                pAllocator,
42 |     VkSurfaceKHR*                               pSurface);
43 | 
44 | VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXlibPresentationSupportKHR(
45 |     VkPhysicalDevice                            physicalDevice,
46 |     uint32_t                                    queueFamilyIndex,
47 |     Display*                                    dpy,
48 |     VisualID                                    visualID);
49 | #endif
50 | 
51 | #ifdef __cplusplus
52 | }
53 | #endif
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/vulkan/vulkan/vulkan_xlib_xrandr.h:
--------------------------------------------------------------------------------
 1 | #ifndef VULKAN_XLIB_XRANDR_H_
 2 | #define VULKAN_XLIB_XRANDR_H_ 1
 3 | 
 4 | /*
 5 | ** Copyright (c) 2015-2020 The Khronos Group Inc.
 6 | **
 7 | ** SPDX-License-Identifier: Apache-2.0
 8 | */
 9 | 
10 | /*
11 | ** This header is generated from the Khronos Vulkan XML API Registry.
12 | **
13 | */
14 | 
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | 
21 | 
22 | #define VK_EXT_acquire_xlib_display 1
23 | #define VK_EXT_ACQUIRE_XLIB_DISPLAY_SPEC_VERSION 1
24 | #define VK_EXT_ACQUIRE_XLIB_DISPLAY_EXTENSION_NAME "VK_EXT_acquire_xlib_display"
25 | typedef VkResult (VKAPI_PTR *PFN_vkAcquireXlibDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, VkDisplayKHR display);
26 | typedef VkResult (VKAPI_PTR *PFN_vkGetRandROutputDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, RROutput rrOutput, VkDisplayKHR* pDisplay);
27 | 
28 | #ifndef VK_NO_PROTOTYPES
29 | VKAPI_ATTR VkResult VKAPI_CALL vkAcquireXlibDisplayEXT(
30 |     VkPhysicalDevice                            physicalDevice,
31 |     Display*                                    dpy,
32 |     VkDisplayKHR                                display);
33 | 
34 | VKAPI_ATTR VkResult VKAPI_CALL vkGetRandROutputDisplayEXT(
35 |     VkPhysicalDevice                            physicalDevice,
36 |     Display*                                    dpy,
37 |     RROutput                                    rrOutput,
38 |     VkDisplayKHR*                               pDisplay);
39 | #endif
40 | 
41 | #ifdef __cplusplus
42 | }
43 | #endif
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------