├── .gitmodules
├── LICENSE
├── .github
    └── workflows
    │   ├── CI.yml
    │   └── release.yml
├── README.md
├── CMakeLists.txt
└── vkpeak.cpp


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ncnn"]
2 | 	path = ncnn
3 | 	url = https://github.com/Tencent/ncnn.git
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2019 nihui
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on: [push, pull_request]
 3 | # Workflow-wide environment; VULKANSDK_VERSION is consumed by the macos job's vulkansdk and build steps.
 4 | env:
 5 |   VULKANSDK_VERSION: 1.4.309.0
 6 |   DEVELOPER_DIR: /Applications/Xcode_15.2.app/Contents/Developer
 7 |   UseMultiToolTask: true
 8 | # Runs are grouped per git ref; starting a new run cancels the one still in progress for the same ref.
 9 | concurrency:
10 |   group: CI-${{ github.ref }}
11 |   cancel-in-progress: true
12 | permissions:  # the only permission requested for the workflow token is read access to contents
13 |   contents: read
14 | # Build-only jobs, one per platform. Windows: configure for x64 and build the Release configuration.
15 | jobs:
16 |   windows:
17 |     runs-on: windows-latest
18 |     steps:
19 |     - uses: actions/checkout@v4
20 |       with:
21 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
22 |     - name: build
23 |       run: |
24 |         mkdir build; cd build
25 |         cmake -A x64 ..
26 |         cmake --build . --config Release -j 4
27 | # Ubuntu: plain out-of-source CMake configure and build with the generator's defaults.
28 |   ubuntu:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |     - uses: actions/checkout@v4
32 |       with:
33 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
34 |     - name: build
35 |       run: |
36 |         mkdir build && cd build
37 |         cmake ..
38 |         cmake --build . -j 4
39 | # macOS: install the Vulkan SDK under $GITHUB_WORKSPACE/<version>, then build x86_64 and arm64 separately against the SDK's static libMoltenVK.a.
40 |   macos:
41 |     runs-on: macos-13
42 |     steps:
43 |     - uses: actions/checkout@v4
44 |       with:
45 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
46 |     - name: vulkansdk  # download the SDK zip for VULKANSDK_VERSION, unpack it and run its installer
47 |       run: |
48 |         wget -q https://sdk.lunarg.com/sdk/download/${{ env.VULKANSDK_VERSION }}/mac/vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip?Human=true -O vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip
49 |         unzip -q vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip
50 |         sudo InstallVulkan-${{ env.VULKANSDK_VERSION }}.app/Contents/MacOS/InstallVulkan-${{ env.VULKANSDK_VERSION }} --root $GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }} --accept-licenses --default-answer --confirm-command install
51 |     - name: build-x86_64
52 |       run: |
53 |         mkdir build-x86_64 && cd build-x86_64
54 |         cmake -DCMAKE_OSX_ARCHITECTURES="x86_64" \
55 |             -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \
56 |             ..
57 |         cmake --build . -j 4
58 |     - name: build-arm64  # same configuration as build-x86_64 except for the target architecture
59 |       run: |
60 |         mkdir build-arm64 && cd build-arm64
61 |         cmake -DCMAKE_OSX_ARCHITECTURES="arm64" \
62 |             -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \
63 |             ..
64 |         cmake --build . -j 4
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # vkpeak
  2 | 
  3 | ![CI](https://github.com/nihui/vkpeak/workflows/CI/badge.svg)
  4 | ![download](https://img.shields.io/github/downloads/nihui/vkpeak/total.svg)
  5 | 
  6 | A synthetic benchmarking tool to measure peak capabilities of Vulkan devices. It only measures the peak metrics that can be achieved using vector operations and does not represent a real-world use case.
  7 | 
  8 | ## [Download](https://github.com/nihui/vkpeak/releases)
  9 | 
 10 | Download prebuilt Windows/Linux/macOS executables for Intel/AMD/NVIDIA/Apple GPUs
 11 | 
 12 | **https://github.com/nihui/vkpeak/releases**
 13 | 
 14 | ## Usage
 15 | 
 16 | ```shell
 17 | vkpeak.exe
 18 | ```
 19 | 
 20 | vkpeak will choose the default Vulkan device.
 21 | 
 22 | To benchmark a specific device, pass its device id as the only argument:
 23 | 
 24 | ```shell
 25 | vkpeak.exe 0
 26 | ```
 27 | 
 28 | The only parameter (`0` in this example) is the device id.
 29 | 
 30 | If you encounter a crash or error, try upgrading your GPU driver:
 31 | 
 32 | - Intel: https://downloadcenter.intel.com/product/80939/Graphics-Drivers
 33 | - AMD: https://www.amd.com/en/support
 34 | - NVIDIA: https://www.nvidia.com/Download/index.aspx
 35 | 
 36 | ## Build from Source
 37 | 
 38 | 1. Clone this project with all submodules
 39 | 
 40 | ```shell
 41 | git clone https://github.com/nihui/vkpeak.git
 42 | cd vkpeak
 43 | git submodule update --init --recursive
 44 | ```
 45 | 
 46 | 2. Build with CMake
 47 |   - On macOS you can pass the `-DVulkan_LIBRARY=<path to your macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a>` option to link the static MoltenVK library. MoltenVK is part of the Vulkan SDK from https://vulkan.lunarg.com/
 48 | 
 49 | ```shell
 50 | mkdir build
 51 | cd build
 52 | cmake ..
 53 | cmake --build . -j 4
 54 | ```
 55 | 
 56 | ## Sample
 57 | 
 58 | NVIDIA RTX5060Ti 16GB
 59 | ```
 60 | device       = NVIDIA GeForce RTX 5060 Ti
 61 | 
 62 | fp32-scalar  = 17137.46 GFLOPS
 63 | fp32-vec4    = 16910.07 GFLOPS
 64 | 
 65 | fp16-scalar  = 12730.03 GFLOPS
 66 | fp16-vec4    = 12715.02 GFLOPS
 67 | fp16-matrix  = 101485.35 GFLOPS
 68 | 
 69 | fp64-scalar  = 398.59 GFLOPS
 70 | fp64-vec4    = 394.08 GFLOPS
 71 | 
 72 | int32-scalar = 12703.68 GIOPS
 73 | int32-vec4   = 12181.98 GIOPS
 74 | 
 75 | int16-scalar = 12690.05 GIOPS
 76 | int16-vec4   = 12208.29 GIOPS
 77 | 
 78 | int64-scalar = 3104.59 GIOPS
 79 | int64-vec4   = 2666.86 GIOPS
 80 | 
 81 | int8-dotprod = 16101.59 GIOPS
 82 | int8-matrix  = 202947.80 GIOPS
 83 | 
 84 | bf16-dotprod = 0.00 GFLOPS
 85 | bf16-matrix  = 0.00 GFLOPS
 86 | 
 87 | fp8-matrix   = 0.00 GFLOPS
 88 | bf8-matrix   = 0.00 GFLOPS
 89 | 
 90 | copy-h2h     = 18.17 GBPS
 91 | copy-h2d     = 17.93 GBPS
 92 | copy-d2h     = 18.09 GBPS
 93 | copy-d2d     = 190.70 GBPS
 94 | ```
 95 | 
 96 | AMD RX9060XT 16GB
 97 | ```
 98 | device       = AMD Radeon Graphics (RADV GFX1200)
 99 | 
100 | fp32-scalar  = 17606.54 GFLOPS
101 | fp32-vec4    = 12155.22 GFLOPS
102 | 
103 | fp16-scalar  = 16921.16 GFLOPS
104 | fp16-vec4    = 27833.48 GFLOPS
105 | fp16-matrix  = 105337.66 GFLOPS
106 | 
107 | fp64-scalar  = 442.80 GFLOPS
108 | fp64-vec4    = 437.55 GFLOPS
109 | 
110 | int32-scalar = 2804.59 GIOPS
111 | int32-vec4   = 2796.74 GIOPS
112 | 
113 | int16-scalar = 15034.62 GIOPS
114 | int16-vec4   = 26356.38 GIOPS
115 | 
116 | int64-scalar = 932.14 GIOPS
117 | int64-vec4   = 768.53 GIOPS
118 | 
119 | int8-dotprod = 53893.32 GIOPS
120 | int8-matrix  = 194476.41 GIOPS
121 | 
122 | bf16-dotprod = 24427.68 GFLOPS
123 | bf16-matrix  = 105099.82 GFLOPS
124 | 
125 | fp8-matrix   = 205061.72 GFLOPS
126 | bf8-matrix   = 208234.02 GFLOPS
127 | 
128 | copy-h2h     = 21.05 GBPS
129 | copy-h2d     = 21.17 GBPS
130 | copy-d2h     = 23.70 GBPS
131 | copy-d2d     = 145.23 GBPS
132 | ```
133 | 
134 | ## Other Open-Source Code Used
135 | 
136 | - https://github.com/Tencent/ncnn for fast neural network inference on ALL PLATFORMS
137 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
  1 | name: release
  2 | on: workflow_dispatch
  3 | # Started manually only. APPLICATION_NAME is re-exported by the setup job and names both the binary and the packages.
  4 | env:
  5 |   VULKANSDK_VERSION: 1.4.309.0
  6 |   DEVELOPER_DIR: /Applications/Xcode_15.2.app/Contents/Developer
  7 |   UseMultiToolTask: true
  8 |   APPLICATION_NAME: vkpeak
  9 | # setup publishes two outputs used by every later job: APPNAME (from APPLICATION_NAME) and VERSION (the runner's current date as YYYYMMDD).
 10 | jobs:
 11 | 
 12 |   setup:
 13 |     runs-on: ubuntu-latest
 14 |     outputs:
 15 |       APPNAME: ${{ steps.get_appname.outputs.APPNAME }}
 16 |       VERSION: ${{ steps.get_version.outputs.VERSION }}
 17 |     steps:
 18 |     - name: get-appname
 19 |       id: get_appname
 20 |       run: echo "APPNAME=${APPLICATION_NAME}" >> $GITHUB_OUTPUT
 21 |     - name: get-version
 22 |       id: get_version
 23 |       run: |
 24 |         DATE=`date +'%Y%m%d'`
 25 |         echo "VERSION=${DATE}" >> $GITHUB_OUTPUT
 26 | # Linux package: build, stage README/LICENSE plus the binary in PACKAGENAME/, strip debug symbols, zip, and upload the zip as an artifact.
 27 |   ubuntu:
 28 |     needs: [setup]
 29 |     runs-on: ubuntu-22.04
 30 |     env:
 31 |       PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-ubuntu
 32 |     steps:
 33 |     - uses: actions/checkout@v4
 34 |       with:
 35 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
 36 |     - name: build
 37 |       run: |
 38 |         mkdir build && cd build
 39 |         cmake ..
 40 |         cmake --build . -j 4
 41 |     - name: package
 42 |       run: |
 43 |         mkdir -p ${{ env.PACKAGENAME }}
 44 |         cp README.md LICENSE ${{ env.PACKAGENAME }}
 45 |         cp build/${{ needs.setup.outputs.APPNAME }} ${{ env.PACKAGENAME }}
 46 |         strip -g ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }}
 47 |         zip -9 -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
 48 |     - name: upload
 49 |       uses: actions/upload-artifact@v4
 50 |       with:
 51 |         name: ${{ env.PACKAGENAME }}
 52 |         path: ${{ env.PACKAGENAME }}.zip
 53 | # macOS package: build x86_64 and arm64 against the SDK's static MoltenVK, merge both binaries with lipo, strip, zip, and upload.
 54 |   macos:
 55 |     needs: [setup]
 56 |     runs-on: macos-13
 57 |     env:
 58 |       PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-macos
 59 |     steps:
 60 |     - uses: actions/checkout@v4
 61 |       with:
 62 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
 63 |     - name: vulkansdk  # download the SDK zip for VULKANSDK_VERSION, unpack it and run its installer
 64 |       run: |
 65 |         wget -q https://sdk.lunarg.com/sdk/download/${{ env.VULKANSDK_VERSION }}/mac/vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip?Human=true -O vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip
 66 |         unzip -q vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip
 67 |         sudo InstallVulkan-${{ env.VULKANSDK_VERSION }}.app/Contents/MacOS/InstallVulkan-${{ env.VULKANSDK_VERSION }} --root $GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }} --accept-licenses --default-answer --confirm-command install
 68 |     - name: build-x86_64
 69 |       run: |
 70 |         mkdir build-x86_64 && cd build-x86_64
 71 |         cmake -DCMAKE_OSX_ARCHITECTURES="x86_64" \
 72 |             -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \
 73 |             ..
 74 |         cmake --build . -j 4
 75 |     - name: build-arm64  # same configuration as build-x86_64 except for the target architecture
 76 |       run: |
 77 |         mkdir build-arm64 && cd build-arm64
 78 |         cmake -DCMAKE_OSX_ARCHITECTURES="arm64" \
 79 |             -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \
 80 |             ..
 81 |         cmake --build . -j 4
 82 |     - name: package
 83 |       run: |
 84 |         mkdir -p ${{ env.PACKAGENAME }}
 85 |         cp README.md LICENSE ${{ env.PACKAGENAME }}
 86 |         lipo -create build-x86_64/${{ needs.setup.outputs.APPNAME }} build-arm64/${{ needs.setup.outputs.APPNAME }} -o ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }}
 87 |         strip ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }}
 88 |         zip -9 -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
 89 |     - name: upload
 90 |       uses: actions/upload-artifact@v4
 91 |       with:
 92 |         name: ${{ env.PACKAGENAME }}
 93 |         path: ${{ env.PACKAGENAME }}.zip
 94 | # Windows package: x64 Release build, stage README/LICENSE plus the .exe in PACKAGENAME\, archive with 7z, and upload the zip.
 95 |   windows:
 96 |     needs: [setup]
 97 |     runs-on: windows-latest
 98 |     env:
 99 |       UseMultiToolTask: true  # repeats the value already set in the workflow-level env
100 |       PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-windows
101 |     steps:
102 |     - uses: actions/checkout@v4
103 |       with:
104 |         submodules: 'recursive'  # ncnn is pulled in as a git submodule (see .gitmodules)
105 |     - name: build
106 |       run: |
107 |         mkdir build; cd build
108 |         cmake -A x64 ..
109 |         cmake --build . --config Release -j 4
110 |     - name: package
111 |       run: |
112 |         mkdir ${{ env.PACKAGENAME }}
113 |         Copy-Item -Verbose -Path "README.md" -Destination "${{ env.PACKAGENAME }}"
114 |         Copy-Item -Verbose -Path "LICENSE" -Destination "${{ env.PACKAGENAME }}"
115 |         Copy-Item -Verbose -Path "build\Release\${{ needs.setup.outputs.APPNAME }}.exe" -Destination "${{ env.PACKAGENAME }}"
116 |         7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
117 |     - name: upload
118 |       uses: actions/upload-artifact@v4
119 |       with:
120 |         name: ${{ env.PACKAGENAME }}
121 |         path: ${{ env.PACKAGENAME }}.zip
122 | # Collects every package artifact and publishes them as a GitHub release tagged with the date-based VERSION.
123 |   release:
124 |     needs: [setup, ubuntu, macos, windows]
125 |     runs-on: ubuntu-latest
126 |     permissions:  # creating a release needs write access to contents; do not rely on the repository's default token scope
127 |       contents: write
128 |     steps:
129 |     - name: download
130 |       uses: actions/download-artifact@v4
131 |       with:
132 |         path: artifacts
133 | 
134 |     - name: create-release
135 |       uses: softprops/action-gh-release@v2
136 |       with:
137 |         token: ${{ secrets.GITHUB_TOKEN }}
138 |         tag_name: ${{ needs.setup.outputs.VERSION }}
139 |         name: Release ${{ needs.setup.outputs.VERSION }}
140 |         files: artifacts/*/*.zip
141 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # cmake_minimum_required() has to come before project(). The ...3.15 policy ceiling turns CMP0091 on (required for CMAKE_MSVC_RUNTIME_LIBRARY to take effect) where it exists, without erroring on CMake 3.10-3.14 where that policy is unknown.
  2 | cmake_minimum_required(VERSION 3.10...3.15)
  3 | set(CMAKE_POLICY_DEFAULT_CMP0091 NEW)
  4 | set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
  5 | 
  6 | project(vkpeak)
  7 | # Fall back to a release build when no build type was given.
  8 | if(NOT CMAKE_BUILD_TYPE)
  9 |     set(CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE)
 10 | endif()
 11 | # ncnn is expected in ./ncnn (a git submodule); abort configuration with instructions if its CMakeLists.txt is missing.
 12 | # build ncnn library
 13 | if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ncnn/CMakeLists.txt")
 14 |     message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init --recursive\" and try again.")
 15 | endif()
 16 | # ncnn build switches declared before add_subdirectory(ncnn): Vulkan and threads on, RTTI and exceptions disabled, the rest off (presumably so these values take precedence over ncnn's own defaults - confirm against ncnn/CMakeLists.txt).
 17 | option(NCNN_INSTALL_SDK "" OFF)
 18 | option(NCNN_STRING "" OFF)
 19 | option(NCNN_STDIO "" OFF)
 20 | option(NCNN_C_API "" OFF)
 21 | option(NCNN_PIXEL "" OFF)
 22 | option(NCNN_PIXEL_ROTATE "" OFF)
 23 | option(NCNN_PIXEL_AFFINE "" OFF)
 24 | option(NCNN_PIXEL_DRAWING "" OFF)
 25 | option(NCNN_PLATFORM_API "" OFF)
 26 | option(NCNN_VULKAN "" ON)
 27 | option(NCNN_BUILD_BENCHMARK "" OFF)
 28 | option(NCNN_BUILD_TESTS "" OFF)
 29 | option(NCNN_BUILD_TOOLS "" OFF)
 30 | option(NCNN_BUILD_EXAMPLES "" OFF)
 31 | option(NCNN_INT8 "" OFF)
 32 | option(NCNN_BF16 "" OFF)
 33 | option(NCNN_OPENMP "" OFF)
 34 | option(NCNN_THREADS "" ON)
 35 | option(NCNN_DISABLE_RTTI "" ON)
 36 | option(NCNN_DISABLE_EXCEPTION "" ON)
 37 | # Per-layer switches for ncnn: every layer listed below is turned off except packing.
 38 | option(WITH_LAYER_absval "" OFF)
 39 | option(WITH_LAYER_argmax "" OFF)
 40 | option(WITH_LAYER_batchnorm "" OFF)
 41 | option(WITH_LAYER_bias "" OFF)
 42 | option(WITH_LAYER_bnll "" OFF)
 43 | option(WITH_LAYER_concat "" OFF)
 44 | option(WITH_LAYER_convolution "" OFF)
 45 | option(WITH_LAYER_crop "" OFF)
 46 | option(WITH_LAYER_deconvolution "" OFF)
 47 | option(WITH_LAYER_dropout "" OFF)
 48 | option(WITH_LAYER_eltwise "" OFF)
 49 | option(WITH_LAYER_elu "" OFF)
 50 | option(WITH_LAYER_embed "" OFF)
 51 | option(WITH_LAYER_exp "" OFF)
 52 | option(WITH_LAYER_flatten "" OFF)
 53 | option(WITH_LAYER_innerproduct "" OFF)
 54 | option(WITH_LAYER_input "" OFF)
 55 | option(WITH_LAYER_log "" OFF)
 56 | option(WITH_LAYER_lrn "" OFF)
 57 | option(WITH_LAYER_memorydata "" OFF)
 58 | option(WITH_LAYER_mvn "" OFF)
 59 | option(WITH_LAYER_pooling "" OFF)
 60 | option(WITH_LAYER_power "" OFF)
 61 | option(WITH_LAYER_prelu "" OFF)
 62 | option(WITH_LAYER_proposal "" OFF)
 63 | option(WITH_LAYER_reduction "" OFF)
 64 | option(WITH_LAYER_relu "" OFF)
 65 | option(WITH_LAYER_reshape "" OFF)
 66 | option(WITH_LAYER_roipooling "" OFF)
 67 | option(WITH_LAYER_scale "" OFF)
 68 | option(WITH_LAYER_sigmoid "" OFF)
 69 | option(WITH_LAYER_slice "" OFF)
 70 | option(WITH_LAYER_softmax "" OFF)
 71 | option(WITH_LAYER_split "" OFF)
 72 | option(WITH_LAYER_spp "" OFF)
 73 | option(WITH_LAYER_tanh "" OFF)
 74 | option(WITH_LAYER_threshold "" OFF)
 75 | option(WITH_LAYER_tile "" OFF)
 76 | option(WITH_LAYER_rnn "" OFF)
 77 | option(WITH_LAYER_lstm "" OFF)
 78 | option(WITH_LAYER_binaryop "" OFF)
 79 | option(WITH_LAYER_unaryop "" OFF)
 80 | option(WITH_LAYER_convolutiondepthwise "" OFF)
 81 | option(WITH_LAYER_padding "" OFF)
 82 | option(WITH_LAYER_squeeze "" OFF)
 83 | option(WITH_LAYER_expanddims "" OFF)
 84 | option(WITH_LAYER_normalize "" OFF)
 85 | option(WITH_LAYER_permute "" OFF)
 86 | option(WITH_LAYER_priorbox "" OFF)
 87 | option(WITH_LAYER_detectionoutput "" OFF)
 88 | option(WITH_LAYER_interp "" OFF)
 89 | option(WITH_LAYER_deconvolutiondepthwise "" OFF)
 90 | option(WITH_LAYER_shufflechannel "" OFF)
 91 | option(WITH_LAYER_instancenorm "" OFF)
 92 | option(WITH_LAYER_clip "" OFF)
 93 | option(WITH_LAYER_reorg "" OFF)
 94 | option(WITH_LAYER_yolodetectionoutput "" OFF)
 95 | option(WITH_LAYER_quantize "" OFF)
 96 | option(WITH_LAYER_dequantize "" OFF)
 97 | option(WITH_LAYER_yolov3detectionoutput "" OFF)
 98 | option(WITH_LAYER_psroipooling "" OFF)
 99 | option(WITH_LAYER_roialign "" OFF)
100 | option(WITH_LAYER_packing "" ON) # the only layer left enabled in this list
101 | option(WITH_LAYER_requantize "" OFF)
102 | option(WITH_LAYER_cast "" OFF)
103 | option(WITH_LAYER_hardsigmoid "" OFF)
104 | option(WITH_LAYER_selu "" OFF)
105 | option(WITH_LAYER_hardswish "" OFF)
106 | option(WITH_LAYER_noop "" OFF)
107 | option(WITH_LAYER_pixelshuffle "" OFF)
108 | option(WITH_LAYER_deepcopy "" OFF)
109 | option(WITH_LAYER_mish "" OFF)
110 | option(WITH_LAYER_statisticspooling "" OFF)
111 | option(WITH_LAYER_swish "" OFF)
112 | option(WITH_LAYER_gemm "" OFF)
113 | option(WITH_LAYER_groupnorm "" OFF)
114 | option(WITH_LAYER_layernorm "" OFF)
115 | option(WITH_LAYER_softplus "" OFF)
116 | option(WITH_LAYER_gru "" OFF)
117 | option(WITH_LAYER_multiheadattention "" OFF)
118 | option(WITH_LAYER_gelu "" OFF)
119 | option(WITH_LAYER_convolution1d "" OFF)
120 | option(WITH_LAYER_pooling1d "" OFF)
121 | option(WITH_LAYER_convolutiondepthwise1d "" OFF)
122 | option(WITH_LAYER_convolution3d "" OFF)
123 | option(WITH_LAYER_convolutiondepthwise3d "" OFF)
124 | option(WITH_LAYER_pooling3d "" OFF)
125 | option(WITH_LAYER_matmul "" OFF)
126 | option(WITH_LAYER_deconvolution1d "" OFF)
127 | option(WITH_LAYER_deconvolutiondepthwise1d "" OFF)
128 | option(WITH_LAYER_deconvolution3d "" OFF)
129 | option(WITH_LAYER_deconvolutiondepthwise3d "" OFF)
130 | option(WITH_LAYER_einsum "" OFF)
131 | option(WITH_LAYER_deformableconv2d "" OFF)
132 | option(WITH_LAYER_glu "" OFF)
133 | option(WITH_LAYER_fold "" OFF)
134 | option(WITH_LAYER_unfold "" OFF)
135 | option(WITH_LAYER_gridsample "" OFF)
136 | option(WITH_LAYER_cumulativesum "" OFF)
137 | option(WITH_LAYER_copyto "" OFF)
138 | option(WITH_LAYER_erf "" OFF)
139 | option(WITH_LAYER_diag "" OFF)
140 | option(WITH_LAYER_celu "" OFF)
141 | option(WITH_LAYER_shrink "" OFF)
142 | option(WITH_LAYER_rmsnorm "" OFF)
143 | option(WITH_LAYER_spectrogram "" OFF)
144 | option(WITH_LAYER_inversespectrogram "" OFF)
145 | option(WITH_LAYER_flip "" OFF)
146 | # Configure the ncnn sources in ./ncnn as part of this build.
147 | add_subdirectory(ncnn)
148 | # The benchmark executable consists of a single source file.
149 | add_executable(vkpeak vkpeak.cpp)
150 | # Request C++11 for the executable.
151 | set_property(TARGET vkpeak PROPERTY CXX_STANDARD 11)
152 | # Link the executable against the ncnn target.
153 | target_link_libraries(vkpeak ncnn)
154 | 


--------------------------------------------------------------------------------
/vkpeak.cpp:
--------------------------------------------------------------------------------
   1 | // vkpeak implemented with ncnn library
   2 | 
   3 | #include <benchmark.h>
   4 | #include <command.h>
   5 | #include <gpu.h>
   6 | #include <mat.h>
   7 | // REPEAT_N(stmt): REPEAT_1 stringifies its arguments; each larger macro places two copies of the next smaller one side by side, so REPEAT_N yields N adjacent string literals (which the compiler concatenates) holding the same statement text.
   8 | #define REPEAT_1(...) #__VA_ARGS__
   9 | #define REPEAT_2(...) REPEAT_1(__VA_ARGS__) REPEAT_1(__VA_ARGS__)
  10 | #define REPEAT_4(...) REPEAT_2(__VA_ARGS__) REPEAT_2(__VA_ARGS__)
  11 | #define REPEAT_8(...) REPEAT_4(__VA_ARGS__) REPEAT_4(__VA_ARGS__)
  12 | #define REPEAT_16(...) REPEAT_8(__VA_ARGS__) REPEAT_8(__VA_ARGS__)
  13 | // Shader text, scalar afp variant: a = afp(gx), b = afp(lx); each loop pass contains 16 serial "c = a * c + b;" steps on one accumulator, and float(c) is stored to c_blob_data[gx]. afp is not declared in this file - presumably supplied by ncnn when the shader is compiled (confirm).
  14 | static const char glsl_p1_data[] = R"(
  15 | #version 450
  16 | 
  17 | layout (constant_id = 0) const int loop = 1;
  18 | 
  19 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
  20 | 
  21 | void main()
  22 | {
  23 |     const uint gx = gl_GlobalInvocationID.x;
  24 |     const uint lx = gl_LocalInvocationID.x;
  25 | 
  26 |     afp c = afp(gx);
  27 | 
  28 |     afp a = c;
  29 |     afp b = afp(lx);
  30 | 
  31 |     for (int i = 0; i < loop; i++)
  32 |     {)"
  33 |         REPEAT_16(c = a * c + b;)
  34 |     R"(}
  35 | 
  36 |     c_blob_data[gx] = float(c);
  37 | }
  38 | )";
  39 | // Shader text, scalar afp variant with two accumulators: each loop pass contains 8 copies of the c0/c1 update pair (16 multiply-adds), and float(c0 + c1) is stored to c_blob_data[gx]. afp is not declared in this file - presumably supplied by ncnn (confirm).
  40 | static const char glsl_p1_dual_data[] = R"(
  41 | #version 450
  42 | 
  43 | layout (constant_id = 0) const int loop = 1;
  44 | 
  45 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
  46 | 
  47 | void main()
  48 | {
  49 |     const uint gx = gl_GlobalInvocationID.x;
  50 |     const uint lx = gl_LocalInvocationID.x;
  51 | 
  52 |     afp c0 = afp(gx);
  53 |     afp c1 = afp(lx);
  54 | 
  55 |     afp a = c0;
  56 |     afp b = c1;
  57 | 
  58 |     for (int i = 0; i < loop; i++)
  59 |     {)"
  60 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
  61 |     R"(}
  62 | 
  63 |     c0 = c0 + c1;
  64 |     c_blob_data[gx] = float(c0);
  65 | }
  66 | )";
  67 | // Shader text, afpvec4 variant: same 16-step "c = a * c + b;" chain as the scalar shader but on 4-component vectors, with constant per-lane offsets added to a and b; the four lanes of c are summed into one float for the store. afpvec4 is not declared in this file - presumably supplied by ncnn (confirm).
  68 | static const char glsl_p4_data[] = R"(
  69 | #version 450
  70 | 
  71 | layout (constant_id = 0) const int loop = 1;
  72 | 
  73 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
  74 | 
  75 | void main()
  76 | {
  77 |     const uint gx = gl_GlobalInvocationID.x;
  78 |     const uint lx = gl_LocalInvocationID.x;
  79 | 
  80 |     afpvec4 c = afpvec4(gx);
  81 | 
  82 |     afpvec4 a = c + afpvec4(0,1,2,-3);
  83 |     afpvec4 b = afpvec4(lx) + afpvec4(2,3,5,-7);
  84 | 
  85 |     for (int i = 0; i < loop; i++)
  86 |     {)"
  87 |         REPEAT_16(c = a * c + b;)
  88 |     R"(}
  89 | 
  90 |     c_blob_data[gx] = float((c[0] + c[1]) + (c[2] + c[3]));
  91 | }
  92 | )";
  93 | // Shader text, afpvec4 variant with two accumulators: 8 copies of the c0/c1 vector update pair per loop pass; c0 + c1 is reduced across its four lanes to one float for the store. afpvec4 is not declared in this file - presumably supplied by ncnn (confirm).
  94 | static const char glsl_p4_dual_data[] = R"(
  95 | #version 450
  96 | 
  97 | layout (constant_id = 0) const int loop = 1;
  98 | 
  99 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 100 | 
 101 | void main()
 102 | {
 103 |     const uint gx = gl_GlobalInvocationID.x;
 104 |     const uint lx = gl_LocalInvocationID.x;
 105 | 
 106 |     afpvec4 c0 = afpvec4(gx);
 107 |     afpvec4 c1 = afpvec4(lx);
 108 | 
 109 |     afpvec4 a = c0 + afpvec4(0,1,2,-3);
 110 |     afpvec4 b = c1 + afpvec4(2,3,5,-7);
 111 | 
 112 |     for (int i = 0; i < loop; i++)
 113 |     {)"
 114 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 115 |     R"(}
 116 | 
 117 |     c0 = c0 + c1;
 118 |     c_blob_data[gx] = float((c0[0] + c0[1]) + (c0[2] + c0[3]));
 119 | }
 120 | )";
 121 | // Shader text, scalar double variant: 16 serial "c = a * c + b;" steps per loop pass on one double accumulator; the result is stored to a double output buffer at index gx.
 122 | static const char glsl_fp64_p1_data[] = R"(
 123 | #version 450
 124 | 
 125 | layout (constant_id = 0) const int loop = 1;
 126 | 
 127 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; };
 128 | 
 129 | void main()
 130 | {
 131 |     const uint gx = gl_GlobalInvocationID.x;
 132 |     const uint lx = gl_LocalInvocationID.x;
 133 | 
 134 |     double c = double(gx);
 135 | 
 136 |     double a = c;
 137 |     double b = double(lx);
 138 | 
 139 |     for (int i = 0; i < loop; i++)
 140 |     {)"
 141 |         REPEAT_16(c = a * c + b;)
 142 |     R"(}
 143 | 
 144 |     c_blob_data[gx] = c;
 145 | }
 146 | )";
 147 | // Shader text, scalar double variant with two accumulators: 8 copies of the c0/c1 update pair per loop pass; c0 + c1 is stored to the double output buffer.
 148 | static const char glsl_fp64_p1_dual_data[] = R"(
 149 | #version 450
 150 | 
 151 | layout (constant_id = 0) const int loop = 1;
 152 | 
 153 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; };
 154 | 
 155 | void main()
 156 | {
 157 |     const uint gx = gl_GlobalInvocationID.x;
 158 |     const uint lx = gl_LocalInvocationID.x;
 159 | 
 160 |     double c0 = double(gx);
 161 |     double c1 = double(lx);
 162 | 
 163 |     double a = c0;
 164 |     double b = c1;
 165 | 
 166 |     for (int i = 0; i < loop; i++)
 167 |     {)"
 168 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 169 |     R"(}
 170 | 
 171 |     c0 = c0 + c1;
 172 |     c_blob_data[gx] = c0;
 173 | }
 174 | )";
 175 | // Shader text, dvec4 variant: 16 serial vector "c = a * c + b;" steps per loop pass, with constant per-lane offsets on a and b; the four lanes of c are summed into one double for the store.
 176 | static const char glsl_fp64_p4_data[] = R"(
 177 | #version 450
 178 | 
 179 | layout (constant_id = 0) const int loop = 1;
 180 | 
 181 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; };
 182 | 
 183 | void main()
 184 | {
 185 |     const uint gx = gl_GlobalInvocationID.x;
 186 |     const uint lx = gl_LocalInvocationID.x;
 187 | 
 188 |     dvec4 c = dvec4(gx);
 189 | 
 190 |     dvec4 a = c + dvec4(0,1,2,-3);
 191 |     dvec4 b = dvec4(lx) + dvec4(2,3,5,-7);
 192 | 
 193 |     for (int i = 0; i < loop; i++)
 194 |     {)"
 195 |         REPEAT_16(c = a * c + b;)
 196 |     R"(}
 197 | 
 198 |     c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]);
 199 | }
 200 | )";
 201 | // Shader text, dvec4 variant with two accumulators: 8 copies of the c0/c1 vector update pair per loop pass; c0 + c1 is reduced across its four lanes to one double for the store.
 202 | static const char glsl_fp64_p4_dual_data[] = R"(
 203 | #version 450
 204 | 
 205 | layout (constant_id = 0) const int loop = 1;
 206 | 
 207 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; };
 208 | 
 209 | void main()
 210 | {
 211 |     const uint gx = gl_GlobalInvocationID.x;
 212 |     const uint lx = gl_LocalInvocationID.x;
 213 | 
 214 |     dvec4 c0 = dvec4(gx);
 215 |     dvec4 c1 = dvec4(lx);
 216 | 
 217 |     dvec4 a = c0 + dvec4(0,1,2,-3);
 218 |     dvec4 b = c1 + dvec4(2,3,5,-7);
 219 | 
 220 |     for (int i = 0; i < loop; i++)
 221 |     {)"
 222 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 223 |     R"(}
 224 | 
 225 |     c0 = c0 + c1;
 226 |     c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]);
 227 | }
 228 | )";
 229 | // Shader text, scalar int variant: 16 serial "c = a * c + b;" steps per loop pass on one int accumulator; the result is stored to an int output buffer at index gx.
 230 | static const char glsl_int32_p1_data[] = R"(
 231 | #version 450
 232 | 
 233 | layout (constant_id = 0) const int loop = 1;
 234 | 
 235 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 236 | 
 237 | void main()
 238 | {
 239 |     const uint gx = gl_GlobalInvocationID.x;
 240 |     const uint lx = gl_LocalInvocationID.x;
 241 | 
 242 |     int c = int(gx);
 243 | 
 244 |     int a = c;
 245 |     int b = int(lx);
 246 | 
 247 |     for (int i = 0; i < loop; i++)
 248 |     {)"
 249 |         REPEAT_16(c = a * c + b;)
 250 |     R"(}
 251 | 
 252 |     c_blob_data[gx] = c;
 253 | }
 254 | )";
 255 | // Shader text, scalar int variant with two accumulators: 8 copies of the c0/c1 update pair per loop pass; c0 + c1 is stored to the int output buffer.
 256 | static const char glsl_int32_p1_dual_data[] = R"(
 257 | #version 450
 258 | 
 259 | layout (constant_id = 0) const int loop = 1;
 260 | 
 261 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 262 | 
 263 | void main()
 264 | {
 265 |     const uint gx = gl_GlobalInvocationID.x;
 266 |     const uint lx = gl_LocalInvocationID.x;
 267 | 
 268 |     int c0 = int(gx);
 269 |     int c1 = int(lx);
 270 | 
 271 |     int a = c0;
 272 |     int b = c1;
 273 | 
 274 |     for (int i = 0; i < loop; i++)
 275 |     {)"
 276 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 277 |     R"(}
 278 | 
 279 |     c0 = c0 + c1;
 280 |     c_blob_data[gx] = c0;
 281 | }
 282 | )";
 283 | // Shader text, ivec4 variant: 16 serial vector "c = a * c + b;" steps per loop pass, with constant per-lane offsets on a and b; the four lanes of c are summed into one int for the store.
 284 | static const char glsl_int32_p4_data[] = R"(
 285 | #version 450
 286 | 
 287 | layout (constant_id = 0) const int loop = 1;
 288 | 
 289 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 290 | 
 291 | void main()
 292 | {
 293 |     const uint gx = gl_GlobalInvocationID.x;
 294 |     const uint lx = gl_LocalInvocationID.x;
 295 | 
 296 |     ivec4 c = ivec4(gx);
 297 | 
 298 |     ivec4 a = c + ivec4(0,1,2,-3);
 299 |     ivec4 b = ivec4(lx) + ivec4(2,3,5,-7);
 300 | 
 301 |     for (int i = 0; i < loop; i++)
 302 |     {)"
 303 |         REPEAT_16(c = a * c + b;)
 304 |     R"(}
 305 | 
 306 |     c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]);
 307 | }
 308 | )";
 309 | // Shader text, ivec4 variant with two accumulators: 8 copies of the c0/c1 vector update pair per loop pass; c0 + c1 is reduced across its four lanes to one int for the store.
 310 | static const char glsl_int32_p4_dual_data[] = R"(
 311 | #version 450
 312 | 
 313 | layout (constant_id = 0) const int loop = 1;
 314 | 
 315 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 316 | 
 317 | void main()
 318 | {
 319 |     const uint gx = gl_GlobalInvocationID.x;
 320 |     const uint lx = gl_LocalInvocationID.x;
 321 | 
 322 |     ivec4 c0 = ivec4(gx);
 323 |     ivec4 c1 = ivec4(lx);
 324 | 
 325 |     ivec4 a = c0 + ivec4(0,1,2,-3);
 326 |     ivec4 b = c1 + ivec4(2,3,5,-7);
 327 | 
 328 |     for (int i = 0; i < loop; i++)
 329 |     {)"
 330 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 331 |     R"(}
 332 | 
 333 |     c0 = c0 + c1;
 334 |     c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]);
 335 | }
 336 | )";
 337 | // Shader text, scalar int16_t variant (requires GL_EXT_shader_explicit_arithmetic_types_int16): 16 serial "c = a * c + b;" steps per loop pass; the 16-bit result is converted to int for the store.
 338 | static const char glsl_int16_p1_data[] = R"(
 339 | #version 450
 340 | 
 341 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 342 | 
 343 | layout (constant_id = 0) const int loop = 1;
 344 | 
 345 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 346 | 
 347 | void main()
 348 | {
 349 |     const uint gx = gl_GlobalInvocationID.x;
 350 |     const uint lx = gl_LocalInvocationID.x;
 351 | 
 352 |     int16_t c = int16_t(gx);
 353 | 
 354 |     int16_t a = c;
 355 |     int16_t b = int16_t(lx);
 356 | 
 357 |     for (int i = 0; i < loop; i++)
 358 |     {)"
 359 |         REPEAT_16(c = a * c + b;)
 360 |     R"(}
 361 | 
 362 |     c_blob_data[gx] = int(c);
 363 | }
 364 | )";
 365 | // Shader text, scalar int16_t variant with two accumulators (requires GL_EXT_shader_explicit_arithmetic_types_int16): 8 copies of the c0/c1 update pair per loop pass; c0 + c1 is converted to int for the store.
 366 | static const char glsl_int16_p1_dual_data[] = R"(
 367 | #version 450
 368 | 
 369 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 370 | 
 371 | layout (constant_id = 0) const int loop = 1;
 372 | 
 373 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 374 | 
 375 | void main()
 376 | {
 377 |     const uint gx = gl_GlobalInvocationID.x;
 378 |     const uint lx = gl_LocalInvocationID.x;
 379 | 
 380 |     int16_t c0 = int16_t(gx);
 381 |     int16_t c1 = int16_t(lx);
 382 | 
 383 |     int16_t a = c0;
 384 |     int16_t b = c1;
 385 | 
 386 |     for (int i = 0; i < loop; i++)
 387 |     {)"
 388 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 389 |     R"(}
 390 | 
 391 |     c0 = c0 + c1;
 392 |     c_blob_data[gx] = int(c0);
 393 | }
 394 | )";
 395 | // Shader text, i16vec4 variant (requires GL_EXT_shader_explicit_arithmetic_types_int16): 16 serial vector "c = a * c + b;" steps per loop pass, with constant per-lane offsets on a and b; the lane sum is converted to int for the store.
 396 | static const char glsl_int16_p4_data[] = R"(
 397 | #version 450
 398 | 
 399 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 400 | 
 401 | layout (constant_id = 0) const int loop = 1;
 402 | 
 403 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 404 | 
 405 | void main()
 406 | {
 407 |     const uint gx = gl_GlobalInvocationID.x;
 408 |     const uint lx = gl_LocalInvocationID.x;
 409 | 
 410 |     i16vec4 c = i16vec4(gx);
 411 | 
 412 |     i16vec4 a = c + i16vec4(0,1,2,-3);
 413 |     i16vec4 b = i16vec4(lx) + i16vec4(2,3,5,-7);
 414 | 
 415 |     for (int i = 0; i < loop; i++)
 416 |     {)"
 417 |         REPEAT_16(c = a * c + b;)
 418 |     R"(}
 419 | 
 420 |     c_blob_data[gx] = int((c[0] + c[1]) + (c[2] + c[3]));
 421 | }
 422 | )";
 423 | // GLSL compute shader source: i16vec4 multiply-add with two accumulators; each pass of the `loop` loop runs the REPEAT_8 body (c0 = a*c0 + b; c1 = a*c1 + b; macro defined outside this excerpt), then the lane sum of (c0 + c1) is written as int to c_blob_data[gx].
 424 | static const char glsl_int16_p4_dual_data[] = R"(
 425 | #version 450
 426 | 
 427 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 428 | 
 429 | layout (constant_id = 0) const int loop = 1;
 430 | 
 431 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 432 | 
 433 | void main()
 434 | {
 435 |     const uint gx = gl_GlobalInvocationID.x;
 436 |     const uint lx = gl_LocalInvocationID.x;
 437 | 
 438 |     i16vec4 c0 = i16vec4(gx);
 439 |     i16vec4 c1 = i16vec4(lx);
 440 | 
 441 |     i16vec4 a = c0 + i16vec4(0,1,2,-3);
 442 |     i16vec4 b = c1 + i16vec4(2,3,5,-7);
 443 | 
 444 |     for (int i = 0; i < loop; i++)
 445 |     {)"
 446 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 447 |     R"(}
 448 | 
 449 |     c0 = c0 + c1;
 450 |     c_blob_data[gx] = int((c0[0] + c0[1]) + (c0[2] + c0[3]));
 451 | }
 452 | )";
 453 | // GLSL compute shader source: int64 scalar multiply-add with one accumulator; each pass of the `loop` loop runs the REPEAT_16 body (c = a*c + b; macro defined outside this excerpt), then c is written to the int64_t buffer at c_blob_data[gx].
 454 | static const char glsl_int64_p1_data[] = R"(
 455 | #version 450
 456 | 
 457 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 458 | 
 459 | layout (constant_id = 0) const int loop = 1;
 460 | 
 461 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; };
 462 | 
 463 | void main()
 464 | {
 465 |     const uint gx = gl_GlobalInvocationID.x;
 466 |     const uint lx = gl_LocalInvocationID.x;
 467 | 
 468 |     int64_t c = int64_t(gx);
 469 | 
 470 |     int64_t a = c;
 471 |     int64_t b = int64_t(lx);
 472 | 
 473 |     for (int i = 0; i < loop; i++)
 474 |     {)"
 475 |         REPEAT_16(c = a * c + b;)
 476 |     R"(}
 477 | 
 478 |     c_blob_data[gx] = c;
 479 | }
 480 | )";
 481 | // GLSL compute shader source: int64 scalar multiply-add with two accumulators; each pass of the `loop` loop runs the REPEAT_8 body (c0 = a*c0 + b; c1 = a*c1 + b; macro defined outside this excerpt), then c0 + c1 is written to the int64_t buffer at c_blob_data[gx].
 482 | static const char glsl_int64_p1_dual_data[] = R"(
 483 | #version 450
 484 | 
 485 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 486 | 
 487 | layout (constant_id = 0) const int loop = 1;
 488 | 
 489 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; };
 490 | 
 491 | void main()
 492 | {
 493 |     const uint gx = gl_GlobalInvocationID.x;
 494 |     const uint lx = gl_LocalInvocationID.x;
 495 | 
 496 |     int64_t c0 = int64_t(gx);
 497 |     int64_t c1 = int64_t(lx);
 498 | 
 499 |     int64_t a = c0;
 500 |     int64_t b = c1;
 501 | 
 502 |     for (int i = 0; i < loop; i++)
 503 |     {)"
 504 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 505 |     R"(}
 506 | 
 507 |     c0 = c0 + c1;
 508 |     c_blob_data[gx] = c0;
 509 | }
 510 | )";
 511 | // GLSL compute shader source: i64vec4 multiply-add with one accumulator; each pass of the `loop` loop runs the REPEAT_16 body (c = a*c + b; macro defined outside this excerpt), then the sum of the four lanes of c is written to the int64_t buffer at c_blob_data[gx].
 512 | static const char glsl_int64_p4_data[] = R"(
 513 | #version 450
 514 | 
 515 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 516 | 
 517 | layout (constant_id = 0) const int loop = 1;
 518 | 
 519 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; };
 520 | 
 521 | void main()
 522 | {
 523 |     const uint gx = gl_GlobalInvocationID.x;
 524 |     const uint lx = gl_LocalInvocationID.x;
 525 | 
 526 |     i64vec4 c = i64vec4(gx);
 527 | 
 528 |     i64vec4 a = c + i64vec4(0,1,2,-3);
 529 |     i64vec4 b = i64vec4(lx) + i64vec4(2,3,5,-7);
 530 | 
 531 |     for (int i = 0; i < loop; i++)
 532 |     {)"
 533 |         REPEAT_16(c = a * c + b;)
 534 |     R"(}
 535 | 
 536 |     c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]);
 537 | }
 538 | )";
 539 | // GLSL compute shader source: i64vec4 multiply-add with two accumulators; each pass of the `loop` loop runs the REPEAT_8 body (c0 = a*c0 + b; c1 = a*c1 + b; macro defined outside this excerpt), then the lane sum of (c0 + c1) is written to the int64_t buffer at c_blob_data[gx].
 540 | static const char glsl_int64_p4_dual_data[] = R"(
 541 | #version 450
 542 | 
 543 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 544 | 
 545 | layout (constant_id = 0) const int loop = 1;
 546 | 
 547 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; };
 548 | 
 549 | void main()
 550 | {
 551 |     const uint gx = gl_GlobalInvocationID.x;
 552 |     const uint lx = gl_LocalInvocationID.x;
 553 | 
 554 |     i64vec4 c0 = i64vec4(gx);
 555 |     i64vec4 c1 = i64vec4(lx);
 556 | 
 557 |     i64vec4 a = c0 + i64vec4(0,1,2,-3);
 558 |     i64vec4 b = c1 + i64vec4(2,3,5,-7);
 559 | 
 560 |     for (int i = 0; i < loop; i++)
 561 |     {)"
 562 |         REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;)
 563 |     R"(}
 564 | 
 565 |     c0 = c0 + c1;
 566 |     c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]);
 567 | }
 568 | )";
 569 | // GLSL compute shader source: packed 4x8-bit integer dot product via dotPacked4x8AccSatEXT(a, b, c) with one accumulator; each pass of the `loop` loop runs the REPEAT_16 body (macro defined outside this excerpt), then c is written to c_blob_data[gx].
 570 | static const char glsl_int8_p4_data[] = R"(
 571 | #version 450
 572 | 
 573 | #extension GL_EXT_integer_dot_product: require
 574 | 
 575 | layout (constant_id = 0) const int loop = 1;
 576 | 
 577 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 578 | 
 579 | void main()
 580 | {
 581 |     const uint gx = gl_GlobalInvocationID.x;
 582 |     const uint lx = gl_LocalInvocationID.x;
 583 | 
 584 |     int c = int(gx);
 585 | 
 586 |     int a = int(gx);
 587 |     int b = int(lx);
 588 | 
 589 |     for (int i = 0; i < loop; i++)
 590 |     {)"
 591 |         REPEAT_16(c = dotPacked4x8AccSatEXT(a, b, c);)
 592 |     R"(}
 593 | 
 594 |     c_blob_data[gx] = c;
 595 | }
 596 | )";
 597 | // GLSL compute shader source: packed 4x8-bit integer dot product via dotPacked4x8AccSatEXT with two accumulators (c0, c1); each pass of the `loop` loop runs the REPEAT_8 body (macro defined outside this excerpt), then c0 + c1 is written to c_blob_data[gx].
 598 | static const char glsl_int8_p4_dual_data[] = R"(
 599 | #version 450
 600 | 
 601 | #extension GL_EXT_integer_dot_product: require
 602 | 
 603 | layout (constant_id = 0) const int loop = 1;
 604 | 
 605 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 606 | 
 607 | void main()
 608 | {
 609 |     const uint gx = gl_GlobalInvocationID.x;
 610 |     const uint lx = gl_LocalInvocationID.x;
 611 | 
 612 |     int c0 = int(gx);
 613 |     int c1 = int(lx);
 614 | 
 615 |     int a = int(gx);
 616 |     int b = int(lx);
 617 | 
 618 |     for (int i = 0; i < loop; i++)
 619 |     {)"
 620 |         REPEAT_8(c0 = dotPacked4x8AccSatEXT(a, b, c0); c1 = dotPacked4x8AccSatEXT(a, b, c1);)
 621 |     R"(}
 622 | 
 623 |     c0 = c0 + c1;
 624 |     c_blob_data[gx] = c0;
 625 | }
 626 | )";
 627 | // GLSL compute shader source: bfloat16 4-wide dot(); `a` is kept as u16vec4 bits and reinterpreted per call, and each result's bits are written back into one component of `a` (x, y, z, w in turn) so every dot() consumes the previous result; float(c) is written to c_blob_data[gx]. REPEAT_4 is defined outside this excerpt.
 628 | static const char glsl_bf16_p4_data[] = R"(
 629 | #version 450
 630 | 
 631 | #extension GL_EXT_shader_explicit_arithmetic_types : require
 632 | #extension GL_EXT_bfloat16: require
 633 | 
 634 | layout (constant_id = 0) const int loop = 1;
 635 | 
 636 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 637 | 
 638 | void main()
 639 | {
 640 |     const uint gx = gl_GlobalInvocationID.x;
 641 |     const uint lx = gl_LocalInvocationID.x;
 642 | 
 643 |     bfloat16_t c = bfloat16_t(gx);
 644 | 
 645 |     u16vec4 a = uint16_t(gx) + u16vec4(0,1,2,3);
 646 |     bf16vec4 b = uintBitsToBFloat16EXT(uint16_t(lx) + u16vec4(2,3,5,7));
 647 | 
 648 |     for (int i = 0; i < loop; i++)
 649 |     {)"
 650 |         REPEAT_4(c = dot(uintBitsToBFloat16EXT(a), b); a.x = bfloat16BitsToUintEXT(c);
 651 |         c = dot(uintBitsToBFloat16EXT(a), b); a.y = bfloat16BitsToUintEXT(c);
 652 |         c = dot(uintBitsToBFloat16EXT(a), b); a.z = bfloat16BitsToUintEXT(c);
 653 |         c = dot(uintBitsToBFloat16EXT(a), b); a.w = bfloat16BitsToUintEXT(c);)
 654 |     R"(}
 655 | 
 656 |     c_blob_data[gx] = float(c);
 657 | }
 658 | )";
 659 | // GLSL compute shader source: bfloat16 4-wide dot() with two alternating chains (a0/c0 and a1/c1) sharing `b`; results are fed back as bits into a0.x, a1.y, a0.z, a1.w, and float(c0) + float(c1) is written to c_blob_data[gx]. REPEAT_4 is defined outside this excerpt.
 660 | static const char glsl_bf16_p4_dual_data[] = R"(
 661 | #version 450
 662 | 
 663 | #extension GL_EXT_shader_explicit_arithmetic_types : require
 664 | #extension GL_EXT_bfloat16: require
 665 | 
 666 | layout (constant_id = 0) const int loop = 1;
 667 | 
 668 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 669 | 
 670 | void main()
 671 | {
 672 |     const uint gx = gl_GlobalInvocationID.x;
 673 |     const uint lx = gl_LocalInvocationID.x;
 674 | 
 675 |     bfloat16_t c0 = bfloat16_t(gx);
 676 |     bfloat16_t c1 = bfloat16_t(lx);
 677 | 
 678 |     u16vec4 a0 = uint16_t(gx) + u16vec4(0,1,2,3);
 679 |     u16vec4 a1 = uint16_t(gx) + u16vec4(10,21,32,43);
 680 |     bf16vec4 b = uintBitsToBFloat16EXT(uint16_t(lx) + u16vec4(2,3,5,7));
 681 | 
 682 |     for (int i = 0; i < loop; i++)
 683 |     {)"
 684 |         REPEAT_4(c0 = dot(uintBitsToBFloat16EXT(a0), b); a0.x = bfloat16BitsToUintEXT(c0);
 685 |         c1 = dot(uintBitsToBFloat16EXT(a1), b); a1.y = bfloat16BitsToUintEXT(c1);
 686 |         c0 = dot(uintBitsToBFloat16EXT(a0), b); a0.z = bfloat16BitsToUintEXT(c0);
 687 |         c1 = dot(uintBitsToBFloat16EXT(a1), b); a1.w = bfloat16BitsToUintEXT(c1);)
 688 |     R"(}
 689 | 
 690 |     c_blob_data[gx] = float(c0) + float(c1);
 691 | }
 692 | )";
 693 | // GLSL compute shader source: cooperative-matrix multiply-add with fp16 A/B and an fp16 accumulator; the KHR or NV code path is chosen by the ncnn_VK_*_cooperative_matrix preprocessor symbols, M/N/K/SCOPE are specialization constants 1-4. NOTE(review): the store's "/ 2" on offset and stride looks like it accounts for two fp16 values per float buffer element - confirm.
 694 | static const char glsl_fp16_matrix_data[] = R"(
 695 | #version 450
 696 | 
 697 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 698 | #extension GL_KHR_memory_scope_semantics: require
 699 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 700 | #if ncnn_VK_KHR_cooperative_matrix
 701 | #extension GL_KHR_cooperative_matrix: require
 702 | #elif ncnn_VK_NV_cooperative_matrix
 703 | #extension GL_NV_cooperative_matrix: require
 704 | #endif
 705 | 
 706 | layout (constant_id = 0) const int loop = 1;
 707 | layout (constant_id = 1) const int M = 1;
 708 | layout (constant_id = 2) const int N = 1;
 709 | layout (constant_id = 3) const int K = 1;
 710 | layout (constant_id = 4) const int SCOPE = 3;
 711 | 
 712 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 713 | 
 714 | void main()
 715 | {
 716 |     const uint gx = gl_GlobalInvocationID.x;
 717 |     const uint lx = gl_LocalInvocationID.x;
 718 | 
 719 | #if ncnn_VK_KHR_cooperative_matrix
 720 |     coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
 721 |     coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
 722 | 
 723 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
 724 | 
 725 |     for (int i = 0; i < loop; i++)
 726 |     {)"
 727 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
 728 |     R"(}
 729 | 
 730 |     coopMatStore(c, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
 731 | #elif ncnn_VK_NV_cooperative_matrix
 732 |     fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx));
 733 |     fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx));
 734 | 
 735 |     fcoopmatNV<16, SCOPE, M, N> c = fcoopmatNV<16, SCOPE, M, N>(float(gx));
 736 | 
 737 |     for (int i = 0; i < loop; i++)
 738 |     {)"
 739 |         REPEAT_16(c = coopMatMulAddNV(a, b, c);)
 740 |     R"(}
 741 | 
 742 |     coopMatStoreNV(c, c_blob_data, gx * (M * N) / 2, N / 2, false);
 743 | #endif
 744 | }
 745 | )";
 746 | // GLSL compute shader source: cooperative-matrix multiply-add with fp16 A/B and two fp16 accumulators (c0, c1) updated alternately, summed before the store; KHR or NV path chosen by the ncnn_VK_*_cooperative_matrix symbols. NOTE(review): the store's "/ 2" looks like it accounts for two fp16 values per float buffer element - confirm.
 747 | static const char glsl_fp16_matrix_dual_data[] = R"(
 748 | #version 450
 749 | 
 750 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 751 | #extension GL_KHR_memory_scope_semantics: require
 752 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 753 | #if ncnn_VK_KHR_cooperative_matrix
 754 | #extension GL_KHR_cooperative_matrix: require
 755 | #elif ncnn_VK_NV_cooperative_matrix
 756 | #extension GL_NV_cooperative_matrix: require
 757 | #endif
 758 | 
 759 | layout (constant_id = 0) const int loop = 1;
 760 | layout (constant_id = 1) const int M = 1;
 761 | layout (constant_id = 2) const int N = 1;
 762 | layout (constant_id = 3) const int K = 1;
 763 | layout (constant_id = 4) const int SCOPE = 3;
 764 | 
 765 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 766 | 
 767 | void main()
 768 | {
 769 |     const uint gx = gl_GlobalInvocationID.x;
 770 |     const uint lx = gl_LocalInvocationID.x;
 771 | 
 772 | #if ncnn_VK_KHR_cooperative_matrix
 773 |     coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
 774 |     coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
 775 | 
 776 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
 777 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
 778 | 
 779 |     for (int i = 0; i < loop; i++)
 780 |     {)"
 781 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
 782 |     R"(}
 783 | 
 784 |     c0 = c0 + c1;
 785 |     coopMatStore(c0, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
 786 | #elif ncnn_VK_NV_cooperative_matrix
 787 |     fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx));
 788 |     fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx));
 789 | 
 790 |     fcoopmatNV<16, SCOPE, M, N> c0 = fcoopmatNV<16, SCOPE, M, N>(float(gx));
 791 |     fcoopmatNV<16, SCOPE, M, N> c1 = fcoopmatNV<16, SCOPE, M, N>(float(lx));
 792 | 
 793 |     for (int i = 0; i < loop; i++)
 794 |     {)"
 795 |         REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);)
 796 |     R"(}
 797 | 
 798 |     c0 = c0 + c1;
 799 |     coopMatStoreNV(c0, c_blob_data, gx * (M * N) / 2, N / 2, false);
 800 | #endif
 801 | }
 802 | )";
 803 | // GLSL compute shader source: cooperative-matrix multiply-add with fp16 A/B and a 32-bit float accumulator; KHR or NV path chosen by the ncnn_VK_*_cooperative_matrix symbols, M/N/K/SCOPE are specialization constants 1-4; the accumulator is stored at element offset gx * (M * N) with stride N.
 804 | static const char glsl_fp16_fp32_matrix_data[] = R"(
 805 | #version 450
 806 | 
 807 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 808 | #extension GL_KHR_memory_scope_semantics: require
 809 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 810 | #if ncnn_VK_KHR_cooperative_matrix
 811 | #extension GL_KHR_cooperative_matrix: require
 812 | #elif ncnn_VK_NV_cooperative_matrix
 813 | #extension GL_NV_cooperative_matrix: require
 814 | #endif
 815 | 
 816 | layout (constant_id = 0) const int loop = 1;
 817 | layout (constant_id = 1) const int M = 1;
 818 | layout (constant_id = 2) const int N = 1;
 819 | layout (constant_id = 3) const int K = 1;
 820 | layout (constant_id = 4) const int SCOPE = 3;
 821 | 
 822 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 823 | 
 824 | void main()
 825 | {
 826 |     const uint gx = gl_GlobalInvocationID.x;
 827 |     const uint lx = gl_LocalInvocationID.x;
 828 | 
 829 | #if ncnn_VK_KHR_cooperative_matrix
 830 |     coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
 831 |     coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
 832 | 
 833 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
 834 | 
 835 |     for (int i = 0; i < loop; i++)
 836 |     {)"
 837 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
 838 |     R"(}
 839 | 
 840 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
 841 | #elif ncnn_VK_NV_cooperative_matrix
 842 |     fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx));
 843 |     fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx));
 844 | 
 845 |     fcoopmatNV<32, SCOPE, M, N> c = fcoopmatNV<32, SCOPE, M, N>(float(gx));
 846 | 
 847 |     for (int i = 0; i < loop; i++)
 848 |     {)"
 849 |         REPEAT_16(c = coopMatMulAddNV(a, b, c);)
 850 |     R"(}
 851 | 
 852 |     coopMatStoreNV(c, c_blob_data, gx * (M * N), N, false);
 853 | #endif
 854 | }
 855 | )";
 856 | // GLSL compute shader source: cooperative-matrix multiply-add with fp16 A/B and two 32-bit float accumulators (c0, c1) updated alternately; KHR or NV path chosen by the ncnn_VK_*_cooperative_matrix symbols; c0 + c1 is stored at element offset gx * (M * N) with stride N.
 857 | static const char glsl_fp16_fp32_matrix_dual_data[] = R"(
 858 | #version 450
 859 | 
 860 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 861 | #extension GL_KHR_memory_scope_semantics: require
 862 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 863 | #if ncnn_VK_KHR_cooperative_matrix
 864 | #extension GL_KHR_cooperative_matrix: require
 865 | #elif ncnn_VK_NV_cooperative_matrix
 866 | #extension GL_NV_cooperative_matrix: require
 867 | #endif
 868 | 
 869 | layout (constant_id = 0) const int loop = 1;
 870 | layout (constant_id = 1) const int M = 1;
 871 | layout (constant_id = 2) const int N = 1;
 872 | layout (constant_id = 3) const int K = 1;
 873 | layout (constant_id = 4) const int SCOPE = 3;
 874 | 
 875 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
 876 | 
 877 | void main()
 878 | {
 879 |     const uint gx = gl_GlobalInvocationID.x;
 880 |     const uint lx = gl_LocalInvocationID.x;
 881 | 
 882 | #if ncnn_VK_KHR_cooperative_matrix
 883 |     coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<float16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
 884 |     coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<float16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
 885 | 
 886 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
 887 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
 888 | 
 889 |     for (int i = 0; i < loop; i++)
 890 |     {)"
 891 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
 892 |     R"(}
 893 | 
 894 |     c0 = c0 + c1;
 895 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
 896 | #elif ncnn_VK_NV_cooperative_matrix
 897 |     fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx));
 898 |     fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx));
 899 | 
 900 |     fcoopmatNV<32, SCOPE, M, N> c0 = fcoopmatNV<32, SCOPE, M, N>(float(gx));
 901 |     fcoopmatNV<32, SCOPE, M, N> c1 = fcoopmatNV<32, SCOPE, M, N>(float(lx));
 902 | 
 903 |     for (int i = 0; i < loop; i++)
 904 |     {)"
 905 |         REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);)
 906 |     R"(}
 907 | 
 908 |     c0 = c0 + c1;
 909 |     coopMatStoreNV(c0, c_blob_data, gx * (M * N), N, false);
 910 | #endif
 911 | }
 912 | )";
 913 | // GLSL compute shader source: cooperative-matrix multiply-add with int8 A/B and a 32-bit int accumulator; the KHR path uses coopmat<int8_t,...>, the NV path uses icoopmatNV under GL_NV_integer_cooperative_matrix; the accumulator is stored to the int buffer at element offset gx * (M * N) with stride N.
 914 | static const char glsl_int8_matrix_data[] = R"(
 915 | #version 450
 916 | 
 917 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 918 | #extension GL_KHR_memory_scope_semantics: require
 919 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 920 | #if ncnn_VK_KHR_cooperative_matrix
 921 | #extension GL_KHR_cooperative_matrix: require
 922 | #elif ncnn_VK_NV_cooperative_matrix
 923 | #extension GL_NV_integer_cooperative_matrix : require
 924 | #endif
 925 | 
 926 | layout (constant_id = 0) const int loop = 1;
 927 | layout (constant_id = 1) const int M = 1;
 928 | layout (constant_id = 2) const int N = 1;
 929 | layout (constant_id = 3) const int K = 1;
 930 | layout (constant_id = 4) const int SCOPE = 3;
 931 | 
 932 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 933 | 
 934 | void main()
 935 | {
 936 |     const uint gx = gl_GlobalInvocationID.x;
 937 |     const uint lx = gl_LocalInvocationID.x;
 938 | 
 939 | #if ncnn_VK_KHR_cooperative_matrix
 940 |     coopmat<int8_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<int8_t, SCOPE, M, K, gl_MatrixUseA>(int8_t(gx));
 941 |     coopmat<int8_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<int8_t, SCOPE, K, N, gl_MatrixUseB>(int8_t(lx));
 942 | 
 943 |     coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator>(int(gx));
 944 | 
 945 |     for (int i = 0; i < loop; i++)
 946 |     {)"
 947 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
 948 |     R"(}
 949 | 
 950 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
 951 | #elif ncnn_VK_NV_cooperative_matrix
 952 |     icoopmatNV<8, SCOPE, M, K> a = icoopmatNV<8, SCOPE, M, K>(int8_t(gx));
 953 |     icoopmatNV<8, SCOPE, K, N> b = icoopmatNV<8, SCOPE, K, N>(int8_t(lx));
 954 | 
 955 |     icoopmatNV<32, SCOPE, M, N> c = icoopmatNV<32, SCOPE, M, N>(int(gx));
 956 | 
 957 |     for (int i = 0; i < loop; i++)
 958 |     {)"
 959 |         REPEAT_16(c = coopMatMulAddNV(a, b, c);)
 960 |     R"(}
 961 | 
 962 |     coopMatStoreNV(c, c_blob_data, gx * (M * N), N, false);
 963 | #endif
 964 | }
 965 | )";
 966 | // GLSL compute shader source: cooperative-matrix multiply-add with int8 A/B and two 32-bit int accumulators (c0, c1) updated alternately; KHR path uses coopmat<int8_t,...>, NV path uses icoopmatNV; c0 + c1 is stored to the int buffer at element offset gx * (M * N) with stride N.
 967 | static const char glsl_int8_matrix_dual_data[] = R"(
 968 | #version 450
 969 | 
 970 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 971 | #extension GL_KHR_memory_scope_semantics: require
 972 | #extension GL_EXT_shader_explicit_arithmetic_types: require
 973 | #if ncnn_VK_KHR_cooperative_matrix
 974 | #extension GL_KHR_cooperative_matrix: require
 975 | #elif ncnn_VK_NV_cooperative_matrix
 976 | #extension GL_NV_integer_cooperative_matrix : require
 977 | #endif
 978 | 
 979 | layout (constant_id = 0) const int loop = 1;
 980 | layout (constant_id = 1) const int M = 1;
 981 | layout (constant_id = 2) const int N = 1;
 982 | layout (constant_id = 3) const int K = 1;
 983 | layout (constant_id = 4) const int SCOPE = 3;
 984 | 
 985 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; };
 986 | 
 987 | void main()
 988 | {
 989 |     const uint gx = gl_GlobalInvocationID.x;
 990 |     const uint lx = gl_LocalInvocationID.x;
 991 | 
 992 | #if ncnn_VK_KHR_cooperative_matrix
 993 |     coopmat<int8_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<int8_t, SCOPE, M, K, gl_MatrixUseA>(int8_t(gx));
 994 |     coopmat<int8_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<int8_t, SCOPE, K, N, gl_MatrixUseB>(int8_t(lx));
 995 | 
 996 |     coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator>(int(gx));
 997 |     coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<int, SCOPE, M, N, gl_MatrixUseAccumulator>(int(lx));
 998 | 
 999 |     for (int i = 0; i < loop; i++)
1000 |     {)"
1001 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1002 |     R"(}
1003 | 
1004 |     c0 = c0 + c1;
1005 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1006 | #elif ncnn_VK_NV_cooperative_matrix
1007 |     icoopmatNV<8, SCOPE, M, K> a = icoopmatNV<8, SCOPE, M, K>(int8_t(gx));
1008 |     icoopmatNV<8, SCOPE, K, N> b = icoopmatNV<8, SCOPE, K, N>(int8_t(lx));
1009 | 
1010 |     icoopmatNV<32, SCOPE, M, N> c0 = icoopmatNV<32, SCOPE, M, N>(int(gx));
1011 |     icoopmatNV<32, SCOPE, M, N> c1 = icoopmatNV<32, SCOPE, M, N>(int(lx));
1012 | 
1013 |     for (int i = 0; i < loop; i++)
1014 |     {)"
1015 |         REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);)
1016 |     R"(}
1017 | 
1018 |     c0 = c0 + c1;
1019 |     coopMatStoreNV(c0, c_blob_data, gx * (M * N), N, false);
1020 | #endif
1021 | }
1022 | )";
1023 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with bfloat16 A/B and a bfloat16 accumulator (no NV path in this shader); M/N/K/SCOPE are specialization constants 1-4. NOTE(review): the store's "/ 2" looks like it accounts for two 16-bit values per float buffer element - confirm.
1024 | static const char glsl_bf16_matrix_data[] = R"(
1025 | #version 450
1026 | 
1027 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1028 | #extension GL_KHR_memory_scope_semantics: require
1029 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1030 | #extension GL_KHR_cooperative_matrix: require
1031 | #extension GL_EXT_bfloat16: require
1032 | 
1033 | layout (constant_id = 0) const int loop = 1;
1034 | layout (constant_id = 1) const int M = 1;
1035 | layout (constant_id = 2) const int N = 1;
1036 | layout (constant_id = 3) const int K = 1;
1037 | layout (constant_id = 4) const int SCOPE = 3;
1038 | 
1039 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1040 | 
1041 | void main()
1042 | {
1043 |     const uint gx = gl_GlobalInvocationID.x;
1044 |     const uint lx = gl_LocalInvocationID.x;
1045 | 
1046 |     coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1047 |     coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1048 | 
1049 |     coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1050 | 
1051 |     for (int i = 0; i < loop; i++)
1052 |     {)"
1053 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1054 |     R"(}
1055 | 
1056 |     coopMatStore(c, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
1057 | }
1058 | )";
1059 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with bfloat16 A/B and two bfloat16 accumulators (c0, c1); after the loop both are converted to float matrices, added, converted back to bfloat16 and stored. NOTE(review): the float round-trip presumably exists because bfloat16 matrices cannot be added directly - confirm.
1060 | static const char glsl_bf16_matrix_dual_data[] = R"(
1061 | #version 450
1062 | 
1063 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1064 | #extension GL_KHR_memory_scope_semantics: require
1065 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1066 | #extension GL_KHR_cooperative_matrix: require
1067 | #extension GL_EXT_bfloat16: require
1068 | 
1069 | layout (constant_id = 0) const int loop = 1;
1070 | layout (constant_id = 1) const int M = 1;
1071 | layout (constant_id = 2) const int N = 1;
1072 | layout (constant_id = 3) const int K = 1;
1073 | layout (constant_id = 4) const int SCOPE = 3;
1074 | 
1075 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1076 | 
1077 | void main()
1078 | {
1079 |     const uint gx = gl_GlobalInvocationID.x;
1080 |     const uint lx = gl_LocalInvocationID.x;
1081 | 
1082 |     coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1083 |     coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1084 | 
1085 |     coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1086 |     coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1087 | 
1088 |     for (int i = 0; i < loop; i++)
1089 |     {)"
1090 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1091 |     R"(}
1092 | 
1093 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c2 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(c0);
1094 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c3 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(c1);
1095 | 
1096 |     c0 = coopmat<bfloat16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(c2 + c3);
1097 |     coopMatStore(c0, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
1098 | }
1099 | )";
1100 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with bfloat16 A/B and a 32-bit float accumulator; M/N/K/SCOPE are specialization constants 1-4; the accumulator is stored at element offset gx * (M * N) with stride N.
1101 | static const char glsl_bf16_fp32_matrix_data[] = R"(
1102 | #version 450
1103 | 
1104 | #extension GL_KHR_memory_scope_semantics: require
1105 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1106 | #extension GL_KHR_cooperative_matrix: require
1107 | #extension GL_EXT_bfloat16: require
1108 | 
1109 | layout (constant_id = 0) const int loop = 1;
1110 | layout (constant_id = 1) const int M = 1;
1111 | layout (constant_id = 2) const int N = 1;
1112 | layout (constant_id = 3) const int K = 1;
1113 | layout (constant_id = 4) const int SCOPE = 3;
1114 | 
1115 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1116 | 
1117 | void main()
1118 | {
1119 |     const uint gx = gl_GlobalInvocationID.x;
1120 |     const uint lx = gl_LocalInvocationID.x;
1121 | 
1122 |     coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1123 |     coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1124 | 
1125 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1126 | 
1127 |     for (int i = 0; i < loop; i++)
1128 |     {)"
1129 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1130 |     R"(}
1131 | 
1132 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1133 | }
1134 | )";
1135 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with bfloat16 A/B and two 32-bit float accumulators (c0, c1) updated alternately; c0 + c1 is stored at element offset gx * (M * N) with stride N.
1136 | static const char glsl_bf16_fp32_matrix_dual_data[] = R"(
1137 | #version 450
1138 | 
1139 | #extension GL_KHR_memory_scope_semantics: require
1140 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1141 | #extension GL_KHR_cooperative_matrix: require
1142 | #extension GL_EXT_bfloat16: require
1143 | 
1144 | layout (constant_id = 0) const int loop = 1;
1145 | layout (constant_id = 1) const int M = 1;
1146 | layout (constant_id = 2) const int N = 1;
1147 | layout (constant_id = 3) const int K = 1;
1148 | layout (constant_id = 4) const int SCOPE = 3;
1149 | 
1150 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1151 | 
1152 | void main()
1153 | {
1154 |     const uint gx = gl_GlobalInvocationID.x;
1155 |     const uint lx = gl_LocalInvocationID.x;
1156 | 
1157 |     coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<bfloat16_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1158 |     coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<bfloat16_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1159 | 
1160 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1161 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1162 | 
1163 |     for (int i = 0; i < loop; i++)
1164 |     {)"
1165 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1166 |     R"(}
1167 | 
1168 |     c0 = c0 + c1;
1169 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1170 | }
1171 | )";
1172 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with fp8 (e4m3) A/B and an fp16 accumulator. The accumulator is 16-bit but is stored through the float[] buffer, so the element offset and stride are halved (gx * (M * N) / 2, N / 2), matching glsl_fp16_matrix_data; unhalved values would spread the rows over twice the span.
1173 | static const char glsl_fp8_fp16_matrix_data[] = R"(
1174 | #version 450
1175 | 
1176 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1177 | #extension GL_KHR_memory_scope_semantics: require
1178 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1179 | #extension GL_KHR_cooperative_matrix: require
1180 | #extension GL_EXT_float_e4m3: require
1181 | 
1182 | layout (constant_id = 0) const int loop = 1;
1183 | layout (constant_id = 1) const int M = 1;
1184 | layout (constant_id = 2) const int N = 1;
1185 | layout (constant_id = 3) const int K = 1;
1186 | layout (constant_id = 4) const int SCOPE = 3;
1187 | 
1188 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1189 | 
1190 | void main()
1191 | {
1192 |     const uint gx = gl_GlobalInvocationID.x;
1193 |     const uint lx = gl_LocalInvocationID.x;
1194 | 
1195 |     coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1196 |     coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1197 | 
1198 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1199 | 
1200 |     for (int i = 0; i < loop; i++)
1201 |     {)"
1202 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1203 |     R"(}
1204 | 
1205 |     coopMatStore(c, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
1206 | }
1207 | )";
1208 | // GLSL compute shader source: KHR cooperative-matrix multiply-add with fp8 (e4m3) A/B and two fp16 accumulators (c0, c1) updated alternately and summed before the store. The sum is 16-bit but is stored through the float[] buffer, so the element offset and stride are halved (gx * (M * N) / 2, N / 2), matching glsl_fp16_matrix_dual_data.
1209 | static const char glsl_fp8_fp16_matrix_dual_data[] = R"(
1210 | #version 450
1211 | 
1212 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1213 | #extension GL_KHR_memory_scope_semantics: require
1214 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1215 | #extension GL_KHR_cooperative_matrix: require
1216 | #extension GL_EXT_float_e4m3: require
1217 | 
1218 | layout (constant_id = 0) const int loop = 1;
1219 | layout (constant_id = 1) const int M = 1;
1220 | layout (constant_id = 2) const int N = 1;
1221 | layout (constant_id = 3) const int K = 1;
1222 | layout (constant_id = 4) const int SCOPE = 3;
1223 | 
1224 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1225 | 
1226 | void main()
1227 | {
1228 |     const uint gx = gl_GlobalInvocationID.x;
1229 |     const uint lx = gl_LocalInvocationID.x;
1230 | 
1231 |     coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1232 |     coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1233 | 
1234 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1235 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1236 | 
1237 |     for (int i = 0; i < loop; i++)
1238 |     {)"
1239 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1240 |     R"(}
1241 | 
1242 |     c0 = c0 + c1;
1243 |     coopMatStore(c0, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor);
1244 | }
1245 | )";
1246 | 
// GLSL compute shader source: fp8 (floate4m3_t) A and B cooperative matrices
// multiplied into a 32-bit float accumulator, single-accumulator variant.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A and the accumulator c are filled with gl_GlobalInvocationID.x, B with
// gl_LocalInvocationID.x.
//
// Each pass of the for loop runs the REPEAT_16 body, where every
// coopMatMulAdd consumes the c produced by the previous one. REPEAT_16 is
// defined outside this chunk; presumably it pastes sixteen stringified copies
// of its argument between the two raw-string halves -- TODO confirm at its
// definition.
//
// After the loop, c is stored row-major (stride N) into binding 0 at element
// offset gx * (M * N).
1247 | static const char glsl_fp8_fp32_matrix_data[] = R"(
1248 | #version 450
1249 | 
1250 | #extension GL_KHR_memory_scope_semantics: require
1251 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1252 | #extension GL_KHR_cooperative_matrix: require
1253 | #extension GL_EXT_float_e4m3: require
1254 | 
1255 | layout (constant_id = 0) const int loop = 1;
1256 | layout (constant_id = 1) const int M = 1;
1257 | layout (constant_id = 2) const int N = 1;
1258 | layout (constant_id = 3) const int K = 1;
1259 | layout (constant_id = 4) const int SCOPE = 3;
1260 | 
1261 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1262 | 
1263 | void main()
1264 | {
1265 |     const uint gx = gl_GlobalInvocationID.x;
1266 |     const uint lx = gl_LocalInvocationID.x;
1267 | 
1268 |     coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1269 |     coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1270 | 
1271 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1272 | 
1273 |     for (int i = 0; i < loop; i++)
1274 |     {)"
1275 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1276 |     R"(}
1277 | 
1278 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1279 | }
1280 | )";
1281 | 
// GLSL compute shader source: fp8 (floate4m3_t) A and B cooperative matrices
// multiplied into 32-bit float accumulators, dual-accumulator variant.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A is filled with gl_GlobalInvocationID.x and B with gl_LocalInvocationID.x;
// c0 starts from the global id and c1 from the local id.
//
// Each pass of the for loop runs the REPEAT_8 body, which updates c0 and c1
// through two coopMatMulAdd calls that do not read each other's result.
// REPEAT_8 is defined outside this chunk; presumably it pastes eight
// stringified copies of its argument between the two raw-string halves
// -- TODO confirm at its definition.
//
// After the loop, c0 + c1 is stored row-major (stride N) into binding 0 at
// element offset gx * (M * N), so both accumulators feed the stored result.
1282 | static const char glsl_fp8_fp32_matrix_dual_data[] = R"(
1283 | #version 450
1284 | 
1285 | #extension GL_KHR_memory_scope_semantics: require
1286 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1287 | #extension GL_KHR_cooperative_matrix: require
1288 | #extension GL_EXT_float_e4m3: require
1289 | 
1290 | layout (constant_id = 0) const int loop = 1;
1291 | layout (constant_id = 1) const int M = 1;
1292 | layout (constant_id = 2) const int N = 1;
1293 | layout (constant_id = 3) const int K = 1;
1294 | layout (constant_id = 4) const int SCOPE = 3;
1295 | 
1296 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1297 | 
1298 | void main()
1299 | {
1300 |     const uint gx = gl_GlobalInvocationID.x;
1301 |     const uint lx = gl_LocalInvocationID.x;
1302 | 
1303 |     coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate4m3_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1304 |     coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate4m3_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1305 | 
1306 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1307 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1308 | 
1309 |     for (int i = 0; i < loop; i++)
1310 |     {)"
1311 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1312 |     R"(}
1313 | 
1314 |     c0 = c0 + c1;
1315 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1316 | }
1317 | )";
1318 | 
// GLSL compute shader source: bf8 (floate5m2_t) A and B cooperative matrices
// multiplied into a float16_t accumulator, single-accumulator variant.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A and the accumulator c are filled with gl_GlobalInvocationID.x, B with
// gl_LocalInvocationID.x.
//
// Each pass of the for loop runs the REPEAT_16 body, where every
// coopMatMulAdd consumes the c produced by the previous one. REPEAT_16 is
// defined outside this chunk; presumably it pastes sixteen stringified copies
// of its argument between the two raw-string halves -- TODO confirm at its
// definition.
//
// After the loop, c is stored row-major (stride N) into binding 0 at element
// offset gx * (M * N).
1319 | static const char glsl_bf8_fp16_matrix_data[] = R"(
1320 | #version 450
1321 | 
1322 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1323 | #extension GL_KHR_memory_scope_semantics: require
1324 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1325 | #extension GL_KHR_cooperative_matrix: require
1326 | #extension GL_EXT_float_e5m2: require
1327 | 
1328 | layout (constant_id = 0) const int loop = 1;
1329 | layout (constant_id = 1) const int M = 1;
1330 | layout (constant_id = 2) const int N = 1;
1331 | layout (constant_id = 3) const int K = 1;
1332 | layout (constant_id = 4) const int SCOPE = 3;
1333 | 
1334 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1335 | 
1336 | void main()
1337 | {
1338 |     const uint gx = gl_GlobalInvocationID.x;
1339 |     const uint lx = gl_LocalInvocationID.x;
1340 | 
1341 |     coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1342 |     coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1343 | 
1344 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1345 | 
1346 |     for (int i = 0; i < loop; i++)
1347 |     {)"
1348 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1349 |     R"(}
1350 | 
1351 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1352 | }
1353 | )";
1354 | 
// GLSL compute shader source: bf8 (floate5m2_t) A and B cooperative matrices
// multiplied into float16_t accumulators, dual-accumulator variant.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A is filled with gl_GlobalInvocationID.x and B with gl_LocalInvocationID.x;
// c0 starts from the global id and c1 from the local id.
//
// Each pass of the for loop runs the REPEAT_8 body, which updates c0 and c1
// through two coopMatMulAdd calls that do not read each other's result.
// REPEAT_8 is defined outside this chunk; presumably it pastes eight
// stringified copies of its argument between the two raw-string halves
// -- TODO confirm at its definition.
//
// After the loop, c0 + c1 is stored row-major (stride N) into binding 0 at
// element offset gx * (M * N), so both accumulators feed the stored result.
1355 | static const char glsl_bf8_fp16_matrix_dual_data[] = R"(
1356 | #version 450
1357 | 
1358 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
1359 | #extension GL_KHR_memory_scope_semantics: require
1360 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1361 | #extension GL_KHR_cooperative_matrix: require
1362 | #extension GL_EXT_float_e5m2: require
1363 | 
1364 | layout (constant_id = 0) const int loop = 1;
1365 | layout (constant_id = 1) const int M = 1;
1366 | layout (constant_id = 2) const int N = 1;
1367 | layout (constant_id = 3) const int K = 1;
1368 | layout (constant_id = 4) const int SCOPE = 3;
1369 | 
1370 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1371 | 
1372 | void main()
1373 | {
1374 |     const uint gx = gl_GlobalInvocationID.x;
1375 |     const uint lx = gl_LocalInvocationID.x;
1376 | 
1377 |     coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1378 |     coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1379 | 
1380 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1381 |     coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float16_t, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1382 | 
1383 |     for (int i = 0; i < loop; i++)
1384 |     {)"
1385 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1386 |     R"(}
1387 | 
1388 |     c0 = c0 + c1;
1389 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1390 | }
1391 | )";
1392 | 
// GLSL compute shader source: bf8 (floate5m2_t) A and B cooperative matrices
// multiplied into a 32-bit float accumulator, single-accumulator variant.
//
// Only the extensions this shader actually uses are required: no float16 type
// appears below, so GL_EXT_shader_explicit_arithmetic_types_float16 is not
// requested.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A and the accumulator c are filled with gl_GlobalInvocationID.x, B with
// gl_LocalInvocationID.x.
//
// Each pass of the for loop runs the REPEAT_16 body, where every
// coopMatMulAdd consumes the c produced by the previous one. REPEAT_16 is
// defined outside this chunk; presumably it pastes sixteen stringified copies
// of its argument between the two raw-string halves -- TODO confirm at its
// definition.
//
// After the loop, c is stored row-major (stride N) into binding 0 at element
// offset gx * (M * N).
1393 | static const char glsl_bf8_fp32_matrix_data[] = R"(
1394 | #version 450
1395 | 
1397 | #extension GL_KHR_memory_scope_semantics: require
1398 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1399 | #extension GL_KHR_cooperative_matrix: require
1400 | #extension GL_EXT_float_e5m2: require
1401 | 
1402 | layout (constant_id = 0) const int loop = 1;
1403 | layout (constant_id = 1) const int M = 1;
1404 | layout (constant_id = 2) const int N = 1;
1405 | layout (constant_id = 3) const int K = 1;
1406 | layout (constant_id = 4) const int SCOPE = 3;
1407 | 
1408 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1409 | 
1410 | void main()
1411 | {
1412 |     const uint gx = gl_GlobalInvocationID.x;
1413 |     const uint lx = gl_LocalInvocationID.x;
1414 | 
1415 |     coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1416 |     coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1417 | 
1418 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1419 | 
1420 |     for (int i = 0; i < loop; i++)
1421 |     {)"
1422 |         REPEAT_16(c = coopMatMulAdd(a, b, c);)
1423 |     R"(}
1424 | 
1425 |     coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1426 | }
1427 | )";
1428 | 
// GLSL compute shader source: bf8 (floate5m2_t) A and B cooperative matrices
// multiplied into 32-bit float accumulators, dual-accumulator variant.
//
// Specialization constants: loop (id 0) is the outer iteration count, M/N/K
// (ids 1-3) are the matrix dimensions and SCOPE (id 4) is the matrix scope.
// A is filled with gl_GlobalInvocationID.x and B with gl_LocalInvocationID.x;
// c0 starts from the global id and c1 from the local id.
//
// Each pass of the for loop runs the REPEAT_8 body, which updates c0 and c1
// through two coopMatMulAdd calls that do not read each other's result.
// REPEAT_8 is defined outside this chunk; presumably it pastes eight
// stringified copies of its argument between the two raw-string halves
// -- TODO confirm at its definition.
//
// After the loop, c0 + c1 is stored row-major (stride N) into binding 0 at
// element offset gx * (M * N), so both accumulators feed the stored result.
1429 | static const char glsl_bf8_fp32_matrix_dual_data[] = R"(
1430 | #version 450
1431 | 
1432 | #extension GL_KHR_memory_scope_semantics: require
1433 | #extension GL_EXT_shader_explicit_arithmetic_types: require
1434 | #extension GL_KHR_cooperative_matrix: require
1435 | #extension GL_EXT_float_e5m2: require
1436 | 
1437 | layout (constant_id = 0) const int loop = 1;
1438 | layout (constant_id = 1) const int M = 1;
1439 | layout (constant_id = 2) const int N = 1;
1440 | layout (constant_id = 3) const int K = 1;
1441 | layout (constant_id = 4) const int SCOPE = 3;
1442 | 
1443 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; };
1444 | 
1445 | void main()
1446 | {
1447 |     const uint gx = gl_GlobalInvocationID.x;
1448 |     const uint lx = gl_LocalInvocationID.x;
1449 | 
1450 |     coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA> a = coopmat<floate5m2_t, SCOPE, M, K, gl_MatrixUseA>(float(gx));
1451 |     coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB> b = coopmat<floate5m2_t, SCOPE, K, N, gl_MatrixUseB>(float(lx));
1452 | 
1453 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c0 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(gx));
1454 |     coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator> c1 = coopmat<float, SCOPE, M, N, gl_MatrixUseAccumulator>(float(lx));
1455 | 
1456 |     for (int i = 0; i < loop; i++)
1457 |     {)"
1458 |         REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);)
1459 |     R"(}
1460 | 
1461 |     c0 = c0 + c1;
1462 |     coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor);
1463 | }
1464 | )";
1465 | 
1466 | static double vkpeak(int device_id, int storage_type, int arithmetic_type, int packing_type)
1467 | {
1468 |     ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(device_id);
1469 | 
1470 |     if (!vkdev)
1471 |     {
1472 |         return 0;
1473 |     }
1474 | 
1475 |     if (!vkdev->info.support_fp16_storage() && storage_type == 1)
1476 |     {
1477 |         return 0;
1478 |     }
1479 |     if (!vkdev->info.support_fp16_storage() && storage_type == 4)
1480 |     {
1481 |         return 0;
1482 |     }
1483 |     if (!vkdev->info.support_fp16_arithmetic() && arithmetic_type == 1)
1484 |     {
1485 |         return 0;
1486 |     }
1487 |     if (!vkdev->info.support_fp16_arithmetic() && arithmetic_type == 4)
1488 |     {
1489 |         return 0;
1490 |     }
1491 |     if (!vkdev->info.support_int8_arithmetic() && arithmetic_type == 6)
1492 |     {
1493 |         return 0;
1494 |     }
1495 |     if (!vkdev->info.support_cooperative_matrix() && packing_type == 256)
1496 |     {
1497 |         return 0;
1498 |     }
1499 | 
1500 |     // check shader fp64 feature
1501 |     bool has_shader_fp64 = vkdev->info.physicalDevicefeatures().shaderFloat64;
1502 |     if (!has_shader_fp64 && (storage_type == 2 || arithmetic_type == 2))
1503 |     {
1504 |         return 0;
1505 |     }
1506 | 
1507 |     // check shader int64 feature
1508 |     bool has_shader_int64 = vkdev->info.physicalDevicefeatures().shaderInt64;
1509 |     if (!has_shader_int64 && (storage_type == 5 || arithmetic_type == 5))
1510 |     {
1511 |         return 0;
1512 |     }
1513 | 
1514 |     // check shader int8 dotprod feature
1515 |     bool has_shader_int8_dotprod = vkdev->info.queryShaderIntegerDotProductFeatures().shaderIntegerDotProduct;
1516 |     if (!has_shader_int8_dotprod && (arithmetic_type == 6 && packing_type == 4))
1517 |     {
1518 |         return 0;
1519 |     }
1520 | 
1521 |     // check shader bf16 feature
1522 |     bool has_shader_bf16 = vkdev->info.queryShaderBfloat16Features().shaderBFloat16Type;
1523 |     if (!has_shader_bf16 && (arithmetic_type == 7))
1524 |     {
1525 |         return 0;
1526 |     }
1527 | 
1528 |     // check shader bf16 dotprod feature
1529 |     bool has_shader_bf16_dotprod = vkdev->info.queryShaderBfloat16Features().shaderBFloat16DotProduct;
1530 |     if (!has_shader_bf16_dotprod && (arithmetic_type == 7 && packing_type == 4))
1531 |     {
1532 |         return 0;
1533 |     }
1534 | 
1535 |     // check shader bf16 cooperative matrix feature
1536 |     bool has_shader_bf16_matrix = vkdev->info.queryShaderBfloat16Features().shaderBFloat16CooperativeMatrix;
1537 |     if (!has_shader_bf16_matrix && (arithmetic_type == 7 && packing_type == 256))
1538 |     {
1539 |         return 0;
1540 |     }
1541 | 
1542 |     // check shader fp8 feature
1543 |     bool has_shader_fp8 = vkdev->info.queryShaderFloat8Features().shaderFloat8;
1544 |     if (!has_shader_fp8 && (arithmetic_type == 8 || arithmetic_type == 9))
1545 |     {
1546 |         return 0;
1547 |     }
1548 | 
1549 |     // check shader fp8 cooperative matrix feature
1550 |     bool has_shader_fp8_matrix = vkdev->info.queryShaderFloat8Features().shaderFloat8CooperativeMatrix;
1551 |     if (!has_shader_fp8_matrix && ((arithmetic_type == 8 || arithmetic_type == 9) && packing_type == 256))
1552 |     {
1553 |         return 0;
1554 |     }
1555 | 
1556 |     ncnn::Option opt;
1557 |     opt.use_vulkan_compute = true;
1558 |     opt.use_fp16_packed = storage_type == 1;
1559 |     opt.use_fp16_storage = storage_type == 1 || storage_type == 4;
1560 |     opt.use_fp16_arithmetic = arithmetic_type == 1;
1561 | 
1562 |     ncnn::VkAllocator* allocator = vkdev->acquire_blob_allocator();
1563 | 
1564 |     // reuse c storage, max 512M
1565 |     int buffer_size = std::min((int)(vkdev->get_heap_budget() / 8), 512) * 1024 * 1024;
1566 |     if (vkdev->info.type() == 1)
1567 |     {
1568 |         // max 128M for integrated gpu
1569 |         buffer_size = std::min(buffer_size, 128 * 1024 * 1024);
1570 |     }
1571 |     ncnn::VkMat c(buffer_size, (size_t)1u, 1, allocator);
1572 | 
1573 |     int elemsize;
1574 |     if (storage_type == 0 || storage_type == 3)
1575 |     {
1576 |         // fp32 / int32
1577 |         elemsize = 4;
1578 |     }
1579 |     else if (storage_type == 1 || storage_type == 4)
1580 |     {
1581 |         // fp16 / int16
1582 |         elemsize = 2;
1583 |     }
1584 |     else if (storage_type == 2 || storage_type == 5)
1585 |     {
1586 |         // fp64 / int64
1587 |         elemsize = 8;
1588 |     }
1589 |     else if (storage_type == 6)
1590 |     {
1591 |         // int8
1592 |         elemsize = 1;
1593 |     }
1594 | 
1595 |     int local_size_x = std::min(128, std::max(1, (int)vkdev->info.subgroup_size()));
1596 |     if (packing_type == 256)
1597 |     {
1598 |         // matrix on subgroup
1599 |         local_size_x = (int)vkdev->info.subgroup_size();
1600 |     }
1601 | 
1602 |     int M = 1;
1603 |     int N = 1;
1604 |     int K = 1;
1605 |     // VK_SCOPE_WORKGROUP_KHR = gl_ScopeWorkgroup = 2
1606 |     // VK_SCOPE_SUBGROUP_KHR = gl_ScopeSubgroup = 3
1607 |     int SCOPE = 3;
1608 |     bool use_fp16_fp32_matrix = false;
1609 |     bool use_bf16_fp32_matrix = false;
1610 |     bool use_fp8_fp32_matrix = false;
1611 |     if (packing_type == 256)
1612 |     {
1613 |         bool mnk_found = false;
1614 | 
1615 |         if (arithmetic_type == 1)
1616 |         {
1617 |             if (vkdev->info.support_VK_KHR_cooperative_matrix())
1618 |             {
1619 |                 const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
1620 | 
1621 |                 {
1622 |                     // find fp16 * fp16 => fp16
1623 |                     for (uint32_t j = 0; j < properties.size(); j++)
1624 |                     {
1625 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1626 | 
1627 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
1628 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR
1629 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1630 |                         {
1631 |                             M = cmp.MSize;
1632 |                             N = cmp.NSize;
1633 |                             K = cmp.KSize;
1634 |                             SCOPE = (int)cmp.scope;
1635 |                             mnk_found = true;
1636 |                             break;
1637 |                         }
1638 |                     }
1639 |                 }
1640 | 
1641 |                 if (!mnk_found)
1642 |                 {
1643 |                     // find fp16 * fp16 => fp32
1644 |                     for (uint32_t j = 0; j < properties.size(); j++)
1645 |                     {
1646 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1647 | 
1648 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
1649 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
1650 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1651 |                         {
1652 |                             M = cmp.MSize;
1653 |                             N = cmp.NSize;
1654 |                             K = cmp.KSize;
1655 |                             SCOPE = (int)cmp.scope;
1656 |                             mnk_found = true;
1657 |                             use_fp16_fp32_matrix = true;
1658 |                             break;
1659 |                         }
1660 |                     }
1661 |                 }
1662 |             }
1663 |             else // if (vkdev->info.support_VK_NV_cooperative_matrix())
1664 |             {
1665 |                 const std::vector<VkCooperativeMatrixPropertiesNV>& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV();
1666 | 
1667 |                 {
1668 |                     // find fp16 * fp16 => fp16
1669 |                     for (uint32_t j = 0; j < properties.size(); j++)
1670 |                     {
1671 |                         const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
1672 | 
1673 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
1674 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT16_NV
1675 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV))
1676 |                         {
1677 |                             M = cmp.MSize;
1678 |                             N = cmp.NSize;
1679 |                             K = cmp.KSize;
1680 |                             SCOPE = (int)cmp.scope;
1681 |                             mnk_found = true;
1682 |                             break;
1683 |                         }
1684 |                     }
1685 |                 }
1686 | 
1687 |                 if (!mnk_found)
1688 |                 {
1689 |                     // find fp16 * fp16 => fp32
1690 |                     for (uint32_t j = 0; j < properties.size(); j++)
1691 |                     {
1692 |                         const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
1693 | 
1694 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
1695 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
1696 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV))
1697 |                         {
1698 |                             M = cmp.MSize;
1699 |                             N = cmp.NSize;
1700 |                             K = cmp.KSize;
1701 |                             SCOPE = (int)cmp.scope;
1702 |                             mnk_found = true;
1703 |                             use_fp16_fp32_matrix = true;
1704 |                             break;
1705 |                         }
1706 |                     }
1707 |                 }
1708 |             }
1709 |         }
1710 | 
1711 |         if (arithmetic_type == 6)
1712 |         {
1713 |             if (vkdev->info.support_VK_KHR_cooperative_matrix())
1714 |             {
1715 |                 const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
1716 | 
1717 |                 // find int8 * int8 => int32
1718 |                 for (uint32_t j = 0; j < properties.size(); j++)
1719 |                 {
1720 |                     const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1721 | 
1722 |                     if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR
1723 |                         && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR
1724 |                         && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1725 |                     {
1726 |                         M = cmp.MSize;
1727 |                         N = cmp.NSize;
1728 |                         K = cmp.KSize;
1729 |                         SCOPE = (int)cmp.scope;
1730 |                         mnk_found = true;
1731 |                         break;
1732 |                     }
1733 |                 }
1734 |             }
1735 |             else // if (vkdev->info.support_VK_NV_cooperative_matrix())
1736 |             {
1737 |                 const std::vector<VkCooperativeMatrixPropertiesNV>& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV();
1738 | 
1739 |                 // find int8 * int8 => int32
1740 |                 for (uint32_t j = 0; j < properties.size(); j++)
1741 |                 {
1742 |                     const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
1743 | 
1744 |                     if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV
1745 |                         && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV
1746 |                         && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV))
1747 |                     {
1748 |                         M = cmp.MSize;
1749 |                         N = cmp.NSize;
1750 |                         K = cmp.KSize;
1751 |                         SCOPE = (int)cmp.scope;
1752 |                         mnk_found = true;
1753 |                         break;
1754 |                     }
1755 |                 }
1756 |             }
1757 |         }
1758 | 
1759 |         if (arithmetic_type == 7)
1760 |         {
1761 |             if (vkdev->info.support_VK_KHR_cooperative_matrix())
1762 |             {
1763 |                 const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
1764 | 
1765 |                 {
1766 |                     // find bf16 * bf16 => bf16
1767 |                     for (uint32_t j = 0; j < properties.size(); j++)
1768 |                     {
1769 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1770 | 
1771 |                         if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR
1772 |                             && cmp.CType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_BFLOAT16_KHR
1773 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1774 |                         {
1775 |                             M = cmp.MSize;
1776 |                             N = cmp.NSize;
1777 |                             K = cmp.KSize;
1778 |                             SCOPE = (int)cmp.scope;
1779 |                             mnk_found = true;
1780 |                             break;
1781 |                         }
1782 |                     }
1783 |                 }
1784 | 
1785 |                 if (!mnk_found)
1786 |                 {
1787 |                     // find bf16 * bf16 => fp32
1788 |                     for (uint32_t j = 0; j < properties.size(); j++)
1789 |                     {
1790 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1791 | 
1792 |                         if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR
1793 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
1794 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1795 |                         {
1796 |                             M = cmp.MSize;
1797 |                             N = cmp.NSize;
1798 |                             K = cmp.KSize;
1799 |                             SCOPE = (int)cmp.scope;
1800 |                             mnk_found = true;
1801 |                             use_bf16_fp32_matrix = true;
1802 |                             break;
1803 |                         }
1804 |                     }
1805 |                 }
1806 |             }
1807 |         }
1808 | 
1809 |         if (arithmetic_type == 8)
1810 |         {
1811 |             if (vkdev->info.support_VK_KHR_cooperative_matrix())
1812 |             {
1813 |                 const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
1814 | 
1815 |                 {
1816 |                     // find fp8 * fp8 => fp16
1817 |                     for (uint32_t j = 0; j < properties.size(); j++)
1818 |                     {
1819 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1820 | 
1821 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT
1822 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR
1823 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1824 |                         {
1825 |                             M = cmp.MSize;
1826 |                             N = cmp.NSize;
1827 |                             K = cmp.KSize;
1828 |                             SCOPE = (int)cmp.scope;
1829 |                             mnk_found = true;
1830 |                             break;
1831 |                         }
1832 |                     }
1833 |                 }
1834 | 
1835 |                 if (!mnk_found)
1836 |                 {
1837 |                     // find fp8 * fp8 => fp32
1838 |                     for (uint32_t j = 0; j < properties.size(); j++)
1839 |                     {
1840 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1841 | 
1842 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT
1843 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
1844 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1845 |                         {
1846 |                             M = cmp.MSize;
1847 |                             N = cmp.NSize;
1848 |                             K = cmp.KSize;
1849 |                             SCOPE = (int)cmp.scope;
1850 |                             mnk_found = true;
1851 |                             use_fp8_fp32_matrix = true;
1852 |                             break;
1853 |                         }
1854 |                     }
1855 |                 }
1856 |             }
1857 |         }
1858 | 
1859 |         if (arithmetic_type == 9)
1860 |         {
1861 |             if (vkdev->info.support_VK_KHR_cooperative_matrix())
1862 |             {
1863 |                 const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
1864 | 
1865 |                 {
1866 |                     // find bf8 * bf8 => fp16
1867 |                     for (uint32_t j = 0; j < properties.size(); j++)
1868 |                     {
1869 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1870 | 
1871 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
1872 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR
1873 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1874 |                         {
1875 |                             M = cmp.MSize;
1876 |                             N = cmp.NSize;
1877 |                             K = cmp.KSize;
1878 |                             SCOPE = (int)cmp.scope;
1879 |                             mnk_found = true;
1880 |                             break;
1881 |                         }
1882 |                     }
1883 |                 }
1884 | 
1885 |                 if (!mnk_found)
1886 |                 {
1887 |                     // find bf8 * bf8 => fp32
1888 |                     for (uint32_t j = 0; j < properties.size(); j++)
1889 |                     {
1890 |                         const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
1891 | 
1892 |                         if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
1893 |                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
1894 |                             && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR))
1895 |                         {
1896 |                             M = cmp.MSize;
1897 |                             N = cmp.NSize;
1898 |                             K = cmp.KSize;
1899 |                             SCOPE = (int)cmp.scope;
1900 |                             mnk_found = true;
1901 |                             use_fp8_fp32_matrix = true;
1902 |                             break;
1903 |                         }
1904 |                     }
1905 |                 }
1906 |             }
1907 |         }
1908 | 
1909 |         if (!mnk_found)
1910 |         {
1911 |             // no supported component type
1912 |             return 0;
1913 |         }
1914 |     }
1915 | 
1916 |     int max_invocation_count = buffer_size / elemsize;
1917 |     // make max_invocation_count be multiple of local_size_x
1918 |     max_invocation_count = std::max(max_invocation_count / local_size_x, 1) * local_size_x;
1919 |     if (packing_type == 256)
1920 |     {
1921 |         if (use_fp16_fp32_matrix || use_bf16_fp32_matrix || use_fp8_fp32_matrix)
1922 |             max_invocation_count = std::max(max_invocation_count / (M * N) / 2, 1);
1923 |         else
1924 |             max_invocation_count = std::max(max_invocation_count / (M * N), 1);
1925 |     }
1926 | 
1927 |     double max_gflops = 0;
1928 | 
1929 |     // start with little works
1930 |     int invocation_count = std::max(max_invocation_count / 32, 8);
1931 |     int loop = 16;
1932 | 
1933 |     bool rerun = true;
1934 | 
1935 |     // prepare storage
1936 |     while (rerun)
1937 |     {
1938 |         rerun = false;
1939 | 
1940 |         // setup pipeline
1941 |         ncnn::Pipeline pipeline(vkdev);
1942 |         ncnn::Pipeline pipeline_dual(vkdev);
1943 |         {
1944 |             pipeline.set_local_size_xyz(local_size_x, 1, 1);
1945 |             pipeline_dual.set_local_size_xyz(local_size_x, 1, 1);
1946 | 
1947 |             std::vector<ncnn::vk_specialization_type> specializations(1);
1948 |             specializations[0].i = loop;
1949 | 
1950 |             // glsl to spirv
1951 |             // -1 for omit the tail '\0'
1952 |             std::vector<uint32_t> spirv;
1953 |             std::vector<uint32_t> spirv_dual;
1954 |             if (arithmetic_type == 2)
1955 |             {
1956 |                 if (packing_type == 1)
1957 |                 {
1958 |                     ncnn::compile_spirv_module(glsl_fp64_p1_data, sizeof(glsl_fp64_p1_data) - 1, opt, spirv);
1959 |                     ncnn::compile_spirv_module(glsl_fp64_p1_dual_data, sizeof(glsl_fp64_p1_dual_data) - 1, opt, spirv_dual);
1960 |                 }
1961 |                 if (packing_type == 4)
1962 |                 {
1963 |                     ncnn::compile_spirv_module(glsl_fp64_p4_data, sizeof(glsl_fp64_p4_data) - 1, opt, spirv);
1964 |                     ncnn::compile_spirv_module(glsl_fp64_p4_dual_data, sizeof(glsl_fp64_p4_dual_data) - 1, opt, spirv_dual);
1965 |                 }
1966 |             }
1967 |             else if (arithmetic_type == 3)
1968 |             {
1969 |                 if (packing_type == 1)
1970 |                 {
1971 |                     ncnn::compile_spirv_module(glsl_int32_p1_data, sizeof(glsl_int32_p1_data) - 1, opt, spirv);
1972 |                     ncnn::compile_spirv_module(glsl_int32_p1_dual_data, sizeof(glsl_int32_p1_dual_data) - 1, opt, spirv_dual);
1973 |                 }
1974 |                 if (packing_type == 4)
1975 |                 {
1976 |                     ncnn::compile_spirv_module(glsl_int32_p4_data, sizeof(glsl_int32_p4_data) - 1, opt, spirv);
1977 |                     ncnn::compile_spirv_module(glsl_int32_p4_dual_data, sizeof(glsl_int32_p4_dual_data) - 1, opt, spirv_dual);
1978 |                 }
1979 |             }
1980 |             else if (arithmetic_type == 4)
1981 |             {
1982 |                 if (packing_type == 1)
1983 |                 {
1984 |                     ncnn::compile_spirv_module(glsl_int16_p1_data, sizeof(glsl_int16_p1_data) - 1, opt, spirv);
1985 |                     ncnn::compile_spirv_module(glsl_int16_p1_dual_data, sizeof(glsl_int16_p1_dual_data) - 1, opt, spirv_dual);
1986 |                 }
1987 |                 if (packing_type == 4)
1988 |                 {
1989 |                     ncnn::compile_spirv_module(glsl_int16_p4_data, sizeof(glsl_int16_p4_data) - 1, opt, spirv);
1990 |                     ncnn::compile_spirv_module(glsl_int16_p4_dual_data, sizeof(glsl_int16_p4_dual_data) - 1, opt, spirv_dual);
1991 |                 }
1992 |             }
1993 |             else if (arithmetic_type == 5)
1994 |             {
1995 |                 if (packing_type == 1)
1996 |                 {
1997 |                     ncnn::compile_spirv_module(glsl_int64_p1_data, sizeof(glsl_int64_p1_data) - 1, opt, spirv);
1998 |                     ncnn::compile_spirv_module(glsl_int64_p1_dual_data, sizeof(glsl_int64_p1_dual_data) - 1, opt, spirv_dual);
1999 |                 }
2000 |                 if (packing_type == 4)
2001 |                 {
2002 |                     ncnn::compile_spirv_module(glsl_int64_p4_data, sizeof(glsl_int64_p4_data) - 1, opt, spirv);
2003 |                     ncnn::compile_spirv_module(glsl_int64_p4_dual_data, sizeof(glsl_int64_p4_dual_data) - 1, opt, spirv_dual);
2004 |                 }
2005 |             }
2006 |             else if (arithmetic_type == 6)
2007 |             {
2008 |                 if (packing_type == 4)
2009 |                 {
2010 |                     ncnn::compile_spirv_module(glsl_int8_p4_data, sizeof(glsl_int8_p4_data) - 1, opt, spirv);
2011 |                     ncnn::compile_spirv_module(glsl_int8_p4_dual_data, sizeof(glsl_int8_p4_dual_data) - 1, opt, spirv_dual);
2012 |                 }
2013 |                 if (packing_type == 256)
2014 |                 {
2015 |                     // loop M N K SCOPE
2016 |                     specializations.resize(5);
2017 |                     specializations[1].i = M;
2018 |                     specializations[2].i = N;
2019 |                     specializations[3].i = K;
2020 |                     specializations[4].i = SCOPE;
2021 | 
2022 |                     ncnn::compile_spirv_module(glsl_int8_matrix_data, sizeof(glsl_int8_matrix_data) - 1, opt, spirv);
2023 |                     ncnn::compile_spirv_module(glsl_int8_matrix_dual_data, sizeof(glsl_int8_matrix_dual_data) - 1, opt, spirv_dual);
2024 |                 }
2025 |             }
2026 |             else if (arithmetic_type == 7)
2027 |             {
2028 |                 if (packing_type == 4)
2029 |                 {
2030 |                     ncnn::compile_spirv_module(glsl_bf16_p4_data, sizeof(glsl_bf16_p4_data) - 1, opt, spirv);
2031 |                     ncnn::compile_spirv_module(glsl_bf16_p4_dual_data, sizeof(glsl_bf16_p4_dual_data) - 1, opt, spirv_dual);
2032 |                 }
2033 |                 if (packing_type == 256)
2034 |                 {
2035 |                     // loop M N K SCOPE
2036 |                     specializations.resize(5);
2037 |                     specializations[1].i = M;
2038 |                     specializations[2].i = N;
2039 |                     specializations[3].i = K;
2040 |                     specializations[4].i = SCOPE;
2041 | 
2042 |                     if (use_bf16_fp32_matrix)
2043 |                     {
2044 |                         ncnn::compile_spirv_module(glsl_bf16_fp32_matrix_data, sizeof(glsl_bf16_fp32_matrix_data) - 1, opt, spirv);
2045 |                         ncnn::compile_spirv_module(glsl_bf16_fp32_matrix_dual_data, sizeof(glsl_bf16_fp32_matrix_dual_data) - 1, opt, spirv_dual);
2046 |                     }
2047 |                     else
2048 |                     {
2049 |                         ncnn::compile_spirv_module(glsl_bf16_matrix_data, sizeof(glsl_bf16_matrix_data) - 1, opt, spirv);
2050 |                         ncnn::compile_spirv_module(glsl_bf16_matrix_dual_data, sizeof(glsl_bf16_matrix_dual_data) - 1, opt, spirv_dual);
2051 |                     }
2052 |                 }
2053 |             }
2054 |             else if (arithmetic_type == 8)
2055 |             {
2056 |                 if (packing_type == 256)
2057 |                 {
2058 |                     // loop M N K SCOPE
2059 |                     specializations.resize(5);
2060 |                     specializations[1].i = M;
2061 |                     specializations[2].i = N;
2062 |                     specializations[3].i = K;
2063 |                     specializations[4].i = SCOPE;
2064 | 
2065 |                     if (use_fp8_fp32_matrix)
2066 |                     {
2067 |                         ncnn::compile_spirv_module(glsl_fp8_fp32_matrix_data, sizeof(glsl_fp8_fp32_matrix_data) - 1, opt, spirv);
2068 |                         ncnn::compile_spirv_module(glsl_fp8_fp32_matrix_dual_data, sizeof(glsl_fp8_fp32_matrix_dual_data) - 1, opt, spirv_dual);
2069 |                     }
2070 |                     else
2071 |                     {
2072 |                         ncnn::compile_spirv_module(glsl_fp8_fp16_matrix_data, sizeof(glsl_fp8_fp16_matrix_data) - 1, opt, spirv);
2073 |                         ncnn::compile_spirv_module(glsl_fp8_fp16_matrix_dual_data, sizeof(glsl_fp8_fp16_matrix_dual_data) - 1, opt, spirv_dual);
2074 |                     }
2075 |                 }
2076 |             }
2077 |             else if (arithmetic_type == 9)
2078 |             {
2079 |                 if (packing_type == 256)
2080 |                 {
2081 |                     // loop M N K SCOPE
2082 |                     specializations.resize(5);
2083 |                     specializations[1].i = M;
2084 |                     specializations[2].i = N;
2085 |                     specializations[3].i = K;
2086 |                     specializations[4].i = SCOPE;
2087 | 
2088 |                     if (use_fp8_fp32_matrix)
2089 |                     {
2090 |                         ncnn::compile_spirv_module(glsl_bf8_fp32_matrix_data, sizeof(glsl_bf8_fp32_matrix_data) - 1, opt, spirv);
2091 |                         ncnn::compile_spirv_module(glsl_bf8_fp32_matrix_dual_data, sizeof(glsl_bf8_fp32_matrix_dual_data) - 1, opt, spirv_dual);
2092 |                     }
2093 |                     else
2094 |                     {
2095 |                         ncnn::compile_spirv_module(glsl_bf8_fp16_matrix_data, sizeof(glsl_bf8_fp16_matrix_data) - 1, opt, spirv);
2096 |                         ncnn::compile_spirv_module(glsl_bf8_fp16_matrix_dual_data, sizeof(glsl_bf8_fp16_matrix_dual_data) - 1, opt, spirv_dual);
2097 |                     }
2098 |                 }
2099 |             }
2100 |             else // if (arithmetic_type == 0 || arithmetic_type == 1)
2101 |             {
2102 |                 if (packing_type == 1)
2103 |                 {
2104 |                     ncnn::compile_spirv_module(glsl_p1_data, sizeof(glsl_p1_data) - 1, opt, spirv);
2105 |                     ncnn::compile_spirv_module(glsl_p1_dual_data, sizeof(glsl_p1_dual_data) - 1, opt, spirv_dual);
2106 |                 }
2107 |                 if (packing_type == 4)
2108 |                 {
2109 |                     ncnn::compile_spirv_module(glsl_p4_data, sizeof(glsl_p4_data) - 1, opt, spirv);
2110 |                     ncnn::compile_spirv_module(glsl_p4_dual_data, sizeof(glsl_p4_dual_data) - 1, opt, spirv_dual);
2111 |                 }
2112 |                 if (packing_type == 256)
2113 |                 {
2114 |                     // loop M N K SCOPE
2115 |                     specializations.resize(5);
2116 |                     specializations[1].i = M;
2117 |                     specializations[2].i = N;
2118 |                     specializations[3].i = K;
2119 |                     specializations[4].i = SCOPE;
2120 | 
2121 |                     if (use_fp16_fp32_matrix)
2122 |                     {
2123 |                         ncnn::compile_spirv_module(glsl_fp16_fp32_matrix_data, sizeof(glsl_fp16_fp32_matrix_data) - 1, opt, spirv);
2124 |                         ncnn::compile_spirv_module(glsl_fp16_fp32_matrix_dual_data, sizeof(glsl_fp16_fp32_matrix_dual_data) - 1, opt, spirv_dual);
2125 |                     }
2126 |                     else
2127 |                     {
2128 |                         ncnn::compile_spirv_module(glsl_fp16_matrix_data, sizeof(glsl_fp16_matrix_data) - 1, opt, spirv);
2129 |                         ncnn::compile_spirv_module(glsl_fp16_matrix_dual_data, sizeof(glsl_fp16_matrix_dual_data) - 1, opt, spirv_dual);
2130 |                     }
2131 |                 }
2132 |             }
2133 | 
2134 |             int ret0 = pipeline.create(spirv.data(), spirv.size() * 4, specializations);
2135 |             int ret1 = pipeline_dual.create(spirv_dual.data(), spirv_dual.size() * 4, specializations);
2136 |             if (ret0 != 0 || ret1 != 0)
2137 |             {
2138 |                 vkdev->reclaim_blob_allocator(allocator);
2139 |                 return 0;
2140 |             }
2141 |         }
2142 | 
2143 |         const int cmd_loop = 6;
2144 | 
2145 |         for (int i = 0; i < cmd_loop; i++)
2146 |         {
2147 |             // encode command
2148 |             ncnn::VkCompute cmd(vkdev);
2149 |             ncnn::VkCompute cmd_dual(vkdev);
2150 |             {
2151 |                 std::vector<ncnn::VkMat> bindings(1);
2152 |                 bindings[0] = c;
2153 | 
2154 |                 std::vector<ncnn::vk_constant_type> constants(0);
2155 | 
2156 |                 ncnn::VkMat dispatcher;
2157 |                 dispatcher.w = invocation_count;
2158 |                 dispatcher.h = 1;
2159 |                 dispatcher.c = 1;
2160 |                 cmd.record_pipeline(&pipeline, bindings, constants, dispatcher);
2161 |                 cmd_dual.record_pipeline(&pipeline_dual, bindings, constants, dispatcher);
2162 |             }
2163 | 
2164 |             // time this
2165 |             {
2166 |                 double t0 = ncnn::get_current_time();
2167 | 
2168 |                 int ret = cmd.submit_and_wait();
2169 |                 if (ret != 0)
2170 |                 {
2171 |                     vkdev->reclaim_blob_allocator(allocator);
2172 |                     return 0;
2173 |                 }
2174 | 
2175 |                 double t1 = ncnn::get_current_time();
2176 | 
2177 |                 double time = t1 - t0;
2178 | 
2179 |                 if (time < 300)
2180 |                 {
2181 |                     // for fast device
2182 |                     if (invocation_count * 2 <= max_invocation_count)
2183 |                     {
2184 |                         invocation_count = std::min(invocation_count * 2, max_invocation_count);
2185 |                     }
2186 |                     else
2187 |                     {
2188 |                         loop *= 2;
2189 |                     }
2190 |                     rerun = true;
2191 |                     break;
2192 |                 }
2193 | 
2194 |                 t0 = ncnn::get_current_time();
2195 | 
2196 |                 ret = cmd_dual.submit_and_wait();
2197 |                 if (ret != 0)
2198 |                 {
2199 |                     vkdev->reclaim_blob_allocator(allocator);
2200 |                     return 0;
2201 |                 }
2202 | 
2203 |                 t1 = ncnn::get_current_time();
2204 | 
2205 |                 double time_dual = t1 - t0;
2206 | 
2207 |                 if (time_dual < 300)
2208 |                 {
2209 |                     // for fast device
2210 |                     if (invocation_count * 2 <= max_invocation_count)
2211 |                     {
2212 |                         invocation_count = std::min(invocation_count * 2, max_invocation_count);
2213 |                     }
2214 |                     else
2215 |                     {
2216 |                         loop *= 2;
2217 |                     }
2218 |                     rerun = true;
2219 |                     break;
2220 |                 }
2221 | 
2222 |                 double gflops;
2223 |                 {
2224 |                     double mac = (double)invocation_count * ((double)loop * 16 * 2);
2225 | 
2226 |                     if (packing_type == 256)
2227 |                     {
2228 |                         mac *= M * N * K;
2229 |                         mac /= local_size_x;
2230 |                     }
2231 |                     else
2232 |                     {
2233 |                         mac *= packing_type;
2234 |                     }
2235 | 
2236 |                     gflops = mac / time / 1000000;
2237 |                 }
2238 |                 double gflops_dual;
2239 |                 {
2240 |                     // dual issue is faster
2241 |                     double mac = (double)invocation_count * ((double)loop * 16 * 2 + 1); // +1 for the tail c0+c1
2242 | 
2243 |                     if (packing_type == 256)
2244 |                     {
2245 |                         mac *= M * N * K;
2246 |                         mac /= local_size_x;
2247 |                     }
2248 |                     else
2249 |                     {
2250 |                         mac *= packing_type;
2251 |                     }
2252 | 
2253 |                     gflops_dual = mac / time_dual / 1000000;
2254 |                 }
2255 | 
2256 |                 gflops = std::max(gflops, gflops_dual);
2257 | 
2258 |                 // fprintf(stderr, "%f gflops\n", gflops);
2259 | 
2260 |                 if (gflops > max_gflops)
2261 |                     max_gflops = gflops;
2262 |             }
2263 |         }
2264 |     }
2265 | 
2266 |     vkdev->reclaim_blob_allocator(allocator);
2267 | 
2268 |     return max_gflops;
2269 | }
2270 | 
2271 | static double vkpeak_copy(int device_id, int from_type, int to_type)
2272 | {
2273 |     ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(device_id);
2274 | 
2275 |     if (!vkdev)
2276 |     {
2277 |         return 0;
2278 |     }
2279 | 
2280 |     ncnn::Option opt;
2281 |     opt.use_vulkan_compute = true;
2282 |     opt.use_fp16_packed = false;
2283 |     opt.use_fp16_storage = false;
2284 | 
2285 |     ncnn::VkAllocator* staging_allocator = vkdev->acquire_staging_allocator();
2286 |     ncnn::VkAllocator* allocator = vkdev->acquire_blob_allocator();
2287 | 
2288 |     opt.blob_vkallocator = allocator;
2289 |     opt.workspace_vkallocator = allocator;
2290 |     opt.staging_vkallocator = staging_allocator;
2291 | 
2292 |     bool d2d = from_type == 1 && to_type == 1;
2293 | 
2294 |     // devbuf max 512M for host and 2G for d2d
2295 |     size_t buffer_size = std::min((size_t)vkdev->get_heap_budget() / 8, d2d ? (size_t)2048 : (size_t)512) * 1024 * 1024;
2296 |     if (vkdev->info.type() == 1)
2297 |     {
2298 |         // max 128M for integrated gpu
2299 |         buffer_size = std::min(buffer_size, (size_t)128 * 1024 * 1024);
2300 |     }
2301 | 
2302 |     double max_gbps = 0;
2303 | 
2304 |     if (from_type == 0 && to_type == 0)
2305 |     {
2306 |         ncnn::Mat a(1, buffer_size, 1);
2307 |         ncnn::Mat b(1, buffer_size, 1);
2308 | 
2309 |         const int cmd_loop = 10;
2310 | 
2311 |         for (int i = 0; i < cmd_loop; i++)
2312 |         {
2313 |             // reset cache
2314 |             memset(a, 0, buffer_size);
2315 | 
2316 |             ncnn::sleep(100);
2317 | 
2318 |             // time this
2319 |             double t0 = ncnn::get_current_time();
2320 | 
2321 |             memcpy(b, a, buffer_size);
2322 | 
2323 |             double t1 = ncnn::get_current_time();
2324 | 
2325 |             double time = t1 - t0;
2326 | 
2327 |             double gbps = buffer_size / time / 1000000;
2328 | 
2329 |             // fprintf(stderr, "%f gbps\n", gbps);
2330 | 
2331 |             if (gbps > max_gbps)
2332 |                 max_gbps = gbps;
2333 |         }
2334 |     }
2335 |     if (from_type == 0 && to_type == 1)
2336 |     {
2337 |         ncnn::VkMat devbuf(1, buffer_size, 1, staging_allocator);
2338 |         ncnn::Mat hostbuf(1, buffer_size, 1);
2339 | 
2340 |         void* devptr = devbuf.mapped_ptr();
2341 |         void* hostptr = hostbuf.data;
2342 | 
2343 |         const int cmd_loop = 10;
2344 | 
2345 |         for (int i = 0; i < cmd_loop; i++)
2346 |         {
2347 |             // reset cache
2348 |             memset(hostptr, 0, buffer_size);
2349 |             staging_allocator->invalidate(devbuf.data);
2350 | 
2351 |             ncnn::sleep(100);
2352 | 
2353 |             // time this
2354 |             double t0 = ncnn::get_current_time();
2355 | 
2356 |             memcpy(devptr, hostptr, buffer_size);
2357 | 
2358 |             staging_allocator->flush(devbuf.data);
2359 | 
2360 |             double t1 = ncnn::get_current_time();
2361 | 
2362 |             double time = t1 - t0;
2363 | 
2364 |             double gbps = buffer_size / time / 1000000;
2365 | 
2366 |             // fprintf(stderr, "%f gbps\n", gbps);
2367 | 
2368 |             if (gbps > max_gbps)
2369 |                 max_gbps = gbps;
2370 |         }
2371 |     }
2372 |     if (from_type == 1 && to_type == 0)
2373 |     {
2374 |         ncnn::VkMat devbuf(1, buffer_size, 1, staging_allocator);
2375 |         ncnn::Mat hostbuf(1, buffer_size, 1);
2376 | 
2377 |         void* devptr = devbuf.mapped_ptr();
2378 |         void* hostptr = hostbuf.data;
2379 | 
2380 |         const int cmd_loop = 10;
2381 | 
2382 |         for (int i = 0; i < cmd_loop; i++)
2383 |         {
2384 |             // reset cache
2385 |             staging_allocator->flush(devbuf.data);
2386 |             memset(hostptr, 0, buffer_size);
2387 | 
2388 |             ncnn::sleep(100);
2389 | 
2390 |             // time this
2391 |             double t0 = ncnn::get_current_time();
2392 | 
2393 |             staging_allocator->invalidate(devbuf.data);
2394 | 
2395 |             memcpy(hostptr, devptr, buffer_size);
2396 | 
2397 |             double t1 = ncnn::get_current_time();
2398 | 
2399 |             double time = t1 - t0;
2400 | 
2401 |             double gbps = buffer_size / time / 1000000;
2402 | 
2403 |             // fprintf(stderr, "%f gbps\n", gbps);
2404 | 
2405 |             if (gbps > max_gbps)
2406 |                 max_gbps = gbps;
2407 |         }
2408 |     }
2409 |     if (from_type == 1 && to_type == 1)
2410 |     {
2411 |         ncnn::VkMat a(1, buffer_size, 1, allocator);
2412 |         ncnn::VkMat b(1, buffer_size, 1, allocator);
2413 | 
2414 |         const int cmd_loop = 50;
2415 | 
2416 |         for (int i = 0; i < cmd_loop; i++)
2417 |         {
2418 |             // encode command
2419 |             ncnn::VkCompute cmd(vkdev);
2420 | 
2421 |             cmd.record_clone(a, b, opt);
2422 | 
2423 |             // time this
2424 |             double t0 = ncnn::get_current_time();
2425 | 
2426 |             int ret = cmd.submit_and_wait();
2427 |             if (ret != 0)
2428 |             {
2429 |                 vkdev->reclaim_staging_allocator(staging_allocator);
2430 |                 vkdev->reclaim_blob_allocator(allocator);
2431 |                 return 0;
2432 |             }
2433 | 
2434 |             double t1 = ncnn::get_current_time();
2435 | 
2436 |             double time = t1 - t0;
2437 | 
2438 |             double gbps = buffer_size / time / 1000000;
2439 | 
2440 |             // fprintf(stderr, "%f gbps\n", gbps);
2441 | 
2442 |             if (gbps > max_gbps)
2443 |                 max_gbps = gbps;
2444 |         }
2445 |     }
2446 | 
2447 |     vkdev->reclaim_staging_allocator(staging_allocator);
2448 |     vkdev->reclaim_blob_allocator(allocator);
2449 | 
2450 |     return max_gbps;
2451 | }
2452 | 
2453 | int main(int argc, char** argv)
2454 | {
2455 |     if (argc != 2)
2456 |     {
2457 |         fprintf(stderr, "Usage: %s [device_id]\n", argv[0]);
2458 |         return -1;
2459 |     }
2460 | 
2461 |     ncnn::create_gpu_instance();
2462 | 
2463 |     const int gpu_count = ncnn::get_gpu_count();
2464 |     if (gpu_count == 0)
2465 |     {
2466 |         fprintf(stderr, "No vulkan device\n");
2467 |         return -1;
2468 |     }
2469 | 
2470 |     const int device_id = atoi(argv[1]);
2471 |     if (device_id < 0 || device_id >= gpu_count)
2472 |     {
2473 |         fprintf(stderr, "No vulkan device for %d\n", device_id);
2474 |         fprintf(stderr, "Available devices:\n");
2475 | 
2476 |         for (int i = 0; i < gpu_count; i++)
2477 |         {
2478 |             fprintf(stderr, "%d = %s\n", i, ncnn::get_gpu_info(i).device_name());
2479 |         }
2480 | 
2481 |         return -1;
2482 |     }
2483 | 
2484 |     fprintf(stderr, "device       = %s\n", ncnn::get_gpu_info(device_id).device_name());
2485 | 
2486 |     // storage_type / arithmetic_type
2487 |     //      0 = fp32
2488 |     //      1 = fp16
2489 |     //      2 = fp64
2490 |     //      3 = int32
2491 |     //      4 = int16
2492 |     //      5 = int64
2493 |     //      6 = int8
2494 |     //      7 = bf16
2495 |     //      8 = fp8
2496 |     //      9 = bf8
2497 | 
2498 |     // packing_type
2499 |     //      1 = scalar
2500 |     //      4 = vec4 / dotprod
2501 |     //    256 = matrix
2502 | 
2503 |     fprintf(stderr, "\n");
2504 |     fprintf(stderr, "fp32-scalar  = %.2f GFLOPS\n", vkpeak(device_id, 0, 0, 1));
2505 |     fprintf(stderr, "fp32-vec4    = %.2f GFLOPS\n", vkpeak(device_id, 0, 0, 4));
2506 | 
2507 |     fprintf(stderr, "\n");
2508 |     fprintf(stderr, "fp16-scalar  = %.2f GFLOPS\n", vkpeak(device_id, 0, 1, 1));
2509 |     fprintf(stderr, "fp16-vec4    = %.2f GFLOPS\n", vkpeak(device_id, 0, 1, 4));
2510 |     fprintf(stderr, "fp16-matrix  = %.2f GFLOPS\n", vkpeak(device_id, 1, 1, 256));
2511 | 
2512 |     fprintf(stderr, "\n");
2513 |     fprintf(stderr, "fp64-scalar  = %.2f GFLOPS\n", vkpeak(device_id, 2, 2, 1));
2514 |     fprintf(stderr, "fp64-vec4    = %.2f GFLOPS\n", vkpeak(device_id, 2, 2, 4));
2515 | 
2516 |     fprintf(stderr, "\n");
2517 |     fprintf(stderr, "int32-scalar = %.2f GIOPS\n", vkpeak(device_id, 3, 3, 1));
2518 |     fprintf(stderr, "int32-vec4   = %.2f GIOPS\n", vkpeak(device_id, 3, 3, 4));
2519 | 
2520 |     fprintf(stderr, "\n");
2521 |     fprintf(stderr, "int16-scalar = %.2f GIOPS\n", vkpeak(device_id, 3, 4, 1));
2522 |     fprintf(stderr, "int16-vec4   = %.2f GIOPS\n", vkpeak(device_id, 3, 4, 4));
2523 | 
2524 |     fprintf(stderr, "\n");
2525 |     fprintf(stderr, "int64-scalar = %.2f GIOPS\n", vkpeak(device_id, 5, 5, 1));
2526 |     fprintf(stderr, "int64-vec4   = %.2f GIOPS\n", vkpeak(device_id, 5, 5, 4));
2527 | 
2528 |     fprintf(stderr, "\n");
2529 |     fprintf(stderr, "int8-dotprod = %.2f GIOPS\n", vkpeak(device_id, 3, 6, 4));
2530 |     fprintf(stderr, "int8-matrix  = %.2f GIOPS\n", vkpeak(device_id, 3, 6, 256));
2531 | 
2532 |     fprintf(stderr, "\n");
2533 |     fprintf(stderr, "bf16-dotprod = %.2f GFLOPS\n", vkpeak(device_id, 0, 7, 4));
2534 |     fprintf(stderr, "bf16-matrix  = %.2f GFLOPS\n", vkpeak(device_id, 0, 7, 256));
2535 | 
2536 |     fprintf(stderr, "\n");
2537 |     fprintf(stderr, "fp8-matrix   = %.2f GFLOPS\n", vkpeak(device_id, 0, 8, 256));
2538 |     fprintf(stderr, "bf8-matrix   = %.2f GFLOPS\n", vkpeak(device_id, 0, 9, 256));
2539 | 
2540 |     // device_type
2541 |     //      0 = cpu
2542 |     //      1 = gpu
2543 | 
2544 |     fprintf(stderr, "\n");
2545 |     fprintf(stderr, "copy-h2h     = %.2f GBPS\n", vkpeak_copy(device_id, 0, 0));
2546 |     fprintf(stderr, "copy-h2d     = %.2f GBPS\n", vkpeak_copy(device_id, 0, 1));
2547 |     fprintf(stderr, "copy-d2h     = %.2f GBPS\n", vkpeak_copy(device_id, 1, 0));
2548 |     fprintf(stderr, "copy-d2d     = %.2f GBPS\n", vkpeak_copy(device_id, 1, 1));
2549 | 
2550 |     ncnn::destroy_gpu_instance();
2551 | 
2552 |     return 0;
2553 | }
2554 | 


--------------------------------------------------------------------------------