├── .gitmodules ├── LICENSE ├── .github └── workflows │ ├── CI.yml │ └── release.yml ├── README.md ├── CMakeLists.txt └── vkpeak.cpp /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ncnn"] 2 | path = ncnn 3 | url = https://github.com/Tencent/ncnn.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 nihui 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | 4 | env: 5 | VULKANSDK_VERSION: 1.4.309.0 6 | DEVELOPER_DIR: /Applications/Xcode_15.2.app/Contents/Developer 7 | UseMultiToolTask: true 8 | 9 | concurrency: 10 | group: CI-${{ github.ref }} 11 | cancel-in-progress: true 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | windows: 17 | runs-on: windows-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | submodules: 'recursive' 22 | - name: build 23 | run: | 24 | mkdir build; cd build 25 | cmake -A x64 .. 26 | cmake --build . --config Release -j 4 27 | 28 | ubuntu: 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | with: 33 | submodules: 'recursive' 34 | - name: build 35 | run: | 36 | mkdir build && cd build 37 | cmake .. 38 | cmake --build . -j 4 39 | 40 | macos: 41 | runs-on: macos-13 42 | steps: 43 | - uses: actions/checkout@v4 44 | with: 45 | submodules: 'recursive' 46 | - name: vulkansdk 47 | run: | 48 | wget -q https://sdk.lunarg.com/sdk/download/${{ env.VULKANSDK_VERSION }}/mac/vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip?Human=true -O vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip 49 | unzip -q vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip 50 | sudo InstallVulkan-${{ env.VULKANSDK_VERSION }}.app/Contents/MacOS/InstallVulkan-${{ env.VULKANSDK_VERSION }} --root $GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }} --accept-licenses --default-answer --confirm-command install 51 | - name: build-x86_64 52 | run: | 53 | mkdir build-x86_64 && cd build-x86_64 54 | cmake -DCMAKE_OSX_ARCHITECTURES="x86_64" \ 55 | -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \ 56 | .. 57 | cmake --build . 
-j 4 58 | - name: build-arm64 59 | run: | 60 | mkdir build-arm64 && cd build-arm64 61 | cmake -DCMAKE_OSX_ARCHITECTURES="arm64" \ 62 | -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \ 63 | .. 64 | cmake --build . -j 4 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vkpeak 2 | 3 | ![CI](https://github.com/nihui/vkpeak/workflows/CI/badge.svg) 4 | ![download](https://img.shields.io/github/downloads/nihui/vkpeak/total.svg) 5 | 6 | A synthetic benchmarking tool to measure peak capabilities of Vulkan devices. It only measures the peak metrics that can be achieved using vector operations and does not represent a real-world use case. 7 | 8 | ## [Download](https://github.com/nihui/vkpeak/releases) 9 | 10 | Download Windows/Linux/macOS executables for Intel/AMD/NVIDIA/Apple GPUs 11 | 12 | **https://github.com/nihui/vkpeak/releases** 13 | 14 | ## Usage 15 | 16 | ```shell 17 | vkpeak.exe 18 | ``` 19 | 20 | vkpeak will choose the default Vulkan device. 21 | 22 | To run on a specific device, pass its device id: 23 | 24 | ```shell 25 | vkpeak.exe 0 26 | ``` 27 | 28 | The only parameter, 0, is the device id. 29 | 30 | If you encounter a crash or error, try upgrading your GPU driver: 31 | 32 | - Intel: https://downloadcenter.intel.com/product/80939/Graphics-Drivers 33 | - AMD: https://www.amd.com/en/support 34 | - NVIDIA: https://www.nvidia.com/Download/index.aspx 35 | 36 | ## Build from Source 37 | 38 | 1. Clone this project with all submodules 39 | 40 | ```shell 41 | git clone https://github.com/nihui/vkpeak.git 42 | cd vkpeak 43 | git submodule update --init --recursive 44 | ``` 45 | 46 | 2. 
Build with CMake 47 | - You can pass -DVulkan_LIBRARY= option to link static MoltenVK library on MacOS, MoltenVK is part of Vulkan SDK from https://vulkan.lunarg.com/ 48 | 49 | ```shell 50 | mkdir build 51 | cd build 52 | cmake .. 53 | cmake --build . -j 4 54 | ``` 55 | 56 | ## Sample 57 | 58 | NVIDIA RTX5060Ti 16GB 59 | ``` 60 | device = NVIDIA GeForce RTX 5060 Ti 61 | 62 | fp32-scalar = 17137.46 GFLOPS 63 | fp32-vec4 = 16910.07 GFLOPS 64 | 65 | fp16-scalar = 12730.03 GFLOPS 66 | fp16-vec4 = 12715.02 GFLOPS 67 | fp16-matrix = 101485.35 GFLOPS 68 | 69 | fp64-scalar = 398.59 GFLOPS 70 | fp64-vec4 = 394.08 GFLOPS 71 | 72 | int32-scalar = 12703.68 GIOPS 73 | int32-vec4 = 12181.98 GIOPS 74 | 75 | int16-scalar = 12690.05 GIOPS 76 | int16-vec4 = 12208.29 GIOPS 77 | 78 | int64-scalar = 3104.59 GIOPS 79 | int64-vec4 = 2666.86 GIOPS 80 | 81 | int8-dotprod = 16101.59 GIOPS 82 | int8-matrix = 202947.80 GIOPS 83 | 84 | bf16-dotprod = 0.00 GFLOPS 85 | bf16-matrix = 0.00 GFLOPS 86 | 87 | fp8-matrix = 0.00 GFLOPS 88 | bf8-matrix = 0.00 GFLOPS 89 | 90 | copy-h2h = 18.17 GBPS 91 | copy-h2d = 17.93 GBPS 92 | copy-d2h = 18.09 GBPS 93 | copy-d2d = 190.70 GBPS 94 | ``` 95 | 96 | AMD RX9060XT 16GB 97 | ``` 98 | device = AMD Radeon Graphics (RADV GFX1200) 99 | 100 | fp32-scalar = 17606.54 GFLOPS 101 | fp32-vec4 = 12155.22 GFLOPS 102 | 103 | fp16-scalar = 16921.16 GFLOPS 104 | fp16-vec4 = 27833.48 GFLOPS 105 | fp16-matrix = 105337.66 GFLOPS 106 | 107 | fp64-scalar = 442.80 GFLOPS 108 | fp64-vec4 = 437.55 GFLOPS 109 | 110 | int32-scalar = 2804.59 GIOPS 111 | int32-vec4 = 2796.74 GIOPS 112 | 113 | int16-scalar = 15034.62 GIOPS 114 | int16-vec4 = 26356.38 GIOPS 115 | 116 | int64-scalar = 932.14 GIOPS 117 | int64-vec4 = 768.53 GIOPS 118 | 119 | int8-dotprod = 53893.32 GIOPS 120 | int8-matrix = 194476.41 GIOPS 121 | 122 | bf16-dotprod = 24427.68 GFLOPS 123 | bf16-matrix = 105099.82 GFLOPS 124 | 125 | fp8-matrix = 205061.72 GFLOPS 126 | bf8-matrix = 208234.02 GFLOPS 127 | 128 | copy-h2h = 21.05 
GBPS 129 | copy-h2d = 21.17 GBPS 130 | copy-d2h = 23.70 GBPS 131 | copy-d2d = 145.23 GBPS 132 | ``` 133 | 134 | ## Other Open-Source Code Used 135 | 136 | - https://github.com/Tencent/ncnn for fast neural network inference on ALL PLATFORMS 137 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: workflow_dispatch 3 | 4 | env: 5 | VULKANSDK_VERSION: 1.4.309.0 6 | DEVELOPER_DIR: /Applications/Xcode_15.2.app/Contents/Developer 7 | UseMultiToolTask: true 8 | APPLICATION_NAME: vkpeak 9 | 10 | jobs: 11 | 12 | setup: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | APPNAME: ${{ steps.get_appname.outputs.APPNAME }} 16 | VERSION: ${{ steps.get_version.outputs.VERSION }} 17 | steps: 18 | - name: get-appname 19 | id: get_appname 20 | run: echo "APPNAME=${APPLICATION_NAME}" >> $GITHUB_OUTPUT 21 | - name: get-version 22 | id: get_version 23 | run: | 24 | DATE=`date +'%Y%m%d'` 25 | echo "VERSION=${DATE}" >> $GITHUB_OUTPUT 26 | 27 | ubuntu: 28 | needs: [setup] 29 | runs-on: ubuntu-22.04 30 | env: 31 | PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-ubuntu 32 | steps: 33 | - uses: actions/checkout@v4 34 | with: 35 | submodules: 'recursive' 36 | - name: build 37 | run: | 38 | mkdir build && cd build 39 | cmake .. 40 | cmake --build . 
-j 4 41 | - name: package 42 | run: | 43 | mkdir -p ${{ env.PACKAGENAME }} 44 | cp README.md LICENSE ${{ env.PACKAGENAME }} 45 | cp build/${{ needs.setup.outputs.APPNAME }} ${{ env.PACKAGENAME }} 46 | strip -g ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }} 47 | zip -9 -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} 48 | - name: upload 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: ${{ env.PACKAGENAME }} 52 | path: ${{ env.PACKAGENAME }}.zip 53 | 54 | macos: 55 | needs: [setup] 56 | runs-on: macos-13 57 | env: 58 | PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-macos 59 | steps: 60 | - uses: actions/checkout@v4 61 | with: 62 | submodules: 'recursive' 63 | - name: vulkansdk 64 | run: | 65 | wget -q https://sdk.lunarg.com/sdk/download/${{ env.VULKANSDK_VERSION }}/mac/vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip?Human=true -O vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip 66 | unzip -q vulkansdk-macos-${{ env.VULKANSDK_VERSION }}.zip 67 | sudo InstallVulkan-${{ env.VULKANSDK_VERSION }}.app/Contents/MacOS/InstallVulkan-${{ env.VULKANSDK_VERSION }} --root $GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }} --accept-licenses --default-answer --confirm-command install 68 | - name: build-x86_64 69 | run: | 70 | mkdir build-x86_64 && cd build-x86_64 71 | cmake -DCMAKE_OSX_ARCHITECTURES="x86_64" \ 72 | -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \ 73 | .. 74 | cmake --build . -j 4 75 | - name: build-arm64 76 | run: | 77 | mkdir build-arm64 && cd build-arm64 78 | cmake -DCMAKE_OSX_ARCHITECTURES="arm64" \ 79 | -DVulkan_LIBRARY=$GITHUB_WORKSPACE/${{ env.VULKANSDK_VERSION }}/macOS/lib/MoltenVK.xcframework/macos-arm64_x86_64/libMoltenVK.a \ 80 | .. 81 | cmake --build . 
-j 4 82 | - name: package 83 | run: | 84 | mkdir -p ${{ env.PACKAGENAME }} 85 | cp README.md LICENSE ${{ env.PACKAGENAME }} 86 | lipo -create build-x86_64/${{ needs.setup.outputs.APPNAME }} build-arm64/${{ needs.setup.outputs.APPNAME }} -o ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }} 87 | strip ${{ env.PACKAGENAME }}/${{ needs.setup.outputs.APPNAME }} 88 | zip -9 -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} 89 | - name: upload 90 | uses: actions/upload-artifact@v4 91 | with: 92 | name: ${{ env.PACKAGENAME }} 93 | path: ${{ env.PACKAGENAME }}.zip 94 | 95 | windows: 96 | needs: [setup] 97 | runs-on: windows-latest 98 | env: 99 | UseMultiToolTask: true 100 | PACKAGENAME: ${{ needs.setup.outputs.APPNAME }}-${{ needs.setup.outputs.VERSION }}-windows 101 | steps: 102 | - uses: actions/checkout@v4 103 | with: 104 | submodules: 'recursive' 105 | - name: build 106 | run: | 107 | mkdir build; cd build 108 | cmake -A x64 .. 109 | cmake --build . --config Release -j 4 110 | - name: package 111 | run: | 112 | mkdir ${{ env.PACKAGENAME }} 113 | Copy-Item -Verbose -Path "README.md" -Destination "${{ env.PACKAGENAME }}" 114 | Copy-Item -Verbose -Path "LICENSE" -Destination "${{ env.PACKAGENAME }}" 115 | Copy-Item -Verbose -Path "build\Release\${{ needs.setup.outputs.APPNAME }}.exe" -Destination "${{ env.PACKAGENAME }}" 116 | 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} 117 | - name: upload 118 | uses: actions/upload-artifact@v4 119 | with: 120 | name: ${{ env.PACKAGENAME }} 121 | path: ${{ env.PACKAGENAME }}.zip 122 | 123 | release: 124 | needs: [setup, ubuntu, macos, windows] 125 | runs-on: ubuntu-latest 126 | steps: 127 | - name: download 128 | uses: actions/download-artifact@v4 129 | with: 130 | path: artifacts 131 | 132 | - name: create-release 133 | uses: softprops/action-gh-release@v2 134 | with: 135 | token: ${{ secrets.GITHUB_TOKEN }} 136 | tag_name: ${{ needs.setup.outputs.VERSION }} 137 | name: Release ${{ needs.setup.outputs.VERSION }} 
138 | files: artifacts/*/*.zip 139 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_policy(SET CMP0091 NEW) 2 | set(CMAKE_POLICY_DEFAULT_CMP0091 NEW) 3 | set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>") 4 | 5 | project(vkpeak) 6 | cmake_minimum_required(VERSION 3.10) 7 | 8 | if(NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE) 10 | endif() 11 | 12 | # build ncnn library 13 | if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ncnn/CMakeLists.txt") 14 | message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init --recursive\" and try again.") 15 | endif() 16 | 17 | option(NCNN_INSTALL_SDK "" OFF) 18 | option(NCNN_STRING "" OFF) 19 | option(NCNN_STDIO "" OFF) 20 | option(NCNN_C_API "" OFF) 21 | option(NCNN_PIXEL "" OFF) 22 | option(NCNN_PIXEL_ROTATE "" OFF) 23 | option(NCNN_PIXEL_AFFINE "" OFF) 24 | option(NCNN_PIXEL_DRAWING "" OFF) 25 | option(NCNN_PLATFORM_API "" OFF) 26 | option(NCNN_VULKAN "" ON) 27 | option(NCNN_BUILD_BENCHMARK "" OFF) 28 | option(NCNN_BUILD_TESTS "" OFF) 29 | option(NCNN_BUILD_TOOLS "" OFF) 30 | option(NCNN_BUILD_EXAMPLES "" OFF) 31 | option(NCNN_INT8 "" OFF) 32 | option(NCNN_BF16 "" OFF) 33 | option(NCNN_OPENMP "" OFF) 34 | option(NCNN_THREADS "" ON) 35 | option(NCNN_DISABLE_RTTI "" ON) 36 | option(NCNN_DISABLE_EXCEPTION "" ON) 37 | 38 | option(WITH_LAYER_absval "" OFF) 39 | option(WITH_LAYER_argmax "" OFF) 40 | option(WITH_LAYER_batchnorm "" OFF) 41 | option(WITH_LAYER_bias "" OFF) 42 | option(WITH_LAYER_bnll "" OFF) 43 | option(WITH_LAYER_concat "" OFF) 44 | option(WITH_LAYER_convolution "" OFF) 45 | option(WITH_LAYER_crop "" OFF) 46 | option(WITH_LAYER_deconvolution "" OFF) 47 | option(WITH_LAYER_dropout "" OFF) 48 | option(WITH_LAYER_eltwise "" OFF) 49 | option(WITH_LAYER_elu "" OFF) 50 | 
option(WITH_LAYER_embed "" OFF) 51 | option(WITH_LAYER_exp "" OFF) 52 | option(WITH_LAYER_flatten "" OFF) 53 | option(WITH_LAYER_innerproduct "" OFF) 54 | option(WITH_LAYER_input "" OFF) 55 | option(WITH_LAYER_log "" OFF) 56 | option(WITH_LAYER_lrn "" OFF) 57 | option(WITH_LAYER_memorydata "" OFF) 58 | option(WITH_LAYER_mvn "" OFF) 59 | option(WITH_LAYER_pooling "" OFF) 60 | option(WITH_LAYER_power "" OFF) 61 | option(WITH_LAYER_prelu "" OFF) 62 | option(WITH_LAYER_proposal "" OFF) 63 | option(WITH_LAYER_reduction "" OFF) 64 | option(WITH_LAYER_relu "" OFF) 65 | option(WITH_LAYER_reshape "" OFF) 66 | option(WITH_LAYER_roipooling "" OFF) 67 | option(WITH_LAYER_scale "" OFF) 68 | option(WITH_LAYER_sigmoid "" OFF) 69 | option(WITH_LAYER_slice "" OFF) 70 | option(WITH_LAYER_softmax "" OFF) 71 | option(WITH_LAYER_split "" OFF) 72 | option(WITH_LAYER_spp "" OFF) 73 | option(WITH_LAYER_tanh "" OFF) 74 | option(WITH_LAYER_threshold "" OFF) 75 | option(WITH_LAYER_tile "" OFF) 76 | option(WITH_LAYER_rnn "" OFF) 77 | option(WITH_LAYER_lstm "" OFF) 78 | option(WITH_LAYER_binaryop "" OFF) 79 | option(WITH_LAYER_unaryop "" OFF) 80 | option(WITH_LAYER_convolutiondepthwise "" OFF) 81 | option(WITH_LAYER_padding "" OFF) 82 | option(WITH_LAYER_squeeze "" OFF) 83 | option(WITH_LAYER_expanddims "" OFF) 84 | option(WITH_LAYER_normalize "" OFF) 85 | option(WITH_LAYER_permute "" OFF) 86 | option(WITH_LAYER_priorbox "" OFF) 87 | option(WITH_LAYER_detectionoutput "" OFF) 88 | option(WITH_LAYER_interp "" OFF) 89 | option(WITH_LAYER_deconvolutiondepthwise "" OFF) 90 | option(WITH_LAYER_shufflechannel "" OFF) 91 | option(WITH_LAYER_instancenorm "" OFF) 92 | option(WITH_LAYER_clip "" OFF) 93 | option(WITH_LAYER_reorg "" OFF) 94 | option(WITH_LAYER_yolodetectionoutput "" OFF) 95 | option(WITH_LAYER_quantize "" OFF) 96 | option(WITH_LAYER_dequantize "" OFF) 97 | option(WITH_LAYER_yolov3detectionoutput "" OFF) 98 | option(WITH_LAYER_psroipooling "" OFF) 99 | option(WITH_LAYER_roialign "" OFF) 100 
| option(WITH_LAYER_packing "" ON) 101 | option(WITH_LAYER_requantize "" OFF) 102 | option(WITH_LAYER_cast "" OFF) 103 | option(WITH_LAYER_hardsigmoid "" OFF) 104 | option(WITH_LAYER_selu "" OFF) 105 | option(WITH_LAYER_hardswish "" OFF) 106 | option(WITH_LAYER_noop "" OFF) 107 | option(WITH_LAYER_pixelshuffle "" OFF) 108 | option(WITH_LAYER_deepcopy "" OFF) 109 | option(WITH_LAYER_mish "" OFF) 110 | option(WITH_LAYER_statisticspooling "" OFF) 111 | option(WITH_LAYER_swish "" OFF) 112 | option(WITH_LAYER_gemm "" OFF) 113 | option(WITH_LAYER_groupnorm "" OFF) 114 | option(WITH_LAYER_layernorm "" OFF) 115 | option(WITH_LAYER_softplus "" OFF) 116 | option(WITH_LAYER_gru "" OFF) 117 | option(WITH_LAYER_multiheadattention "" OFF) 118 | option(WITH_LAYER_gelu "" OFF) 119 | option(WITH_LAYER_convolution1d "" OFF) 120 | option(WITH_LAYER_pooling1d "" OFF) 121 | option(WITH_LAYER_convolutiondepthwise1d "" OFF) 122 | option(WITH_LAYER_convolution3d "" OFF) 123 | option(WITH_LAYER_convolutiondepthwise3d "" OFF) 124 | option(WITH_LAYER_pooling3d "" OFF) 125 | option(WITH_LAYER_matmul "" OFF) 126 | option(WITH_LAYER_deconvolution1d "" OFF) 127 | option(WITH_LAYER_deconvolutiondepthwise1d "" OFF) 128 | option(WITH_LAYER_deconvolution3d "" OFF) 129 | option(WITH_LAYER_deconvolutiondepthwise3d "" OFF) 130 | option(WITH_LAYER_einsum "" OFF) 131 | option(WITH_LAYER_deformableconv2d "" OFF) 132 | option(WITH_LAYER_glu "" OFF) 133 | option(WITH_LAYER_fold "" OFF) 134 | option(WITH_LAYER_unfold "" OFF) 135 | option(WITH_LAYER_gridsample "" OFF) 136 | option(WITH_LAYER_cumulativesum "" OFF) 137 | option(WITH_LAYER_copyto "" OFF) 138 | option(WITH_LAYER_erf "" OFF) 139 | option(WITH_LAYER_diag "" OFF) 140 | option(WITH_LAYER_celu "" OFF) 141 | option(WITH_LAYER_shrink "" OFF) 142 | option(WITH_LAYER_rmsnorm "" OFF) 143 | option(WITH_LAYER_spectrogram "" OFF) 144 | option(WITH_LAYER_inversespectrogram "" OFF) 145 | option(WITH_LAYER_flip "" OFF) 146 | 147 | add_subdirectory(ncnn) 148 | 
149 | add_executable(vkpeak vkpeak.cpp) 150 | 151 | set_target_properties(vkpeak PROPERTIES CXX_STANDARD 11) 152 | 153 | target_link_libraries(vkpeak ncnn) 154 | -------------------------------------------------------------------------------- /vkpeak.cpp: -------------------------------------------------------------------------------- 1 | // vkpeak implemented with ncnn library 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define REPEAT_1(...) #__VA_ARGS__ 9 | #define REPEAT_2(...) REPEAT_1(__VA_ARGS__) REPEAT_1(__VA_ARGS__) 10 | #define REPEAT_4(...) REPEAT_2(__VA_ARGS__) REPEAT_2(__VA_ARGS__) 11 | #define REPEAT_8(...) REPEAT_4(__VA_ARGS__) REPEAT_4(__VA_ARGS__) 12 | #define REPEAT_16(...) REPEAT_8(__VA_ARGS__) REPEAT_8(__VA_ARGS__) 13 | 14 | static const char glsl_p1_data[] = R"( 15 | #version 450 16 | 17 | layout (constant_id = 0) const int loop = 1; 18 | 19 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 20 | 21 | void main() 22 | { 23 | const uint gx = gl_GlobalInvocationID.x; 24 | const uint lx = gl_LocalInvocationID.x; 25 | 26 | afp c = afp(gx); 27 | 28 | afp a = c; 29 | afp b = afp(lx); 30 | 31 | for (int i = 0; i < loop; i++) 32 | {)" 33 | REPEAT_16(c = a * c + b;) 34 | R"(} 35 | 36 | c_blob_data[gx] = float(c); 37 | } 38 | )"; 39 | 40 | static const char glsl_p1_dual_data[] = R"( 41 | #version 450 42 | 43 | layout (constant_id = 0) const int loop = 1; 44 | 45 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 46 | 47 | void main() 48 | { 49 | const uint gx = gl_GlobalInvocationID.x; 50 | const uint lx = gl_LocalInvocationID.x; 51 | 52 | afp c0 = afp(gx); 53 | afp c1 = afp(lx); 54 | 55 | afp a = c0; 56 | afp b = c1; 57 | 58 | for (int i = 0; i < loop; i++) 59 | {)" 60 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 61 | R"(} 62 | 63 | c0 = c0 + c1; 64 | c_blob_data[gx] = float(c0); 65 | } 66 | )"; 67 | 68 | static const char glsl_p4_data[] = R"( 69 | #version 450 70 | 71 | layout (constant_id 
= 0) const int loop = 1; 72 | 73 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 74 | 75 | void main() 76 | { 77 | const uint gx = gl_GlobalInvocationID.x; 78 | const uint lx = gl_LocalInvocationID.x; 79 | 80 | afpvec4 c = afpvec4(gx); 81 | 82 | afpvec4 a = c + afpvec4(0,1,2,-3); 83 | afpvec4 b = afpvec4(lx) + afpvec4(2,3,5,-7); 84 | 85 | for (int i = 0; i < loop; i++) 86 | {)" 87 | REPEAT_16(c = a * c + b;) 88 | R"(} 89 | 90 | c_blob_data[gx] = float((c[0] + c[1]) + (c[2] + c[3])); 91 | } 92 | )"; 93 | 94 | static const char glsl_p4_dual_data[] = R"( 95 | #version 450 96 | 97 | layout (constant_id = 0) const int loop = 1; 98 | 99 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 100 | 101 | void main() 102 | { 103 | const uint gx = gl_GlobalInvocationID.x; 104 | const uint lx = gl_LocalInvocationID.x; 105 | 106 | afpvec4 c0 = afpvec4(gx); 107 | afpvec4 c1 = afpvec4(lx); 108 | 109 | afpvec4 a = c0 + afpvec4(0,1,2,-3); 110 | afpvec4 b = c1 + afpvec4(2,3,5,-7); 111 | 112 | for (int i = 0; i < loop; i++) 113 | {)" 114 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 115 | R"(} 116 | 117 | c0 = c0 + c1; 118 | c_blob_data[gx] = float((c0[0] + c0[1]) + (c0[2] + c0[3])); 119 | } 120 | )"; 121 | 122 | static const char glsl_fp64_p1_data[] = R"( 123 | #version 450 124 | 125 | layout (constant_id = 0) const int loop = 1; 126 | 127 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; }; 128 | 129 | void main() 130 | { 131 | const uint gx = gl_GlobalInvocationID.x; 132 | const uint lx = gl_LocalInvocationID.x; 133 | 134 | double c = double(gx); 135 | 136 | double a = c; 137 | double b = double(lx); 138 | 139 | for (int i = 0; i < loop; i++) 140 | {)" 141 | REPEAT_16(c = a * c + b;) 142 | R"(} 143 | 144 | c_blob_data[gx] = c; 145 | } 146 | )"; 147 | 148 | static const char glsl_fp64_p1_dual_data[] = R"( 149 | #version 450 150 | 151 | layout (constant_id = 0) const int loop = 1; 152 | 153 | layout (binding = 0) 
writeonly buffer c_blob { double c_blob_data[]; }; 154 | 155 | void main() 156 | { 157 | const uint gx = gl_GlobalInvocationID.x; 158 | const uint lx = gl_LocalInvocationID.x; 159 | 160 | double c0 = double(gx); 161 | double c1 = double(lx); 162 | 163 | double a = c0; 164 | double b = c1; 165 | 166 | for (int i = 0; i < loop; i++) 167 | {)" 168 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 169 | R"(} 170 | 171 | c0 = c0 + c1; 172 | c_blob_data[gx] = c0; 173 | } 174 | )"; 175 | 176 | static const char glsl_fp64_p4_data[] = R"( 177 | #version 450 178 | 179 | layout (constant_id = 0) const int loop = 1; 180 | 181 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; }; 182 | 183 | void main() 184 | { 185 | const uint gx = gl_GlobalInvocationID.x; 186 | const uint lx = gl_LocalInvocationID.x; 187 | 188 | dvec4 c = dvec4(gx); 189 | 190 | dvec4 a = c + dvec4(0,1,2,-3); 191 | dvec4 b = dvec4(lx) + dvec4(2,3,5,-7); 192 | 193 | for (int i = 0; i < loop; i++) 194 | {)" 195 | REPEAT_16(c = a * c + b;) 196 | R"(} 197 | 198 | c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]); 199 | } 200 | )"; 201 | 202 | static const char glsl_fp64_p4_dual_data[] = R"( 203 | #version 450 204 | 205 | layout (constant_id = 0) const int loop = 1; 206 | 207 | layout (binding = 0) writeonly buffer c_blob { double c_blob_data[]; }; 208 | 209 | void main() 210 | { 211 | const uint gx = gl_GlobalInvocationID.x; 212 | const uint lx = gl_LocalInvocationID.x; 213 | 214 | dvec4 c0 = dvec4(gx); 215 | dvec4 c1 = dvec4(lx); 216 | 217 | dvec4 a = c0 + dvec4(0,1,2,-3); 218 | dvec4 b = c1 + dvec4(2,3,5,-7); 219 | 220 | for (int i = 0; i < loop; i++) 221 | {)" 222 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 223 | R"(} 224 | 225 | c0 = c0 + c1; 226 | c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]); 227 | } 228 | )"; 229 | 230 | static const char glsl_int32_p1_data[] = R"( 231 | #version 450 232 | 233 | layout (constant_id = 0) const int loop = 1; 234 | 235 | layout (binding = 0) writeonly 
buffer c_blob { int c_blob_data[]; }; 236 | 237 | void main() 238 | { 239 | const uint gx = gl_GlobalInvocationID.x; 240 | const uint lx = gl_LocalInvocationID.x; 241 | 242 | int c = int(gx); 243 | 244 | int a = c; 245 | int b = int(lx); 246 | 247 | for (int i = 0; i < loop; i++) 248 | {)" 249 | REPEAT_16(c = a * c + b;) 250 | R"(} 251 | 252 | c_blob_data[gx] = c; 253 | } 254 | )"; 255 | 256 | static const char glsl_int32_p1_dual_data[] = R"( 257 | #version 450 258 | 259 | layout (constant_id = 0) const int loop = 1; 260 | 261 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 262 | 263 | void main() 264 | { 265 | const uint gx = gl_GlobalInvocationID.x; 266 | const uint lx = gl_LocalInvocationID.x; 267 | 268 | int c0 = int(gx); 269 | int c1 = int(lx); 270 | 271 | int a = c0; 272 | int b = c1; 273 | 274 | for (int i = 0; i < loop; i++) 275 | {)" 276 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 277 | R"(} 278 | 279 | c0 = c0 + c1; 280 | c_blob_data[gx] = c0; 281 | } 282 | )"; 283 | 284 | static const char glsl_int32_p4_data[] = R"( 285 | #version 450 286 | 287 | layout (constant_id = 0) const int loop = 1; 288 | 289 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 290 | 291 | void main() 292 | { 293 | const uint gx = gl_GlobalInvocationID.x; 294 | const uint lx = gl_LocalInvocationID.x; 295 | 296 | ivec4 c = ivec4(gx); 297 | 298 | ivec4 a = c + ivec4(0,1,2,-3); 299 | ivec4 b = ivec4(lx) + ivec4(2,3,5,-7); 300 | 301 | for (int i = 0; i < loop; i++) 302 | {)" 303 | REPEAT_16(c = a * c + b;) 304 | R"(} 305 | 306 | c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]); 307 | } 308 | )"; 309 | 310 | static const char glsl_int32_p4_dual_data[] = R"( 311 | #version 450 312 | 313 | layout (constant_id = 0) const int loop = 1; 314 | 315 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 316 | 317 | void main() 318 | { 319 | const uint gx = gl_GlobalInvocationID.x; 320 | const uint lx = gl_LocalInvocationID.x; 321 | 322 
| ivec4 c0 = ivec4(gx); 323 | ivec4 c1 = ivec4(lx); 324 | 325 | ivec4 a = c0 + ivec4(0,1,2,-3); 326 | ivec4 b = c1 + ivec4(2,3,5,-7); 327 | 328 | for (int i = 0; i < loop; i++) 329 | {)" 330 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 331 | R"(} 332 | 333 | c0 = c0 + c1; 334 | c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]); 335 | } 336 | )"; 337 | 338 | static const char glsl_int16_p1_data[] = R"( 339 | #version 450 340 | 341 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require 342 | 343 | layout (constant_id = 0) const int loop = 1; 344 | 345 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 346 | 347 | void main() 348 | { 349 | const uint gx = gl_GlobalInvocationID.x; 350 | const uint lx = gl_LocalInvocationID.x; 351 | 352 | int16_t c = int16_t(gx); 353 | 354 | int16_t a = c; 355 | int16_t b = int16_t(lx); 356 | 357 | for (int i = 0; i < loop; i++) 358 | {)" 359 | REPEAT_16(c = a * c + b;) 360 | R"(} 361 | 362 | c_blob_data[gx] = int(c); 363 | } 364 | )"; 365 | 366 | static const char glsl_int16_p1_dual_data[] = R"( 367 | #version 450 368 | 369 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require 370 | 371 | layout (constant_id = 0) const int loop = 1; 372 | 373 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 374 | 375 | void main() 376 | { 377 | const uint gx = gl_GlobalInvocationID.x; 378 | const uint lx = gl_LocalInvocationID.x; 379 | 380 | int16_t c0 = int16_t(gx); 381 | int16_t c1 = int16_t(lx); 382 | 383 | int16_t a = c0; 384 | int16_t b = c1; 385 | 386 | for (int i = 0; i < loop; i++) 387 | {)" 388 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 389 | R"(} 390 | 391 | c0 = c0 + c1; 392 | c_blob_data[gx] = int(c0); 393 | } 394 | )"; 395 | 396 | static const char glsl_int16_p4_data[] = R"( 397 | #version 450 398 | 399 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require 400 | 401 | layout (constant_id = 0) const int loop = 1; 402 | 403 | layout (binding = 0) 
writeonly buffer c_blob { int c_blob_data[]; }; 404 | 405 | void main() 406 | { 407 | const uint gx = gl_GlobalInvocationID.x; 408 | const uint lx = gl_LocalInvocationID.x; 409 | 410 | i16vec4 c = i16vec4(gx); 411 | 412 | i16vec4 a = c + i16vec4(0,1,2,-3); 413 | i16vec4 b = i16vec4(lx) + i16vec4(2,3,5,-7); 414 | 415 | for (int i = 0; i < loop; i++) 416 | {)" 417 | REPEAT_16(c = a * c + b;) 418 | R"(} 419 | 420 | c_blob_data[gx] = int((c[0] + c[1]) + (c[2] + c[3])); 421 | } 422 | )"; 423 | 424 | static const char glsl_int16_p4_dual_data[] = R"( 425 | #version 450 426 | 427 | #extension GL_EXT_shader_explicit_arithmetic_types_int16: require 428 | 429 | layout (constant_id = 0) const int loop = 1; 430 | 431 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 432 | 433 | void main() 434 | { 435 | const uint gx = gl_GlobalInvocationID.x; 436 | const uint lx = gl_LocalInvocationID.x; 437 | 438 | i16vec4 c0 = i16vec4(gx); 439 | i16vec4 c1 = i16vec4(lx); 440 | 441 | i16vec4 a = c0 + i16vec4(0,1,2,-3); 442 | i16vec4 b = c1 + i16vec4(2,3,5,-7); 443 | 444 | for (int i = 0; i < loop; i++) 445 | {)" 446 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 447 | R"(} 448 | 449 | c0 = c0 + c1; 450 | c_blob_data[gx] = int((c0[0] + c0[1]) + (c0[2] + c0[3])); 451 | } 452 | )"; 453 | 454 | static const char glsl_int64_p1_data[] = R"( 455 | #version 450 456 | 457 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require 458 | 459 | layout (constant_id = 0) const int loop = 1; 460 | 461 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; }; 462 | 463 | void main() 464 | { 465 | const uint gx = gl_GlobalInvocationID.x; 466 | const uint lx = gl_LocalInvocationID.x; 467 | 468 | int64_t c = int64_t(gx); 469 | 470 | int64_t a = c; 471 | int64_t b = int64_t(lx); 472 | 473 | for (int i = 0; i < loop; i++) 474 | {)" 475 | REPEAT_16(c = a * c + b;) 476 | R"(} 477 | 478 | c_blob_data[gx] = c; 479 | } 480 | )"; 481 | 482 | static const char 
glsl_int64_p1_dual_data[] = R"( 483 | #version 450 484 | 485 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require 486 | 487 | layout (constant_id = 0) const int loop = 1; 488 | 489 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; }; 490 | 491 | void main() 492 | { 493 | const uint gx = gl_GlobalInvocationID.x; 494 | const uint lx = gl_LocalInvocationID.x; 495 | 496 | int64_t c0 = int64_t(gx); 497 | int64_t c1 = int64_t(lx); 498 | 499 | int64_t a = c0; 500 | int64_t b = c1; 501 | 502 | for (int i = 0; i < loop; i++) 503 | {)" 504 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 505 | R"(} 506 | 507 | c0 = c0 + c1; 508 | c_blob_data[gx] = c0; 509 | } 510 | )"; 511 | 512 | static const char glsl_int64_p4_data[] = R"( 513 | #version 450 514 | 515 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require 516 | 517 | layout (constant_id = 0) const int loop = 1; 518 | 519 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; }; 520 | 521 | void main() 522 | { 523 | const uint gx = gl_GlobalInvocationID.x; 524 | const uint lx = gl_LocalInvocationID.x; 525 | 526 | i64vec4 c = i64vec4(gx); 527 | 528 | i64vec4 a = c + i64vec4(0,1,2,-3); 529 | i64vec4 b = i64vec4(lx) + i64vec4(2,3,5,-7); 530 | 531 | for (int i = 0; i < loop; i++) 532 | {)" 533 | REPEAT_16(c = a * c + b;) 534 | R"(} 535 | 536 | c_blob_data[gx] = (c[0] + c[1]) + (c[2] + c[3]); 537 | } 538 | )"; 539 | 540 | static const char glsl_int64_p4_dual_data[] = R"( 541 | #version 450 542 | 543 | #extension GL_EXT_shader_explicit_arithmetic_types_int64: require 544 | 545 | layout (constant_id = 0) const int loop = 1; 546 | 547 | layout (binding = 0) writeonly buffer c_blob { int64_t c_blob_data[]; }; 548 | 549 | void main() 550 | { 551 | const uint gx = gl_GlobalInvocationID.x; 552 | const uint lx = gl_LocalInvocationID.x; 553 | 554 | i64vec4 c0 = i64vec4(gx); 555 | i64vec4 c1 = i64vec4(lx); 556 | 557 | i64vec4 a = c0 + i64vec4(0,1,2,-3); 558 | i64vec4 b = c1 + 
i64vec4(2,3,5,-7); 559 | 560 | for (int i = 0; i < loop; i++) 561 | {)" 562 | REPEAT_8(c0 = a * c0 + b; c1 = a * c1 + b;) 563 | R"(} 564 | 565 | c0 = c0 + c1; 566 | c_blob_data[gx] = (c0[0] + c0[1]) + (c0[2] + c0[3]); 567 | } 568 | )"; 569 | 570 | /* GLSL compute shader source, packed int8 dot product (requires GL_EXT_integer_dot_product): REPEAT_16 chains c = dotPacked4x8AccSatEXT(a, b, c) on 32-bit ints (REPEAT_16 is defined outside this view); c is written to c_blob_data[gx]. */ static const char glsl_int8_p4_data[] = R"( 571 | #version 450 572 | 573 | #extension GL_EXT_integer_dot_product: require 574 | 575 | layout (constant_id = 0) const int loop = 1; 576 | 577 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 578 | 579 | void main() 580 | { 581 | const uint gx = gl_GlobalInvocationID.x; 582 | const uint lx = gl_LocalInvocationID.x; 583 | 584 | int c = int(gx); 585 | 586 | int a = int(gx); 587 | int b = int(lx); 588 | 589 | for (int i = 0; i < loop; i++) 590 | {)" 591 | REPEAT_16(c = dotPacked4x8AccSatEXT(a, b, c);) 592 | R"(} 593 | 594 | c_blob_data[gx] = c; 595 | } 596 | )"; 597 | 598 | /* Dual-chain variant of the packed int8 dot-product shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is written to c_blob_data[gx]. */ static const char glsl_int8_p4_dual_data[] = R"( 599 | #version 450 600 | 601 | #extension GL_EXT_integer_dot_product: require 602 | 603 | layout (constant_id = 0) const int loop = 1; 604 | 605 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 606 | 607 | void main() 608 | { 609 | const uint gx = gl_GlobalInvocationID.x; 610 | const uint lx = gl_LocalInvocationID.x; 611 | 612 | int c0 = int(gx); 613 | int c1 = int(lx); 614 | 615 | int a = int(gx); 616 | int b = int(lx); 617 | 618 | for (int i = 0; i < loop; i++) 619 | {)" 620 | REPEAT_8(c0 = dotPacked4x8AccSatEXT(a, b, c0); c1 = dotPacked4x8AccSatEXT(a, b, c1);) 621 | R"(} 622 | 623 | c0 = c0 + c1; 624 | c_blob_data[gx] = c0; 625 | } 626 | )"; 627 | 628 | /* GLSL compute shader source, bfloat16 vec4 dot product (requires GL_EXT_bfloat16): inside REPEAT_4 every dot() result has its bits written back into one component of a, so each dot depends on the previous one; the last c is stored as float in c_blob_data[gx]. */ static const char glsl_bf16_p4_data[] = R"( 629 | #version 450 630 | 631 | #extension GL_EXT_shader_explicit_arithmetic_types : require 632 | #extension GL_EXT_bfloat16: require 633 | 634 | layout (constant_id = 0) const int loop = 1; 635 | 636 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 637 | 638 | void main() 639 | { 640 | const uint gx
= gl_GlobalInvocationID.x; 641 | const uint lx = gl_LocalInvocationID.x; 642 | 643 | bfloat16_t c = bfloat16_t(gx); 644 | 645 | u16vec4 a = uint16_t(gx) + u16vec4(0,1,2,3); 646 | bf16vec4 b = uintBitsToBFloat16EXT(uint16_t(lx) + u16vec4(2,3,5,7)); 647 | 648 | for (int i = 0; i < loop; i++) 649 | {)" 650 | REPEAT_4(c = dot(uintBitsToBFloat16EXT(a), b); a.x = bfloat16BitsToUintEXT(c); 651 | c = dot(uintBitsToBFloat16EXT(a), b); a.y = bfloat16BitsToUintEXT(c); 652 | c = dot(uintBitsToBFloat16EXT(a), b); a.z = bfloat16BitsToUintEXT(c); 653 | c = dot(uintBitsToBFloat16EXT(a), b); a.w = bfloat16BitsToUintEXT(c);) 654 | R"(} 655 | 656 | c_blob_data[gx] = float(c); 657 | } 658 | )"; 659 | 660 | /* Dual-chain variant of the bfloat16 dot-product shader: two operand vectors (a0, a1) and two results (c0, c1) are updated alternately inside REPEAT_4 (defined outside this view); float(c0) + float(c1) is stored in c_blob_data[gx]. */ static const char glsl_bf16_p4_dual_data[] = R"( 661 | #version 450 662 | 663 | #extension GL_EXT_shader_explicit_arithmetic_types : require 664 | #extension GL_EXT_bfloat16: require 665 | 666 | layout (constant_id = 0) const int loop = 1; 667 | 668 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 669 | 670 | void main() 671 | { 672 | const uint gx = gl_GlobalInvocationID.x; 673 | const uint lx = gl_LocalInvocationID.x; 674 | 675 | bfloat16_t c0 = bfloat16_t(gx); 676 | bfloat16_t c1 = bfloat16_t(lx); 677 | 678 | u16vec4 a0 = uint16_t(gx) + u16vec4(0,1,2,3); 679 | u16vec4 a1 = uint16_t(gx) + u16vec4(10,21,32,43); 680 | bf16vec4 b = uintBitsToBFloat16EXT(uint16_t(lx) + u16vec4(2,3,5,7)); 681 | 682 | for (int i = 0; i < loop; i++) 683 | {)" 684 | REPEAT_4(c0 = dot(uintBitsToBFloat16EXT(a0), b); a0.x = bfloat16BitsToUintEXT(c0); 685 | c1 = dot(uintBitsToBFloat16EXT(a1), b); a1.y = bfloat16BitsToUintEXT(c1); 686 | c0 = dot(uintBitsToBFloat16EXT(a0), b); a0.z = bfloat16BitsToUintEXT(c0); 687 | c1 = dot(uintBitsToBFloat16EXT(a1), b); a1.w = bfloat16BitsToUintEXT(c1);) 688 | R"(} 689 | 690 | c_blob_data[gx] = float(c0) + float(c1); 691 | } 692 | )"; 693 | 694 | /* GLSL compute shader source, fp16 cooperative-matrix multiply-add: the ncnn_VK_KHR_cooperative_matrix / ncnn_VK_NV_cooperative_matrix macros pick the KHR or NV code path; M, N, K and SCOPE are specialization constants; REPEAT_16 chains c = coopMatMulAdd(a, b, c) (NV path: coopMatMulAddNV on 16-bit fcoopmatNV operands); the result is stored with offset gx * (M * N) / 2 and stride N / 2. NOTE(review): the halving presumably accounts for 16-bit results in a float-typed buffer - confirm. */ static const char glsl_fp16_matrix_data[] = R"( 695 | #version 450 696 | 697 | #extension
GL_EXT_shader_explicit_arithmetic_types_float16: require 698 | #extension GL_KHR_memory_scope_semantics: require 699 | #extension GL_EXT_shader_explicit_arithmetic_types: require 700 | #if ncnn_VK_KHR_cooperative_matrix 701 | #extension GL_KHR_cooperative_matrix: require 702 | #elif ncnn_VK_NV_cooperative_matrix 703 | #extension GL_NV_cooperative_matrix: require 704 | #endif 705 | 706 | layout (constant_id = 0) const int loop = 1; 707 | layout (constant_id = 1) const int M = 1; 708 | layout (constant_id = 2) const int N = 1; 709 | layout (constant_id = 3) const int K = 1; 710 | layout (constant_id = 4) const int SCOPE = 3; 711 | 712 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 713 | 714 | void main() 715 | { 716 | const uint gx = gl_GlobalInvocationID.x; 717 | const uint lx = gl_LocalInvocationID.x; 718 | 719 | #if ncnn_VK_KHR_cooperative_matrix 720 | coopmat a = coopmat(float(gx)); 721 | coopmat b = coopmat(float(lx)); 722 | 723 | coopmat c = coopmat(float(gx)); 724 | 725 | for (int i = 0; i < loop; i++) 726 | {)" 727 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 728 | R"(} 729 | 730 | coopMatStore(c, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor); 731 | #elif ncnn_VK_NV_cooperative_matrix 732 | fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx)); 733 | fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx)); 734 | 735 | fcoopmatNV<16, SCOPE, M, N> c = fcoopmatNV<16, SCOPE, M, N>(float(gx)); 736 | 737 | for (int i = 0; i < loop; i++) 738 | {)" 739 | REPEAT_16(c = coopMatMulAddNV(a, b, c);) 740 | R"(} 741 | 742 | coopMatStoreNV(c, c_blob_data, gx * (M * N) / 2, N / 2, false); 743 | #endif 744 | } 745 | )"; 746 | 747 | /* Dual-chain variant of the fp16 cooperative-matrix shader (KHR or NV path chosen by the ncnn_VK_* macros): REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) / 2 and stride N / 2. */ static const char glsl_fp16_matrix_dual_data[] = R"( 748 | #version 450 749 | 750 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 751 | #extension GL_KHR_memory_scope_semantics: require 752 | #extension GL_EXT_shader_explicit_arithmetic_types:
require 753 | #if ncnn_VK_KHR_cooperative_matrix 754 | #extension GL_KHR_cooperative_matrix: require 755 | #elif ncnn_VK_NV_cooperative_matrix 756 | #extension GL_NV_cooperative_matrix: require 757 | #endif 758 | 759 | layout (constant_id = 0) const int loop = 1; 760 | layout (constant_id = 1) const int M = 1; 761 | layout (constant_id = 2) const int N = 1; 762 | layout (constant_id = 3) const int K = 1; 763 | layout (constant_id = 4) const int SCOPE = 3; 764 | 765 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 766 | 767 | void main() 768 | { 769 | const uint gx = gl_GlobalInvocationID.x; 770 | const uint lx = gl_LocalInvocationID.x; 771 | 772 | #if ncnn_VK_KHR_cooperative_matrix 773 | coopmat a = coopmat(float(gx)); 774 | coopmat b = coopmat(float(lx)); 775 | 776 | coopmat c0 = coopmat(float(gx)); 777 | coopmat c1 = coopmat(float(lx)); 778 | 779 | for (int i = 0; i < loop; i++) 780 | {)" 781 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 782 | R"(} 783 | 784 | c0 = c0 + c1; 785 | coopMatStore(c0, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor); 786 | #elif ncnn_VK_NV_cooperative_matrix 787 | fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx)); 788 | fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx)); 789 | 790 | fcoopmatNV<16, SCOPE, M, N> c0 = fcoopmatNV<16, SCOPE, M, N>(float(gx)); 791 | fcoopmatNV<16, SCOPE, M, N> c1 = fcoopmatNV<16, SCOPE, M, N>(float(lx)); 792 | 793 | for (int i = 0; i < loop; i++) 794 | {)" 795 | REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);) 796 | R"(} 797 | 798 | c0 = c0 + c1; 799 | coopMatStoreNV(c0, c_blob_data, gx * (M * N) / 2, N / 2, false); 800 | #endif 801 | } 802 | )"; 803 | 804 | /* GLSL compute shader source, cooperative-matrix multiply-add with fp16 inputs and an fp32 accumulator (the NV path declares a and b as 16-bit and c as 32-bit fcoopmatNV): REPEAT_16 chains the multiply-add; the result is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp16_fp32_matrix_data[] = R"( 805 | #version 450 806 | 807 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 808 | #extension GL_KHR_memory_scope_semantics: require 809 |
#extension GL_EXT_shader_explicit_arithmetic_types: require 810 | #if ncnn_VK_KHR_cooperative_matrix 811 | #extension GL_KHR_cooperative_matrix: require 812 | #elif ncnn_VK_NV_cooperative_matrix 813 | #extension GL_NV_cooperative_matrix: require 814 | #endif 815 | 816 | layout (constant_id = 0) const int loop = 1; 817 | layout (constant_id = 1) const int M = 1; 818 | layout (constant_id = 2) const int N = 1; 819 | layout (constant_id = 3) const int K = 1; 820 | layout (constant_id = 4) const int SCOPE = 3; 821 | 822 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 823 | 824 | void main() 825 | { 826 | const uint gx = gl_GlobalInvocationID.x; 827 | const uint lx = gl_LocalInvocationID.x; 828 | 829 | #if ncnn_VK_KHR_cooperative_matrix 830 | coopmat a = coopmat(float(gx)); 831 | coopmat b = coopmat(float(lx)); 832 | 833 | coopmat c = coopmat(float(gx)); 834 | 835 | for (int i = 0; i < loop; i++) 836 | {)" 837 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 838 | R"(} 839 | 840 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 841 | #elif ncnn_VK_NV_cooperative_matrix 842 | fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx)); 843 | fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx)); 844 | 845 | fcoopmatNV<32, SCOPE, M, N> c = fcoopmatNV<32, SCOPE, M, N>(float(gx)); 846 | 847 | for (int i = 0; i < loop; i++) 848 | {)" 849 | REPEAT_16(c = coopMatMulAddNV(a, b, c);) 850 | R"(} 851 | 852 | coopMatStoreNV(c, c_blob_data, gx * (M * N), N, false); 853 | #endif 854 | } 855 | )"; 856 | 857 | /* Dual-chain variant of the fp16-input / fp32-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp16_fp32_matrix_dual_data[] = R"( 858 | #version 450 859 | 860 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 861 | #extension GL_KHR_memory_scope_semantics: require 862 | #extension GL_EXT_shader_explicit_arithmetic_types: require 863 | #if ncnn_VK_KHR_cooperative_matrix 864 | #extension GL_KHR_cooperative_matrix: require 865 | #elif
ncnn_VK_NV_cooperative_matrix 866 | #extension GL_NV_cooperative_matrix: require 867 | #endif 868 | 869 | layout (constant_id = 0) const int loop = 1; 870 | layout (constant_id = 1) const int M = 1; 871 | layout (constant_id = 2) const int N = 1; 872 | layout (constant_id = 3) const int K = 1; 873 | layout (constant_id = 4) const int SCOPE = 3; 874 | 875 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 876 | 877 | void main() 878 | { 879 | const uint gx = gl_GlobalInvocationID.x; 880 | const uint lx = gl_LocalInvocationID.x; 881 | 882 | #if ncnn_VK_KHR_cooperative_matrix 883 | coopmat a = coopmat(float(gx)); 884 | coopmat b = coopmat(float(lx)); 885 | 886 | coopmat c0 = coopmat(float(gx)); 887 | coopmat c1 = coopmat(float(lx)); 888 | 889 | for (int i = 0; i < loop; i++) 890 | {)" 891 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 892 | R"(} 893 | 894 | c0 = c0 + c1; 895 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 896 | #elif ncnn_VK_NV_cooperative_matrix 897 | fcoopmatNV<16, SCOPE, M, K> a = fcoopmatNV<16, SCOPE, M, K>(float(gx)); 898 | fcoopmatNV<16, SCOPE, K, N> b = fcoopmatNV<16, SCOPE, K, N>(float(lx)); 899 | 900 | fcoopmatNV<32, SCOPE, M, N> c0 = fcoopmatNV<32, SCOPE, M, N>(float(gx)); 901 | fcoopmatNV<32, SCOPE, M, N> c1 = fcoopmatNV<32, SCOPE, M, N>(float(lx)); 902 | 903 | for (int i = 0; i < loop; i++) 904 | {)" 905 | REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);) 906 | R"(} 907 | 908 | c0 = c0 + c1; 909 | coopMatStoreNV(c0, c_blob_data, gx * (M * N), N, false); 910 | #endif 911 | } 912 | )"; 913 | 914 | /* GLSL compute shader source, int8 cooperative-matrix multiply-add with a 32-bit integer accumulator (NV path: 8-bit icoopmatNV a and b, 32-bit icoopmatNV c, enabled through GL_NV_integer_cooperative_matrix): REPEAT_16 chains the multiply-add; the result is stored into the int buffer with offset gx * (M * N) and stride N. */ static const char glsl_int8_matrix_data[] = R"( 915 | #version 450 916 | 917 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 918 | #extension GL_KHR_memory_scope_semantics: require 919 | #extension GL_EXT_shader_explicit_arithmetic_types: require 920 | #if ncnn_VK_KHR_cooperative_matrix 921 | #extension
GL_KHR_cooperative_matrix: require 922 | #elif ncnn_VK_NV_cooperative_matrix 923 | #extension GL_NV_integer_cooperative_matrix : require 924 | #endif 925 | 926 | layout (constant_id = 0) const int loop = 1; 927 | layout (constant_id = 1) const int M = 1; 928 | layout (constant_id = 2) const int N = 1; 929 | layout (constant_id = 3) const int K = 1; 930 | layout (constant_id = 4) const int SCOPE = 3; 931 | 932 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 933 | 934 | void main() 935 | { 936 | const uint gx = gl_GlobalInvocationID.x; 937 | const uint lx = gl_LocalInvocationID.x; 938 | 939 | #if ncnn_VK_KHR_cooperative_matrix 940 | coopmat a = coopmat(int8_t(gx)); 941 | coopmat b = coopmat(int8_t(lx)); 942 | 943 | coopmat c = coopmat(int(gx)); 944 | 945 | for (int i = 0; i < loop; i++) 946 | {)" 947 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 948 | R"(} 949 | 950 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 951 | #elif ncnn_VK_NV_cooperative_matrix 952 | icoopmatNV<8, SCOPE, M, K> a = icoopmatNV<8, SCOPE, M, K>(int8_t(gx)); 953 | icoopmatNV<8, SCOPE, K, N> b = icoopmatNV<8, SCOPE, K, N>(int8_t(lx)); 954 | 955 | icoopmatNV<32, SCOPE, M, N> c = icoopmatNV<32, SCOPE, M, N>(int(gx)); 956 | 957 | for (int i = 0; i < loop; i++) 958 | {)" 959 | REPEAT_16(c = coopMatMulAddNV(a, b, c);) 960 | R"(} 961 | 962 | coopMatStoreNV(c, c_blob_data, gx * (M * N), N, false); 963 | #endif 964 | } 965 | )"; 966 | 967 | /* Dual-chain variant of the int8 cooperative-matrix shader: REPEAT_8 interleaves two 32-bit integer accumulators (c0, c1) over the same a and b; c0 + c1 is stored into the int buffer with offset gx * (M * N) and stride N. */ static const char glsl_int8_matrix_dual_data[] = R"( 968 | #version 450 969 | 970 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 971 | #extension GL_KHR_memory_scope_semantics: require 972 | #extension GL_EXT_shader_explicit_arithmetic_types: require 973 | #if ncnn_VK_KHR_cooperative_matrix 974 | #extension GL_KHR_cooperative_matrix: require 975 | #elif ncnn_VK_NV_cooperative_matrix 976 | #extension GL_NV_integer_cooperative_matrix : require 977 | #endif 978 | 979 | layout (constant_id = 0)
const int loop = 1; 980 | layout (constant_id = 1) const int M = 1; 981 | layout (constant_id = 2) const int N = 1; 982 | layout (constant_id = 3) const int K = 1; 983 | layout (constant_id = 4) const int SCOPE = 3; 984 | 985 | layout (binding = 0) writeonly buffer c_blob { int c_blob_data[]; }; 986 | 987 | void main() 988 | { 989 | const uint gx = gl_GlobalInvocationID.x; 990 | const uint lx = gl_LocalInvocationID.x; 991 | 992 | #if ncnn_VK_KHR_cooperative_matrix 993 | coopmat a = coopmat(int8_t(gx)); 994 | coopmat b = coopmat(int8_t(lx)); 995 | 996 | coopmat c0 = coopmat(int(gx)); 997 | coopmat c1 = coopmat(int(lx)); 998 | 999 | for (int i = 0; i < loop; i++) 1000 | {)" 1001 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1002 | R"(} 1003 | 1004 | c0 = c0 + c1; 1005 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1006 | #elif ncnn_VK_NV_cooperative_matrix 1007 | icoopmatNV<8, SCOPE, M, K> a = icoopmatNV<8, SCOPE, M, K>(int8_t(gx)); 1008 | icoopmatNV<8, SCOPE, K, N> b = icoopmatNV<8, SCOPE, K, N>(int8_t(lx)); 1009 | 1010 | icoopmatNV<32, SCOPE, M, N> c0 = icoopmatNV<32, SCOPE, M, N>(int(gx)); 1011 | icoopmatNV<32, SCOPE, M, N> c1 = icoopmatNV<32, SCOPE, M, N>(int(lx)); 1012 | 1013 | for (int i = 0; i < loop; i++) 1014 | {)" 1015 | REPEAT_8(c0 = coopMatMulAddNV(a, b, c0); c1 = coopMatMulAddNV(a, b, c1);) 1016 | R"(} 1017 | 1018 | c0 = c0 + c1; 1019 | coopMatStoreNV(c0, c_blob_data, gx * (M * N), N, false); 1020 | #endif 1021 | } 1022 | )"; 1023 | 1024 | /* GLSL compute shader source, bfloat16 cooperative-matrix multiply-add (KHR path only; requires GL_KHR_cooperative_matrix and GL_EXT_bfloat16): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) / 2 and stride N / 2. NOTE(review): the coopmat template arguments are not visible in this view, so element types cannot be confirmed here. */ static const char glsl_bf16_matrix_data[] = R"( 1025 | #version 450 1026 | 1027 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1028 | #extension GL_KHR_memory_scope_semantics: require 1029 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1030 | #extension GL_KHR_cooperative_matrix: require 1031 | #extension GL_EXT_bfloat16: require 1032 | 1033 | layout (constant_id = 0) const int loop = 1; 1034 | layout (constant_id =
1) const int M = 1; 1035 | layout (constant_id = 2) const int N = 1; 1036 | layout (constant_id = 3) const int K = 1; 1037 | layout (constant_id = 4) const int SCOPE = 3; 1038 | 1039 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1040 | 1041 | void main() 1042 | { 1043 | const uint gx = gl_GlobalInvocationID.x; 1044 | const uint lx = gl_LocalInvocationID.x; 1045 | 1046 | coopmat a = coopmat(float(gx)); 1047 | coopmat b = coopmat(float(lx)); 1048 | 1049 | coopmat c = coopmat(float(gx)); 1050 | 1051 | for (int i = 0; i < loop; i++) 1052 | {)" 1053 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1054 | R"(} 1055 | 1056 | coopMatStore(c, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor); 1057 | } 1058 | )"; 1059 | 1060 | /* Dual-chain variant of the bfloat16 cooperative-matrix shader: REPEAT_8 interleaves accumulators c0 and c1; afterwards both are converted into further coopmat values (c2, c3), added, converted back into c0 and stored with offset gx * (M * N) / 2 and stride N / 2. NOTE(review): the coopmat template arguments are not visible in this view, so the intermediate type of c2 / c3 cannot be confirmed here. */ static const char glsl_bf16_matrix_dual_data[] = R"( 1061 | #version 450 1062 | 1063 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1064 | #extension GL_KHR_memory_scope_semantics: require 1065 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1066 | #extension GL_KHR_cooperative_matrix: require 1067 | #extension GL_EXT_bfloat16: require 1068 | 1069 | layout (constant_id = 0) const int loop = 1; 1070 | layout (constant_id = 1) const int M = 1; 1071 | layout (constant_id = 2) const int N = 1; 1072 | layout (constant_id = 3) const int K = 1; 1073 | layout (constant_id = 4) const int SCOPE = 3; 1074 | 1075 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1076 | 1077 | void main() 1078 | { 1079 | const uint gx = gl_GlobalInvocationID.x; 1080 | const uint lx = gl_LocalInvocationID.x; 1081 | 1082 | coopmat a = coopmat(float(gx)); 1083 | coopmat b = coopmat(float(lx)); 1084 | 1085 | coopmat c0 = coopmat(float(gx)); 1086 | coopmat c1 = coopmat(float(lx)); 1087 | 1088 | for (int i = 0; i < loop; i++) 1089 | {)" 1090 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1091 | R"(} 1092 | 1093 | coopmat c2 = coopmat(c0); 1094 | coopmat c3 =
coopmat(c1); 1095 | 1096 | c0 = coopmat(c2 + c3); 1097 | coopMatStore(c0, c_blob_data, gx * (M * N) / 2, N / 2, gl_CooperativeMatrixLayoutRowMajor); 1098 | } 1099 | )"; 1100 | 1101 | /* GLSL compute shader source, bfloat16 cooperative-matrix multiply-add (requires GL_KHR_cooperative_matrix and GL_EXT_bfloat16), the fp32-accumulator variant going by its name (coopmat template arguments are not visible in this view): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) and stride N. */ static const char glsl_bf16_fp32_matrix_data[] = R"( 1102 | #version 450 1103 | 1104 | #extension GL_KHR_memory_scope_semantics: require 1105 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1106 | #extension GL_KHR_cooperative_matrix: require 1107 | #extension GL_EXT_bfloat16: require 1108 | 1109 | layout (constant_id = 0) const int loop = 1; 1110 | layout (constant_id = 1) const int M = 1; 1111 | layout (constant_id = 2) const int N = 1; 1112 | layout (constant_id = 3) const int K = 1; 1113 | layout (constant_id = 4) const int SCOPE = 3; 1114 | 1115 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1116 | 1117 | void main() 1118 | { 1119 | const uint gx = gl_GlobalInvocationID.x; 1120 | const uint lx = gl_LocalInvocationID.x; 1121 | 1122 | coopmat a = coopmat(float(gx)); 1123 | coopmat b = coopmat(float(lx)); 1124 | 1125 | coopmat c = coopmat(float(gx)); 1126 | 1127 | for (int i = 0; i < loop; i++) 1128 | {)" 1129 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1130 | R"(} 1131 | 1132 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1133 | } 1134 | )"; 1135 | 1136 | /* Dual-chain variant of the bfloat16 / fp32-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_bf16_fp32_matrix_dual_data[] = R"( 1137 | #version 450 1138 | 1139 | #extension GL_KHR_memory_scope_semantics: require 1140 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1141 | #extension GL_KHR_cooperative_matrix: require 1142 | #extension GL_EXT_bfloat16: require 1143 | 1144 | layout (constant_id = 0) const int loop = 1; 1145 | layout (constant_id = 1) const int M = 1; 1146 | layout (constant_id = 2) const int N = 1; 1147 | layout (constant_id = 3) const int K = 1; 1148 | layout (constant_id = 4) const int SCOPE = 3; 1149 | 1150 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1151 | 1152 |
void main() 1153 | { 1154 | const uint gx = gl_GlobalInvocationID.x; 1155 | const uint lx = gl_LocalInvocationID.x; 1156 | 1157 | coopmat a = coopmat(float(gx)); 1158 | coopmat b = coopmat(float(lx)); 1159 | 1160 | coopmat c0 = coopmat(float(gx)); 1161 | coopmat c1 = coopmat(float(lx)); 1162 | 1163 | for (int i = 0; i < loop; i++) 1164 | {)" 1165 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1166 | R"(} 1167 | 1168 | c0 = c0 + c1; 1169 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1170 | } 1171 | )"; 1172 | 1173 | /* GLSL compute shader source, fp8 cooperative-matrix multiply-add (requires GL_KHR_cooperative_matrix and GL_EXT_float_e4m3), the fp16-accumulator variant going by its name (coopmat template arguments are not visible in this view): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp8_fp16_matrix_data[] = R"( 1174 | #version 450 1175 | 1176 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1177 | #extension GL_KHR_memory_scope_semantics: require 1178 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1179 | #extension GL_KHR_cooperative_matrix: require 1180 | #extension GL_EXT_float_e4m3: require 1181 | 1182 | layout (constant_id = 0) const int loop = 1; 1183 | layout (constant_id = 1) const int M = 1; 1184 | layout (constant_id = 2) const int N = 1; 1185 | layout (constant_id = 3) const int K = 1; 1186 | layout (constant_id = 4) const int SCOPE = 3; 1187 | 1188 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1189 | 1190 | void main() 1191 | { 1192 | const uint gx = gl_GlobalInvocationID.x; 1193 | const uint lx = gl_LocalInvocationID.x; 1194 | 1195 | coopmat a = coopmat(float(gx)); 1196 | coopmat b = coopmat(float(lx)); 1197 | 1198 | coopmat c = coopmat(float(gx)); 1199 | 1200 | for (int i = 0; i < loop; i++) 1201 | {)" 1202 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1203 | R"(} 1204 | 1205 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1206 | } 1207 | )"; 1208 | 1209 | /* Dual-chain variant of the fp8 (GL_EXT_float_e4m3) / fp16-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp8_fp16_matrix_dual_data[] = R"( 1210 | #version 450 1211 | 1212 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1213 | #extension
GL_KHR_memory_scope_semantics: require 1214 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1215 | #extension GL_KHR_cooperative_matrix: require 1216 | #extension GL_EXT_float_e4m3: require 1217 | 1218 | layout (constant_id = 0) const int loop = 1; 1219 | layout (constant_id = 1) const int M = 1; 1220 | layout (constant_id = 2) const int N = 1; 1221 | layout (constant_id = 3) const int K = 1; 1222 | layout (constant_id = 4) const int SCOPE = 3; 1223 | 1224 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1225 | 1226 | void main() 1227 | { 1228 | const uint gx = gl_GlobalInvocationID.x; 1229 | const uint lx = gl_LocalInvocationID.x; 1230 | 1231 | coopmat a = coopmat(float(gx)); 1232 | coopmat b = coopmat(float(lx)); 1233 | 1234 | coopmat c0 = coopmat(float(gx)); 1235 | coopmat c1 = coopmat(float(lx)); 1236 | 1237 | for (int i = 0; i < loop; i++) 1238 | {)" 1239 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1240 | R"(} 1241 | 1242 | c0 = c0 + c1; 1243 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1244 | } 1245 | )"; 1246 | 1247 | /* GLSL compute shader source, fp8 cooperative-matrix multiply-add (requires GL_KHR_cooperative_matrix and GL_EXT_float_e4m3), the fp32-accumulator variant going by its name (coopmat template arguments are not visible in this view): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp8_fp32_matrix_data[] = R"( 1248 | #version 450 1249 | 1250 | #extension GL_KHR_memory_scope_semantics: require 1251 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1252 | #extension GL_KHR_cooperative_matrix: require 1253 | #extension GL_EXT_float_e4m3: require 1254 | 1255 | layout (constant_id = 0) const int loop = 1; 1256 | layout (constant_id = 1) const int M = 1; 1257 | layout (constant_id = 2) const int N = 1; 1258 | layout (constant_id = 3) const int K = 1; 1259 | layout (constant_id = 4) const int SCOPE = 3; 1260 | 1261 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1262 | 1263 | void main() 1264 | { 1265 | const uint gx = gl_GlobalInvocationID.x; 1266 | const uint lx = gl_LocalInvocationID.x; 1267 | 1268 | coopmat a = coopmat(float(gx)); 1269 | coopmat b =
coopmat(float(lx)); 1270 | 1271 | coopmat c = coopmat(float(gx)); 1272 | 1273 | for (int i = 0; i < loop; i++) 1274 | {)" 1275 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1276 | R"(} 1277 | 1278 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1279 | } 1280 | )"; 1281 | 1282 | /* Dual-chain variant of the fp8 (GL_EXT_float_e4m3) / fp32-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_fp8_fp32_matrix_dual_data[] = R"( 1283 | #version 450 1284 | 1285 | #extension GL_KHR_memory_scope_semantics: require 1286 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1287 | #extension GL_KHR_cooperative_matrix: require 1288 | #extension GL_EXT_float_e4m3: require 1289 | 1290 | layout (constant_id = 0) const int loop = 1; 1291 | layout (constant_id = 1) const int M = 1; 1292 | layout (constant_id = 2) const int N = 1; 1293 | layout (constant_id = 3) const int K = 1; 1294 | layout (constant_id = 4) const int SCOPE = 3; 1295 | 1296 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1297 | 1298 | void main() 1299 | { 1300 | const uint gx = gl_GlobalInvocationID.x; 1301 | const uint lx = gl_LocalInvocationID.x; 1302 | 1303 | coopmat a = coopmat(float(gx)); 1304 | coopmat b = coopmat(float(lx)); 1305 | 1306 | coopmat c0 = coopmat(float(gx)); 1307 | coopmat c1 = coopmat(float(lx)); 1308 | 1309 | for (int i = 0; i < loop; i++) 1310 | {)" 1311 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1312 | R"(} 1313 | 1314 | c0 = c0 + c1; 1315 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1316 | } 1317 | )"; 1318 | 1319 | /* GLSL compute shader source, bf8 cooperative-matrix multiply-add (requires GL_KHR_cooperative_matrix and GL_EXT_float_e5m2), the fp16-accumulator variant going by its name (coopmat template arguments are not visible in this view): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) and stride N. */ static const char glsl_bf8_fp16_matrix_data[] = R"( 1320 | #version 450 1321 | 1322 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1323 | #extension GL_KHR_memory_scope_semantics: require 1324 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1325 | #extension GL_KHR_cooperative_matrix: require 1326 | #extension GL_EXT_float_e5m2: require 1327 | 1328 | layout (constant_id = 0) const int loop = 1;
1329 | layout (constant_id = 1) const int M = 1; 1330 | layout (constant_id = 2) const int N = 1; 1331 | layout (constant_id = 3) const int K = 1; 1332 | layout (constant_id = 4) const int SCOPE = 3; 1333 | 1334 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1335 | 1336 | void main() 1337 | { 1338 | const uint gx = gl_GlobalInvocationID.x; 1339 | const uint lx = gl_LocalInvocationID.x; 1340 | 1341 | coopmat a = coopmat(float(gx)); 1342 | coopmat b = coopmat(float(lx)); 1343 | 1344 | coopmat c = coopmat(float(gx)); 1345 | 1346 | for (int i = 0; i < loop; i++) 1347 | {)" 1348 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1349 | R"(} 1350 | 1351 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1352 | } 1353 | )"; 1354 | 1355 | /* Dual-chain variant of the bf8 (GL_EXT_float_e5m2) / fp16-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_bf8_fp16_matrix_dual_data[] = R"( 1356 | #version 450 1357 | 1358 | #extension GL_EXT_shader_explicit_arithmetic_types_float16: require 1359 | #extension GL_KHR_memory_scope_semantics: require 1360 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1361 | #extension GL_KHR_cooperative_matrix: require 1362 | #extension GL_EXT_float_e5m2: require 1363 | 1364 | layout (constant_id = 0) const int loop = 1; 1365 | layout (constant_id = 1) const int M = 1; 1366 | layout (constant_id = 2) const int N = 1; 1367 | layout (constant_id = 3) const int K = 1; 1368 | layout (constant_id = 4) const int SCOPE = 3; 1369 | 1370 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1371 | 1372 | void main() 1373 | { 1374 | const uint gx = gl_GlobalInvocationID.x; 1375 | const uint lx = gl_LocalInvocationID.x; 1376 | 1377 | coopmat a = coopmat(float(gx)); 1378 | coopmat b = coopmat(float(lx)); 1379 | 1380 | coopmat c0 = coopmat(float(gx)); 1381 | coopmat c1 = coopmat(float(lx)); 1382 | 1383 | for (int i = 0; i < loop; i++) 1384 | {)" 1385 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1386 | R"(} 1387 | 1388 | c0 = c0 + c1; 1389 |
coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1390 | } 1391 | )"; 1392 | 1393 | /* GLSL compute shader source, bf8 cooperative-matrix multiply-add (requires GL_KHR_cooperative_matrix and GL_EXT_float_e5m2), the fp32-accumulator variant going by its name (coopmat template arguments are not visible in this view): REPEAT_16 chains c = coopMatMulAdd(a, b, c); the result is stored with offset gx * (M * N) and stride N. The extension list deliberately matches the dual-chain variant and the other *_fp32 shaders: no float16 arithmetic-types extension is requested. */ static const char glsl_bf8_fp32_matrix_data[] = R"( 1394 | #version 450 1395 | 1396 | 1397 | #extension GL_KHR_memory_scope_semantics: require 1398 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1399 | #extension GL_KHR_cooperative_matrix: require 1400 | #extension GL_EXT_float_e5m2: require 1401 | 1402 | layout (constant_id = 0) const int loop = 1; 1403 | layout (constant_id = 1) const int M = 1; 1404 | layout (constant_id = 2) const int N = 1; 1405 | layout (constant_id = 3) const int K = 1; 1406 | layout (constant_id = 4) const int SCOPE = 3; 1407 | 1408 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1409 | 1410 | void main() 1411 | { 1412 | const uint gx = gl_GlobalInvocationID.x; 1413 | const uint lx = gl_LocalInvocationID.x; 1414 | 1415 | coopmat a = coopmat(float(gx)); 1416 | coopmat b = coopmat(float(lx)); 1417 | 1418 | coopmat c = coopmat(float(gx)); 1419 | 1420 | for (int i = 0; i < loop; i++) 1421 | {)" 1422 | REPEAT_16(c = coopMatMulAdd(a, b, c);) 1423 | R"(} 1424 | 1425 | coopMatStore(c, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1426 | } 1427 | )"; 1428 | 1429 | /* Dual-chain variant of the bf8 (GL_EXT_float_e5m2) / fp32-accumulator cooperative-matrix shader: REPEAT_8 interleaves two accumulators (c0, c1) over the same a and b; c0 + c1 is stored with offset gx * (M * N) and stride N. */ static const char glsl_bf8_fp32_matrix_dual_data[] = R"( 1430 | #version 450 1431 | 1432 | #extension GL_KHR_memory_scope_semantics: require 1433 | #extension GL_EXT_shader_explicit_arithmetic_types: require 1434 | #extension GL_KHR_cooperative_matrix: require 1435 | #extension GL_EXT_float_e5m2: require 1436 | 1437 | layout (constant_id = 0) const int loop = 1; 1438 | layout (constant_id = 1) const int M = 1; 1439 | layout (constant_id = 2) const int N = 1; 1440 | layout (constant_id = 3) const int K = 1; 1441 | layout (constant_id = 4) const int SCOPE = 3; 1442 | 1443 | layout (binding = 0) writeonly buffer c_blob { float c_blob_data[]; }; 1444
| 1445 | void main() 1446 | { 1447 | const uint gx = gl_GlobalInvocationID.x; 1448 | const uint lx = gl_LocalInvocationID.x; 1449 | 1450 | coopmat a = coopmat(float(gx)); 1451 | coopmat b = coopmat(float(lx)); 1452 | 1453 | coopmat c0 = coopmat(float(gx)); 1454 | coopmat c1 = coopmat(float(lx)); 1455 | 1456 | for (int i = 0; i < loop; i++) 1457 | {)" 1458 | REPEAT_8(c0 = coopMatMulAdd(a, b, c0); c1 = coopMatMulAdd(a, b, c1);) 1459 | R"(} 1460 | 1461 | c0 = c0 + c1; 1462 | coopMatStore(c0, c_blob_data, gx * (M * N), N, gl_CooperativeMatrixLayoutRowMajor); 1463 | } 1464 | )"; 1465 | 1466 | static double vkpeak(int device_id, int storage_type, int arithmetic_type, int packing_type) 1467 | { 1468 | ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(device_id); 1469 | 1470 | if (!vkdev) 1471 | { 1472 | return 0; 1473 | } 1474 | 1475 | if (!vkdev->info.support_fp16_storage() && storage_type == 1) 1476 | { 1477 | return 0; 1478 | } 1479 | if (!vkdev->info.support_fp16_storage() && storage_type == 4) 1480 | { 1481 | return 0; 1482 | } 1483 | if (!vkdev->info.support_fp16_arithmetic() && arithmetic_type == 1) 1484 | { 1485 | return 0; 1486 | } 1487 | if (!vkdev->info.support_fp16_arithmetic() && arithmetic_type == 4) 1488 | { 1489 | return 0; 1490 | } 1491 | if (!vkdev->info.support_int8_arithmetic() && arithmetic_type == 6) 1492 | { 1493 | return 0; 1494 | } 1495 | if (!vkdev->info.support_cooperative_matrix() && packing_type == 256) 1496 | { 1497 | return 0; 1498 | } 1499 | 1500 | // check shader fp64 feature 1501 | bool has_shader_fp64 = vkdev->info.physicalDevicefeatures().shaderFloat64; 1502 | if (!has_shader_fp64 && (storage_type == 2 || arithmetic_type == 2)) 1503 | { 1504 | return 0; 1505 | } 1506 | 1507 | // check shader int64 feature 1508 | bool has_shader_int64 = vkdev->info.physicalDevicefeatures().shaderInt64; 1509 | if (!has_shader_int64 && (storage_type == 5 || arithmetic_type == 5)) 1510 | { 1511 | return 0; 1512 | } 1513 | 1514 | // check shader int8 dotprod 
feature 1515 | bool has_shader_int8_dotprod = vkdev->info.queryShaderIntegerDotProductFeatures().shaderIntegerDotProduct; 1516 | if (!has_shader_int8_dotprod && (arithmetic_type == 6 && packing_type == 4)) 1517 | { 1518 | return 0; 1519 | } 1520 | 1521 | // check shader bf16 feature 1522 | bool has_shader_bf16 = vkdev->info.queryShaderBfloat16Features().shaderBFloat16Type; 1523 | if (!has_shader_bf16 && (arithmetic_type == 7)) 1524 | { 1525 | return 0; 1526 | } 1527 | 1528 | // check shader bf16 dotprod feature 1529 | bool has_shader_bf16_dotprod = vkdev->info.queryShaderBfloat16Features().shaderBFloat16DotProduct; 1530 | if (!has_shader_bf16_dotprod && (arithmetic_type == 7 && packing_type == 4)) 1531 | { 1532 | return 0; 1533 | } 1534 | 1535 | // check shader bf16 cooperative matrix feature 1536 | bool has_shader_bf16_matrix = vkdev->info.queryShaderBfloat16Features().shaderBFloat16CooperativeMatrix; 1537 | if (!has_shader_bf16_matrix && (arithmetic_type == 7 && packing_type == 256)) 1538 | { 1539 | return 0; 1540 | } 1541 | 1542 | // check shader fp8 feature 1543 | bool has_shader_fp8 = vkdev->info.queryShaderFloat8Features().shaderFloat8; 1544 | if (!has_shader_fp8 && (arithmetic_type == 8 || arithmetic_type == 9)) 1545 | { 1546 | return 0; 1547 | } 1548 | 1549 | // check shader fp8 cooperative matrix feature 1550 | bool has_shader_fp8_matrix = vkdev->info.queryShaderFloat8Features().shaderFloat8CooperativeMatrix; 1551 | if (!has_shader_fp8_matrix && ((arithmetic_type == 8 || arithmetic_type == 9) && packing_type == 256)) 1552 | { 1553 | return 0; 1554 | } 1555 | 1556 | ncnn::Option opt; 1557 | opt.use_vulkan_compute = true; 1558 | opt.use_fp16_packed = storage_type == 1; 1559 | opt.use_fp16_storage = storage_type == 1 || storage_type == 4; 1560 | opt.use_fp16_arithmetic = arithmetic_type == 1; 1561 | 1562 | ncnn::VkAllocator* allocator = vkdev->acquire_blob_allocator(); 1563 | 1564 | // reuse c storage, max 512M 1565 | int buffer_size = 
std::min((int)(vkdev->get_heap_budget() / 8), 512) * 1024 * 1024; 1566 | if (vkdev->info.type() == 1) 1567 | { 1568 | // max 128M for integrated gpu 1569 | buffer_size = std::min(buffer_size, 128 * 1024 * 1024); 1570 | } 1571 | ncnn::VkMat c(buffer_size, (size_t)1u, 1, allocator); 1572 | 1573 | int elemsize; 1574 | if (storage_type == 0 || storage_type == 3) 1575 | { 1576 | // fp32 / int32 1577 | elemsize = 4; 1578 | } 1579 | else if (storage_type == 1 || storage_type == 4) 1580 | { 1581 | // fp16 / int16 1582 | elemsize = 2; 1583 | } 1584 | else if (storage_type == 2 || storage_type == 5) 1585 | { 1586 | // fp64 / int64 1587 | elemsize = 8; 1588 | } 1589 | else if (storage_type == 6) 1590 | { 1591 | // int8 1592 | elemsize = 1; 1593 | } 1594 | 1595 | int local_size_x = std::min(128, std::max(1, (int)vkdev->info.subgroup_size())); 1596 | if (packing_type == 256) 1597 | { 1598 | // matrix on subgroup 1599 | local_size_x = (int)vkdev->info.subgroup_size(); 1600 | } 1601 | 1602 | int M = 1; 1603 | int N = 1; 1604 | int K = 1; 1605 | // VK_SCOPE_WORKGROUP_KHR = gl_ScopeWorkgroup = 2 1606 | // VK_SCOPE_SUBGROUP_KHR = gl_ScopeSubgroup = 3 1607 | int SCOPE = 3; 1608 | bool use_fp16_fp32_matrix = false; 1609 | bool use_bf16_fp32_matrix = false; 1610 | bool use_fp8_fp32_matrix = false; 1611 | if (packing_type == 256) 1612 | { 1613 | bool mnk_found = false; 1614 | 1615 | if (arithmetic_type == 1) 1616 | { 1617 | if (vkdev->info.support_VK_KHR_cooperative_matrix()) 1618 | { 1619 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); 1620 | 1621 | { 1622 | // find fp16 * fp16 => fp16 1623 | for (uint32_t j = 0; j < properties.size(); j++) 1624 | { 1625 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1626 | 1627 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR 1628 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR 1629 | && (cmp.scope == 
VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1630 | { 1631 | M = cmp.MSize; 1632 | N = cmp.NSize; 1633 | K = cmp.KSize; 1634 | SCOPE = (int)cmp.scope; 1635 | mnk_found = true; 1636 | break; 1637 | } 1638 | } 1639 | } 1640 | 1641 | if (!mnk_found) 1642 | { 1643 | // find fp16 * fp16 => fp32 1644 | for (uint32_t j = 0; j < properties.size(); j++) 1645 | { 1646 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1647 | 1648 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR 1649 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR 1650 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1651 | { 1652 | M = cmp.MSize; 1653 | N = cmp.NSize; 1654 | K = cmp.KSize; 1655 | SCOPE = (int)cmp.scope; 1656 | mnk_found = true; 1657 | use_fp16_fp32_matrix = true; 1658 | break; 1659 | } 1660 | } 1661 | } 1662 | } 1663 | else // if (vkdev->info.support_VK_NV_cooperative_matrix()) 1664 | { 1665 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV(); 1666 | 1667 | { 1668 | // find fp16 * fp16 => fp16 1669 | for (uint32_t j = 0; j < properties.size(); j++) 1670 | { 1671 | const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; 1672 | 1673 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV 1674 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT16_NV 1675 | && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV)) 1676 | { 1677 | M = cmp.MSize; 1678 | N = cmp.NSize; 1679 | K = cmp.KSize; 1680 | SCOPE = (int)cmp.scope; 1681 | mnk_found = true; 1682 | break; 1683 | } 1684 | } 1685 | } 1686 | 1687 | if (!mnk_found) 1688 | { 1689 | // find fp16 * fp16 => fp32 1690 | for (uint32_t j = 0; j < properties.size(); j++) 1691 | { 1692 | const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; 1693 | 1694 | if 
(cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV 1695 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV 1696 | && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV)) 1697 | { 1698 | M = cmp.MSize; 1699 | N = cmp.NSize; 1700 | K = cmp.KSize; 1701 | SCOPE = (int)cmp.scope; 1702 | mnk_found = true; 1703 | use_fp16_fp32_matrix = true; 1704 | break; 1705 | } 1706 | } 1707 | } 1708 | } 1709 | } 1710 | 1711 | if (arithmetic_type == 6) 1712 | { 1713 | if (vkdev->info.support_VK_KHR_cooperative_matrix()) 1714 | { 1715 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); 1716 | 1717 | // find int8 * int8 => int32 1718 | for (uint32_t j = 0; j < properties.size(); j++) 1719 | { 1720 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1721 | 1722 | if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR 1723 | && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR 1724 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1725 | { 1726 | M = cmp.MSize; 1727 | N = cmp.NSize; 1728 | K = cmp.KSize; 1729 | SCOPE = (int)cmp.scope; 1730 | mnk_found = true; 1731 | break; 1732 | } 1733 | } 1734 | } 1735 | else // if (vkdev->info.support_VK_NV_cooperative_matrix()) 1736 | { 1737 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV(); 1738 | 1739 | // find int8 * int8 => int32 1740 | for (uint32_t j = 0; j < properties.size(); j++) 1741 | { 1742 | const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; 1743 | 1744 | if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV 1745 | && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV 1746 | && (cmp.scope == VK_SCOPE_SUBGROUP_NV || cmp.scope == VK_SCOPE_WORKGROUP_NV)) 1747 | { 1748 | M = cmp.MSize; 
1749 | N = cmp.NSize; 1750 | K = cmp.KSize; 1751 | SCOPE = (int)cmp.scope; 1752 | mnk_found = true; 1753 | break; 1754 | } 1755 | } 1756 | } 1757 | } 1758 | 1759 | if (arithmetic_type == 7) 1760 | { 1761 | if (vkdev->info.support_VK_KHR_cooperative_matrix()) 1762 | { 1763 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); 1764 | 1765 | { 1766 | // find bf16 * bf16 => bf16 1767 | for (uint32_t j = 0; j < properties.size(); j++) 1768 | { 1769 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1770 | 1771 | if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR 1772 | && cmp.CType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_BFLOAT16_KHR 1773 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1774 | { 1775 | M = cmp.MSize; 1776 | N = cmp.NSize; 1777 | K = cmp.KSize; 1778 | SCOPE = (int)cmp.scope; 1779 | mnk_found = true; 1780 | break; 1781 | } 1782 | } 1783 | } 1784 | 1785 | if (!mnk_found) 1786 | { 1787 | // find bf16 * bf16 => fp32 1788 | for (uint32_t j = 0; j < properties.size(); j++) 1789 | { 1790 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1791 | 1792 | if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR 1793 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR 1794 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1795 | { 1796 | M = cmp.MSize; 1797 | N = cmp.NSize; 1798 | K = cmp.KSize; 1799 | SCOPE = (int)cmp.scope; 1800 | mnk_found = true; 1801 | use_bf16_fp32_matrix = true; 1802 | break; 1803 | } 1804 | } 1805 | } 1806 | } 1807 | } 1808 | 1809 | if (arithmetic_type == 8) 1810 | { 1811 | if (vkdev->info.support_VK_KHR_cooperative_matrix()) 1812 | { 1813 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); 1814 | 1815 | { 1816 | // find fp8 * fp8 
=> fp16 1817 | for (uint32_t j = 0; j < properties.size(); j++) 1818 | { 1819 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1820 | 1821 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT 1822 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR 1823 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1824 | { 1825 | M = cmp.MSize; 1826 | N = cmp.NSize; 1827 | K = cmp.KSize; 1828 | SCOPE = (int)cmp.scope; 1829 | mnk_found = true; 1830 | break; 1831 | } 1832 | } 1833 | } 1834 | 1835 | if (!mnk_found) 1836 | { 1837 | // find fp8 * fp8 => fp32 1838 | for (uint32_t j = 0; j < properties.size(); j++) 1839 | { 1840 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1841 | 1842 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT 1843 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR 1844 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1845 | { 1846 | M = cmp.MSize; 1847 | N = cmp.NSize; 1848 | K = cmp.KSize; 1849 | SCOPE = (int)cmp.scope; 1850 | mnk_found = true; 1851 | use_fp8_fp32_matrix = true; 1852 | break; 1853 | } 1854 | } 1855 | } 1856 | } 1857 | } 1858 | 1859 | if (arithmetic_type == 9) 1860 | { 1861 | if (vkdev->info.support_VK_KHR_cooperative_matrix()) 1862 | { 1863 | const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); 1864 | 1865 | { 1866 | // find bf8 * bf8 => fp16 1867 | for (uint32_t j = 0; j < properties.size(); j++) 1868 | { 1869 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1870 | 1871 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT 1872 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR 1873 | && (cmp.scope == 
VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1874 | { 1875 | M = cmp.MSize; 1876 | N = cmp.NSize; 1877 | K = cmp.KSize; 1878 | SCOPE = (int)cmp.scope; 1879 | mnk_found = true; 1880 | break; 1881 | } 1882 | } 1883 | } 1884 | 1885 | if (!mnk_found) 1886 | { 1887 | // find bf8 * bf8 => fp32 1888 | for (uint32_t j = 0; j < properties.size(); j++) 1889 | { 1890 | const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; 1891 | 1892 | if (cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT && cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT 1893 | && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR 1894 | && (cmp.scope == VK_SCOPE_SUBGROUP_KHR || cmp.scope == VK_SCOPE_WORKGROUP_KHR)) 1895 | { 1896 | M = cmp.MSize; 1897 | N = cmp.NSize; 1898 | K = cmp.KSize; 1899 | SCOPE = (int)cmp.scope; 1900 | mnk_found = true; 1901 | use_fp8_fp32_matrix = true; 1902 | break; 1903 | } 1904 | } 1905 | } 1906 | } 1907 | } 1908 | 1909 | if (!mnk_found) 1910 | { 1911 | // no supported component type 1912 | return 0; 1913 | } 1914 | } 1915 | 1916 | int max_invocation_count = buffer_size / elemsize; 1917 | // make max_invocation_count be multiple of local_size_x 1918 | max_invocation_count = std::max(max_invocation_count / local_size_x, 1) * local_size_x; 1919 | if (packing_type == 256) 1920 | { 1921 | if (use_fp16_fp32_matrix || use_bf16_fp32_matrix || use_fp8_fp32_matrix) 1922 | max_invocation_count = std::max(max_invocation_count / (M * N) / 2, 1); 1923 | else 1924 | max_invocation_count = std::max(max_invocation_count / (M * N), 1); 1925 | } 1926 | 1927 | double max_gflops = 0; 1928 | 1929 | // start with little works 1930 | int invocation_count = std::max(max_invocation_count / 32, 8); 1931 | int loop = 16; 1932 | 1933 | bool rerun = true; 1934 | 1935 | // prepare storage 1936 | while (rerun) 1937 | { 1938 | rerun = false; 1939 | 1940 | // setup pipeline 1941 | ncnn::Pipeline pipeline(vkdev); 1942 | ncnn::Pipeline 
pipeline_dual(vkdev); 1943 | { 1944 | pipeline.set_local_size_xyz(local_size_x, 1, 1); 1945 | pipeline_dual.set_local_size_xyz(local_size_x, 1, 1); 1946 | 1947 | std::vector specializations(1); 1948 | specializations[0].i = loop; 1949 | 1950 | // glsl to spirv 1951 | // -1 for omit the tail '\0' 1952 | std::vector spirv; 1953 | std::vector spirv_dual; 1954 | if (arithmetic_type == 2) 1955 | { 1956 | if (packing_type == 1) 1957 | { 1958 | ncnn::compile_spirv_module(glsl_fp64_p1_data, sizeof(glsl_fp64_p1_data) - 1, opt, spirv); 1959 | ncnn::compile_spirv_module(glsl_fp64_p1_dual_data, sizeof(glsl_fp64_p1_dual_data) - 1, opt, spirv_dual); 1960 | } 1961 | if (packing_type == 4) 1962 | { 1963 | ncnn::compile_spirv_module(glsl_fp64_p4_data, sizeof(glsl_fp64_p4_data) - 1, opt, spirv); 1964 | ncnn::compile_spirv_module(glsl_fp64_p4_dual_data, sizeof(glsl_fp64_p4_dual_data) - 1, opt, spirv_dual); 1965 | } 1966 | } 1967 | else if (arithmetic_type == 3) 1968 | { 1969 | if (packing_type == 1) 1970 | { 1971 | ncnn::compile_spirv_module(glsl_int32_p1_data, sizeof(glsl_int32_p1_data) - 1, opt, spirv); 1972 | ncnn::compile_spirv_module(glsl_int32_p1_dual_data, sizeof(glsl_int32_p1_dual_data) - 1, opt, spirv_dual); 1973 | } 1974 | if (packing_type == 4) 1975 | { 1976 | ncnn::compile_spirv_module(glsl_int32_p4_data, sizeof(glsl_int32_p4_data) - 1, opt, spirv); 1977 | ncnn::compile_spirv_module(glsl_int32_p4_dual_data, sizeof(glsl_int32_p4_dual_data) - 1, opt, spirv_dual); 1978 | } 1979 | } 1980 | else if (arithmetic_type == 4) 1981 | { 1982 | if (packing_type == 1) 1983 | { 1984 | ncnn::compile_spirv_module(glsl_int16_p1_data, sizeof(glsl_int16_p1_data) - 1, opt, spirv); 1985 | ncnn::compile_spirv_module(glsl_int16_p1_dual_data, sizeof(glsl_int16_p1_dual_data) - 1, opt, spirv_dual); 1986 | } 1987 | if (packing_type == 4) 1988 | { 1989 | ncnn::compile_spirv_module(glsl_int16_p4_data, sizeof(glsl_int16_p4_data) - 1, opt, spirv); 1990 | 
ncnn::compile_spirv_module(glsl_int16_p4_dual_data, sizeof(glsl_int16_p4_dual_data) - 1, opt, spirv_dual); 1991 | } 1992 | } 1993 | else if (arithmetic_type == 5) 1994 | { 1995 | if (packing_type == 1) 1996 | { 1997 | ncnn::compile_spirv_module(glsl_int64_p1_data, sizeof(glsl_int64_p1_data) - 1, opt, spirv); 1998 | ncnn::compile_spirv_module(glsl_int64_p1_dual_data, sizeof(glsl_int64_p1_dual_data) - 1, opt, spirv_dual); 1999 | } 2000 | if (packing_type == 4) 2001 | { 2002 | ncnn::compile_spirv_module(glsl_int64_p4_data, sizeof(glsl_int64_p4_data) - 1, opt, spirv); 2003 | ncnn::compile_spirv_module(glsl_int64_p4_dual_data, sizeof(glsl_int64_p4_dual_data) - 1, opt, spirv_dual); 2004 | } 2005 | } 2006 | else if (arithmetic_type == 6) 2007 | { 2008 | if (packing_type == 4) 2009 | { 2010 | ncnn::compile_spirv_module(glsl_int8_p4_data, sizeof(glsl_int8_p4_data) - 1, opt, spirv); 2011 | ncnn::compile_spirv_module(glsl_int8_p4_dual_data, sizeof(glsl_int8_p4_dual_data) - 1, opt, spirv_dual); 2012 | } 2013 | if (packing_type == 256) 2014 | { 2015 | // loop M N K SCOPE 2016 | specializations.resize(5); 2017 | specializations[1].i = M; 2018 | specializations[2].i = N; 2019 | specializations[3].i = K; 2020 | specializations[4].i = SCOPE; 2021 | 2022 | ncnn::compile_spirv_module(glsl_int8_matrix_data, sizeof(glsl_int8_matrix_data) - 1, opt, spirv); 2023 | ncnn::compile_spirv_module(glsl_int8_matrix_dual_data, sizeof(glsl_int8_matrix_dual_data) - 1, opt, spirv_dual); 2024 | } 2025 | } 2026 | else if (arithmetic_type == 7) 2027 | { 2028 | if (packing_type == 4) 2029 | { 2030 | ncnn::compile_spirv_module(glsl_bf16_p4_data, sizeof(glsl_bf16_p4_data) - 1, opt, spirv); 2031 | ncnn::compile_spirv_module(glsl_bf16_p4_dual_data, sizeof(glsl_bf16_p4_dual_data) - 1, opt, spirv_dual); 2032 | } 2033 | if (packing_type == 256) 2034 | { 2035 | // loop M N K SCOPE 2036 | specializations.resize(5); 2037 | specializations[1].i = M; 2038 | specializations[2].i = N; 2039 | specializations[3].i = K; 
2040 | specializations[4].i = SCOPE; 2041 | 2042 | if (use_bf16_fp32_matrix) 2043 | { 2044 | ncnn::compile_spirv_module(glsl_bf16_fp32_matrix_data, sizeof(glsl_bf16_fp32_matrix_data) - 1, opt, spirv); 2045 | ncnn::compile_spirv_module(glsl_bf16_fp32_matrix_dual_data, sizeof(glsl_bf16_fp32_matrix_dual_data) - 1, opt, spirv_dual); 2046 | } 2047 | else 2048 | { 2049 | ncnn::compile_spirv_module(glsl_bf16_matrix_data, sizeof(glsl_bf16_matrix_data) - 1, opt, spirv); 2050 | ncnn::compile_spirv_module(glsl_bf16_matrix_dual_data, sizeof(glsl_bf16_matrix_dual_data) - 1, opt, spirv_dual); 2051 | } 2052 | } 2053 | } 2054 | else if (arithmetic_type == 8) 2055 | { 2056 | if (packing_type == 256) 2057 | { 2058 | // loop M N K SCOPE 2059 | specializations.resize(5); 2060 | specializations[1].i = M; 2061 | specializations[2].i = N; 2062 | specializations[3].i = K; 2063 | specializations[4].i = SCOPE; 2064 | 2065 | if (use_fp8_fp32_matrix) 2066 | { 2067 | ncnn::compile_spirv_module(glsl_fp8_fp32_matrix_data, sizeof(glsl_fp8_fp32_matrix_data) - 1, opt, spirv); 2068 | ncnn::compile_spirv_module(glsl_fp8_fp32_matrix_dual_data, sizeof(glsl_fp8_fp32_matrix_dual_data) - 1, opt, spirv_dual); 2069 | } 2070 | else 2071 | { 2072 | ncnn::compile_spirv_module(glsl_fp8_fp16_matrix_data, sizeof(glsl_fp8_fp16_matrix_data) - 1, opt, spirv); 2073 | ncnn::compile_spirv_module(glsl_fp8_fp16_matrix_dual_data, sizeof(glsl_fp8_fp16_matrix_dual_data) - 1, opt, spirv_dual); 2074 | } 2075 | } 2076 | } 2077 | else if (arithmetic_type == 9) 2078 | { 2079 | if (packing_type == 256) 2080 | { 2081 | // loop M N K SCOPE 2082 | specializations.resize(5); 2083 | specializations[1].i = M; 2084 | specializations[2].i = N; 2085 | specializations[3].i = K; 2086 | specializations[4].i = SCOPE; 2087 | 2088 | if (use_fp8_fp32_matrix) 2089 | { 2090 | ncnn::compile_spirv_module(glsl_bf8_fp32_matrix_data, sizeof(glsl_bf8_fp32_matrix_data) - 1, opt, spirv); 2091 | ncnn::compile_spirv_module(glsl_bf8_fp32_matrix_dual_data, 
sizeof(glsl_bf8_fp32_matrix_dual_data) - 1, opt, spirv_dual); 2092 | } 2093 | else 2094 | { 2095 | ncnn::compile_spirv_module(glsl_bf8_fp16_matrix_data, sizeof(glsl_bf8_fp16_matrix_data) - 1, opt, spirv); 2096 | ncnn::compile_spirv_module(glsl_bf8_fp16_matrix_dual_data, sizeof(glsl_bf8_fp16_matrix_dual_data) - 1, opt, spirv_dual); 2097 | } 2098 | } 2099 | } 2100 | else // if (arithmetic_type == 0 || arithmetic_type == 1) 2101 | { 2102 | if (packing_type == 1) 2103 | { 2104 | ncnn::compile_spirv_module(glsl_p1_data, sizeof(glsl_p1_data) - 1, opt, spirv); 2105 | ncnn::compile_spirv_module(glsl_p1_dual_data, sizeof(glsl_p1_dual_data) - 1, opt, spirv_dual); 2106 | } 2107 | if (packing_type == 4) 2108 | { 2109 | ncnn::compile_spirv_module(glsl_p4_data, sizeof(glsl_p4_data) - 1, opt, spirv); 2110 | ncnn::compile_spirv_module(glsl_p4_dual_data, sizeof(glsl_p4_dual_data) - 1, opt, spirv_dual); 2111 | } 2112 | if (packing_type == 256) 2113 | { 2114 | // loop M N K SCOPE 2115 | specializations.resize(5); 2116 | specializations[1].i = M; 2117 | specializations[2].i = N; 2118 | specializations[3].i = K; 2119 | specializations[4].i = SCOPE; 2120 | 2121 | if (use_fp16_fp32_matrix) 2122 | { 2123 | ncnn::compile_spirv_module(glsl_fp16_fp32_matrix_data, sizeof(glsl_fp16_fp32_matrix_data) - 1, opt, spirv); 2124 | ncnn::compile_spirv_module(glsl_fp16_fp32_matrix_dual_data, sizeof(glsl_fp16_fp32_matrix_dual_data) - 1, opt, spirv_dual); 2125 | } 2126 | else 2127 | { 2128 | ncnn::compile_spirv_module(glsl_fp16_matrix_data, sizeof(glsl_fp16_matrix_data) - 1, opt, spirv); 2129 | ncnn::compile_spirv_module(glsl_fp16_matrix_dual_data, sizeof(glsl_fp16_matrix_dual_data) - 1, opt, spirv_dual); 2130 | } 2131 | } 2132 | } 2133 | 2134 | int ret0 = pipeline.create(spirv.data(), spirv.size() * 4, specializations); 2135 | int ret1 = pipeline_dual.create(spirv_dual.data(), spirv_dual.size() * 4, specializations); 2136 | if (ret0 != 0 || ret1 != 0) 2137 | { 2138 | 
vkdev->reclaim_blob_allocator(allocator); 2139 | return 0; 2140 | } 2141 | } 2142 | 2143 | const int cmd_loop = 6; 2144 | 2145 | for (int i = 0; i < cmd_loop; i++) 2146 | { 2147 | // encode command 2148 | ncnn::VkCompute cmd(vkdev); 2149 | ncnn::VkCompute cmd_dual(vkdev); 2150 | { 2151 | std::vector bindings(1); 2152 | bindings[0] = c; 2153 | 2154 | std::vector constants(0); 2155 | 2156 | ncnn::VkMat dispatcher; 2157 | dispatcher.w = invocation_count; 2158 | dispatcher.h = 1; 2159 | dispatcher.c = 1; 2160 | cmd.record_pipeline(&pipeline, bindings, constants, dispatcher); 2161 | cmd_dual.record_pipeline(&pipeline_dual, bindings, constants, dispatcher); 2162 | } 2163 | 2164 | // time this 2165 | { 2166 | double t0 = ncnn::get_current_time(); 2167 | 2168 | int ret = cmd.submit_and_wait(); 2169 | if (ret != 0) 2170 | { 2171 | vkdev->reclaim_blob_allocator(allocator); 2172 | return 0; 2173 | } 2174 | 2175 | double t1 = ncnn::get_current_time(); 2176 | 2177 | double time = t1 - t0; 2178 | 2179 | if (time < 300) 2180 | { 2181 | // for fast device 2182 | if (invocation_count * 2 <= max_invocation_count) 2183 | { 2184 | invocation_count = std::min(invocation_count * 2, max_invocation_count); 2185 | } 2186 | else 2187 | { 2188 | loop *= 2; 2189 | } 2190 | rerun = true; 2191 | break; 2192 | } 2193 | 2194 | t0 = ncnn::get_current_time(); 2195 | 2196 | ret = cmd_dual.submit_and_wait(); 2197 | if (ret != 0) 2198 | { 2199 | vkdev->reclaim_blob_allocator(allocator); 2200 | return 0; 2201 | } 2202 | 2203 | t1 = ncnn::get_current_time(); 2204 | 2205 | double time_dual = t1 - t0; 2206 | 2207 | if (time_dual < 300) 2208 | { 2209 | // for fast device 2210 | if (invocation_count * 2 <= max_invocation_count) 2211 | { 2212 | invocation_count = std::min(invocation_count * 2, max_invocation_count); 2213 | } 2214 | else 2215 | { 2216 | loop *= 2; 2217 | } 2218 | rerun = true; 2219 | break; 2220 | } 2221 | 2222 | double gflops; 2223 | { 2224 | double mac = (double)invocation_count * 
((double)loop * 16 * 2); 2225 | 2226 | if (packing_type == 256) 2227 | { 2228 | mac *= M * N * K; 2229 | mac /= local_size_x; 2230 | } 2231 | else 2232 | { 2233 | mac *= packing_type; 2234 | } 2235 | 2236 | gflops = mac / time / 1000000; 2237 | } 2238 | double gflops_dual; 2239 | { 2240 | // dual issue is faster 2241 | double mac = (double)invocation_count * ((double)loop * 16 * 2 + 1); // +1 for the tail c0+c1 2242 | 2243 | if (packing_type == 256) 2244 | { 2245 | mac *= M * N * K; 2246 | mac /= local_size_x; 2247 | } 2248 | else 2249 | { 2250 | mac *= packing_type; 2251 | } 2252 | 2253 | gflops_dual = mac / time_dual / 1000000; 2254 | } 2255 | 2256 | gflops = std::max(gflops, gflops_dual); 2257 | 2258 | // fprintf(stderr, "%f gflops\n", gflops); 2259 | 2260 | if (gflops > max_gflops) 2261 | max_gflops = gflops; 2262 | } 2263 | } 2264 | } 2265 | 2266 | vkdev->reclaim_blob_allocator(allocator); 2267 | 2268 | return max_gflops; 2269 | } 2270 | 2271 | static double vkpeak_copy(int device_id, int from_type, int to_type) 2272 | { 2273 | ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(device_id); 2274 | 2275 | if (!vkdev) 2276 | { 2277 | return 0; 2278 | } 2279 | 2280 | ncnn::Option opt; 2281 | opt.use_vulkan_compute = true; 2282 | opt.use_fp16_packed = false; 2283 | opt.use_fp16_storage = false; 2284 | 2285 | ncnn::VkAllocator* staging_allocator = vkdev->acquire_staging_allocator(); 2286 | ncnn::VkAllocator* allocator = vkdev->acquire_blob_allocator(); 2287 | 2288 | opt.blob_vkallocator = allocator; 2289 | opt.workspace_vkallocator = allocator; 2290 | opt.staging_vkallocator = staging_allocator; 2291 | 2292 | bool d2d = from_type == 1 && to_type == 1; 2293 | 2294 | // devbuf max 512M for host and 2G for d2d 2295 | size_t buffer_size = std::min((size_t)vkdev->get_heap_budget() / 8, d2d ? 
(size_t)2048 : (size_t)512) * 1024 * 1024; 2296 | if (vkdev->info.type() == 1) 2297 | { 2298 | // max 128M for integrated gpu 2299 | buffer_size = std::min(buffer_size, (size_t)128 * 1024 * 1024); 2300 | } 2301 | 2302 | double max_gbps = 0; 2303 | 2304 | if (from_type == 0 && to_type == 0) 2305 | { 2306 | ncnn::Mat a(1, buffer_size, 1); 2307 | ncnn::Mat b(1, buffer_size, 1); 2308 | 2309 | const int cmd_loop = 10; 2310 | 2311 | for (int i = 0; i < cmd_loop; i++) 2312 | { 2313 | // reset cache 2314 | memset(a, 0, buffer_size); 2315 | 2316 | ncnn::sleep(100); 2317 | 2318 | // time this 2319 | double t0 = ncnn::get_current_time(); 2320 | 2321 | memcpy(b, a, buffer_size); 2322 | 2323 | double t1 = ncnn::get_current_time(); 2324 | 2325 | double time = t1 - t0; 2326 | 2327 | double gbps = buffer_size / time / 1000000; 2328 | 2329 | // fprintf(stderr, "%f gbps\n", gbps); 2330 | 2331 | if (gbps > max_gbps) 2332 | max_gbps = gbps; 2333 | } 2334 | } 2335 | if (from_type == 0 && to_type == 1) 2336 | { 2337 | ncnn::VkMat devbuf(1, buffer_size, 1, staging_allocator); 2338 | ncnn::Mat hostbuf(1, buffer_size, 1); 2339 | 2340 | void* devptr = devbuf.mapped_ptr(); 2341 | void* hostptr = hostbuf.data; 2342 | 2343 | const int cmd_loop = 10; 2344 | 2345 | for (int i = 0; i < cmd_loop; i++) 2346 | { 2347 | // reset cache 2348 | memset(hostptr, 0, buffer_size); 2349 | staging_allocator->invalidate(devbuf.data); 2350 | 2351 | ncnn::sleep(100); 2352 | 2353 | // time this 2354 | double t0 = ncnn::get_current_time(); 2355 | 2356 | memcpy(devptr, hostptr, buffer_size); 2357 | 2358 | staging_allocator->flush(devbuf.data); 2359 | 2360 | double t1 = ncnn::get_current_time(); 2361 | 2362 | double time = t1 - t0; 2363 | 2364 | double gbps = buffer_size / time / 1000000; 2365 | 2366 | // fprintf(stderr, "%f gbps\n", gbps); 2367 | 2368 | if (gbps > max_gbps) 2369 | max_gbps = gbps; 2370 | } 2371 | } 2372 | if (from_type == 1 && to_type == 0) 2373 | { 2374 | ncnn::VkMat devbuf(1, buffer_size, 1, 
staging_allocator); 2375 | ncnn::Mat hostbuf(1, buffer_size, 1); 2376 | 2377 | void* devptr = devbuf.mapped_ptr(); 2378 | void* hostptr = hostbuf.data; 2379 | 2380 | const int cmd_loop = 10; 2381 | 2382 | for (int i = 0; i < cmd_loop; i++) 2383 | { 2384 | // reset cache 2385 | staging_allocator->flush(devbuf.data); 2386 | memset(hostptr, 0, buffer_size); 2387 | 2388 | ncnn::sleep(100); 2389 | 2390 | // time this 2391 | double t0 = ncnn::get_current_time(); 2392 | 2393 | staging_allocator->invalidate(devbuf.data); 2394 | 2395 | memcpy(hostptr, devptr, buffer_size); 2396 | 2397 | double t1 = ncnn::get_current_time(); 2398 | 2399 | double time = t1 - t0; 2400 | 2401 | double gbps = buffer_size / time / 1000000; 2402 | 2403 | // fprintf(stderr, "%f gbps\n", gbps); 2404 | 2405 | if (gbps > max_gbps) 2406 | max_gbps = gbps; 2407 | } 2408 | } 2409 | if (from_type == 1 && to_type == 1) 2410 | { 2411 | ncnn::VkMat a(1, buffer_size, 1, allocator); 2412 | ncnn::VkMat b(1, buffer_size, 1, allocator); 2413 | 2414 | const int cmd_loop = 50; 2415 | 2416 | for (int i = 0; i < cmd_loop; i++) 2417 | { 2418 | // encode command 2419 | ncnn::VkCompute cmd(vkdev); 2420 | 2421 | cmd.record_clone(a, b, opt); 2422 | 2423 | // time this 2424 | double t0 = ncnn::get_current_time(); 2425 | 2426 | int ret = cmd.submit_and_wait(); 2427 | if (ret != 0) 2428 | { 2429 | vkdev->reclaim_staging_allocator(staging_allocator); 2430 | vkdev->reclaim_blob_allocator(allocator); 2431 | return 0; 2432 | } 2433 | 2434 | double t1 = ncnn::get_current_time(); 2435 | 2436 | double time = t1 - t0; 2437 | 2438 | double gbps = buffer_size / time / 1000000; 2439 | 2440 | // fprintf(stderr, "%f gbps\n", gbps); 2441 | 2442 | if (gbps > max_gbps) 2443 | max_gbps = gbps; 2444 | } 2445 | } 2446 | 2447 | vkdev->reclaim_staging_allocator(staging_allocator); 2448 | vkdev->reclaim_blob_allocator(allocator); 2449 | 2450 | return max_gbps; 2451 | } 2452 | 2453 | int main(int argc, char** argv) 2454 | { 2455 | if (argc != 2) 
2456 | { 2457 | fprintf(stderr, "Usage: %s [device_id]\n", argv[0]); 2458 | return -1; 2459 | } 2460 | 2461 | ncnn::create_gpu_instance(); 2462 | 2463 | const int gpu_count = ncnn::get_gpu_count(); 2464 | if (gpu_count == 0) 2465 | { 2466 | fprintf(stderr, "No vulkan device\n"); 2467 | return -1; 2468 | } 2469 | 2470 | const int device_id = atoi(argv[1]); 2471 | if (device_id < 0 || device_id >= gpu_count) 2472 | { 2473 | fprintf(stderr, "No vulkan device for %d\n", device_id); 2474 | fprintf(stderr, "Available devices:\n"); 2475 | 2476 | for (int i = 0; i < gpu_count; i++) 2477 | { 2478 | fprintf(stderr, "%d = %s\n", i, ncnn::get_gpu_info(i).device_name()); 2479 | } 2480 | 2481 | return -1; 2482 | } 2483 | 2484 | fprintf(stderr, "device = %s\n", ncnn::get_gpu_info(device_id).device_name()); 2485 | 2486 | // storage_type / arithmetic_type 2487 | // 0 = fp32 2488 | // 1 = fp16 2489 | // 2 = fp64 2490 | // 3 = int32 2491 | // 4 = int16 2492 | // 5 = int64 2493 | // 6 = int8 2494 | // 7 = bf16 2495 | // 8 = fp8 2496 | // 9 = bf8 2497 | 2498 | // packing_type 2499 | // 1 = scalar 2500 | // 4 = vec4 / dotprod 2501 | // 256 = matrix 2502 | 2503 | fprintf(stderr, "\n"); 2504 | fprintf(stderr, "fp32-scalar = %.2f GFLOPS\n", vkpeak(device_id, 0, 0, 1)); 2505 | fprintf(stderr, "fp32-vec4 = %.2f GFLOPS\n", vkpeak(device_id, 0, 0, 4)); 2506 | 2507 | fprintf(stderr, "\n"); 2508 | fprintf(stderr, "fp16-scalar = %.2f GFLOPS\n", vkpeak(device_id, 0, 1, 1)); 2509 | fprintf(stderr, "fp16-vec4 = %.2f GFLOPS\n", vkpeak(device_id, 0, 1, 4)); 2510 | fprintf(stderr, "fp16-matrix = %.2f GFLOPS\n", vkpeak(device_id, 1, 1, 256)); 2511 | 2512 | fprintf(stderr, "\n"); 2513 | fprintf(stderr, "fp64-scalar = %.2f GFLOPS\n", vkpeak(device_id, 2, 2, 1)); 2514 | fprintf(stderr, "fp64-vec4 = %.2f GFLOPS\n", vkpeak(device_id, 2, 2, 4)); 2515 | 2516 | fprintf(stderr, "\n"); 2517 | fprintf(stderr, "int32-scalar = %.2f GIOPS\n", vkpeak(device_id, 3, 3, 1)); 2518 | fprintf(stderr, "int32-vec4 = %.2f 
GIOPS\n", vkpeak(device_id, 3, 3, 4)); 2519 | 2520 | fprintf(stderr, "\n"); 2521 | fprintf(stderr, "int16-scalar = %.2f GIOPS\n", vkpeak(device_id, 3, 4, 1)); 2522 | fprintf(stderr, "int16-vec4 = %.2f GIOPS\n", vkpeak(device_id, 3, 4, 4)); 2523 | 2524 | fprintf(stderr, "\n"); 2525 | fprintf(stderr, "int64-scalar = %.2f GIOPS\n", vkpeak(device_id, 5, 5, 1)); 2526 | fprintf(stderr, "int64-vec4 = %.2f GIOPS\n", vkpeak(device_id, 5, 5, 4)); 2527 | 2528 | fprintf(stderr, "\n"); 2529 | fprintf(stderr, "int8-dotprod = %.2f GIOPS\n", vkpeak(device_id, 3, 6, 4)); 2530 | fprintf(stderr, "int8-matrix = %.2f GIOPS\n", vkpeak(device_id, 3, 6, 256)); 2531 | 2532 | fprintf(stderr, "\n"); 2533 | fprintf(stderr, "bf16-dotprod = %.2f GFLOPS\n", vkpeak(device_id, 0, 7, 4)); 2534 | fprintf(stderr, "bf16-matrix = %.2f GFLOPS\n", vkpeak(device_id, 0, 7, 256)); 2535 | 2536 | fprintf(stderr, "\n"); 2537 | fprintf(stderr, "fp8-matrix = %.2f GFLOPS\n", vkpeak(device_id, 0, 8, 256)); 2538 | fprintf(stderr, "bf8-matrix = %.2f GFLOPS\n", vkpeak(device_id, 0, 9, 256)); 2539 | 2540 | // device_type 2541 | // 0 = cpu 2542 | // 1 = gpu 2543 | 2544 | fprintf(stderr, "\n"); 2545 | fprintf(stderr, "copy-h2h = %.2f GBPS\n", vkpeak_copy(device_id, 0, 0)); 2546 | fprintf(stderr, "copy-h2d = %.2f GBPS\n", vkpeak_copy(device_id, 0, 1)); 2547 | fprintf(stderr, "copy-d2h = %.2f GBPS\n", vkpeak_copy(device_id, 1, 0)); 2548 | fprintf(stderr, "copy-d2d = %.2f GBPS\n", vkpeak_copy(device_id, 1, 1)); 2549 | 2550 | ncnn::destroy_gpu_instance(); 2551 | 2552 | return 0; 2553 | } 2554 | --------------------------------------------------------------------------------