├── .gitignore ├── LICENSE ├── README.md ├── example_1 ├── CMakeLists.txt └── main.cu ├── example_10 ├── CMakeLists.txt └── main.cu ├── example_11 ├── CMakeLists.txt └── main.cu ├── example_12 ├── CMakeLists.txt └── main.cu ├── example_2 ├── CMakeLists.txt └── main.cu ├── example_3 ├── CMakeLists.txt └── main.cu ├── example_4 ├── CMakeLists.txt └── main.cu ├── example_5 ├── CMakeLists.txt └── main.cu ├── example_6 ├── CMakeLists.txt ├── options.h ├── register_layout.cu ├── register_layout.h ├── visualize_layout.cpp └── visualize_layout.h ├── example_7 ├── CMakeLists.txt └── main.cu ├── example_8 ├── CMakeLists.txt └── main.cu ├── example_9 ├── CMakeLists.txt └── main.cu └── layout_0 ├── CMakeLists.txt ├── README.aux ├── README.log ├── README.pdf ├── README.tex └── main.cu /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | build 3 | 4 | 5 | # Prerequisites 6 | *.d 7 | 8 | # Compiled Object files 9 | *.slo 10 | *.lo 11 | *.o 12 | *.obj 13 | 14 | # Precompiled Headers 15 | *.gch 16 | *.pch 17 | 18 | # Compiled Dynamic libraries 19 | *.so 20 | *.dylib 21 | *.dll 22 | 23 | # Fortran module files 24 | *.mod 25 | *.smod 26 | 27 | # Compiled Static libraries 28 | *.lai 29 | *.la 30 | *.a 31 | *.lib 32 | 33 | # Executables 34 | *.exe 35 | *.out 36 | *.app 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 yhpark 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cutlass_EX 2 | 3 | ## 0. Introduction 4 | - Goal : Development of 4-bit primitive kernels using CUTLASS 5 | 6 | ## 1. Example List 7 | 8 | ### example_1) custom code with CUTLASS 9 | 10 | ### example_2) cutlass::uint4b_t 11 | 12 | ### example_3) single-precision gemm template 13 | - [00_basic_gemm](https://github.com/NVIDIA/cutlass/blob/main/examples/00_basic_gemm/basic_gemm.cu) 14 | - This kernel computes the general matrix product (GEMM) using single-precision floating-point arithmetic and assumes all matrices have column-major layout. 15 | 16 | ### example_4) mixed-precision gemm template with cutlass utilities 17 | - [01_cutlass_utilities](https://github.com/NVIDIA/cutlass/blob/main/examples/01_cutlass_utilities/cutlass_utilities.cu) 18 | - These utilities are intended to be useful supporting components for managing tensor and matrix memory allocations, initializing and comparing results, and computing reference output.
19 | 20 | ### example_5) CUTLASS debugging tool 21 | - [02_dump_reg_shmem](https://github.com/NVIDIA/cutlass/blob/main/examples/02_dump_reg_shmem/dump_reg_shmem.cu) 22 | - Demonstrates the CUTLASS debugging tool for dumping fragments and shared memory 23 | - dumping : record the state of memory at a specific point in time 24 | 25 | ### example_6) CUTLASS layout visualization example 26 | - [03_visualize_layout](https://github.com/NVIDIA/cutlass/blob/main/examples/03_visualize_layout/visualize_layout.cpp) 27 | 28 | ### example_7) CUTLASS example to compute a batched strided gemm in two different ways 29 | - [05_batched_gemm](https://github.com/NVIDIA/cutlass/blob/main/examples/05_batched_gemm/batched_gemm.cu) 30 | - strided batched gemm : specifies a pointer to the first matrix of the batch and the stride between consecutive matrices of the batch. 31 | - array gemm : copies pointers to all matrices of the batch to device memory. 32 | 33 | 34 | ### example_8) CUTLASS turing gemm using tensor cores 35 | - [08_turing_tensorop_gemm](https://github.com/NVIDIA/cutlass/blob/main/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu) 36 | 37 | 38 | ### example_9) CUTLASS turing convolution using tensor cores 39 | - [09_turing_tensorop_conv2dfprop](https://github.com/NVIDIA/cutlass/blob/main/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu) 40 | 41 | 42 | ### example_10) CUTLASS ampere convolution using tensor cores 43 | - [16_ampere_tensorop_conv2dfprop](https://github.com/NVIDIA/cutlass/blob/main/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu) 44 | 45 | ### example_11) Handling CUTLASS Tensors 46 | 47 | ### example_12) Simple CUTLASS convolution using Tensor Cores 48 | 49 | 50 | ## 2. Guide 51 | ``` 52 | cd example_{number} 53 | mkdir build 54 | cd build 55 | cmake ..
56 | make 57 | ./main 58 | ``` 59 | 60 | 61 | 62 | ## 3. Reference 63 | * Cutlass : https://github.com/NVIDIA/cutlass -------------------------------------------------------------------------------- /example_1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | 22 | add_executable(main main.cu) 23 | 24 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 25 | 26 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 27 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 28 | -------------------------------------------------------------------------------- /example_1/main.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cutlass/cutlass.h> 3 | #include <cutlass/numeric_types.h> 4 | #include <cutlass/core_io.h> 5 | 6 | int main() 7 | { 8 | 9 | cutlass::half_t x = 2.25_hf; 10 | 11 | std::cout << x << std::endl; 12 | 13 | return 0; 14 | } -------------------------------------------------------------------------------- /example_10/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON)
13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_11/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_11/main.cu: 
-------------------------------------------------------------------------------- 1 | #include "cutlass/gemm/device/gemm.h" 2 | #include "cutlass/util/host_tensor.h" 3 | #include "cutlass/util/reference/device/gemm.h" 4 | #include "cutlass/util/reference/host/tensor_fill.h" 5 | #include "cutlass/util/tensor_view_io.h" 6 | 7 | #include "helper.h" 8 | 9 | void check_cuda_version(bool &notSupported) 10 | { 11 | // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. 12 | // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. 13 | if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) 14 | { 15 | std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; 16 | notSupported = true; 17 | } 18 | } 19 | 20 | void check_compute_capability(bool &notSupported) 21 | { 22 | cudaDeviceProp props; 23 | CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); 24 | if (!(props.major >= 8)) 25 | { 26 | std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
27 | << std::endl; 28 | notSupported = true; 29 | } 30 | } 31 | 32 | // Computes the output tensor size (NKPQ) 33 | cutlass::Tensor4DCoord calc_output_size(cutlass::Tensor4DCoord &input_size, cutlass::Tensor4DCoord &padding, cutlass::Tensor4DCoord &filter_size, cutlass::MatrixCoord &conv_stride) 34 | { 35 | return cutlass::Tensor4DCoord( 36 | input_size.n(), 37 | (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, 38 | (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, 39 | filter_size.n()); 40 | } 41 | 42 | template < 43 | typename Element_, // Data type of element stored within tensor (concept: NumericType) 44 | typename Layout_ // Defines a mapping from logical coordinate to linear memory (concept: Layout) 45 | > 46 | void show_tensor_view(cutlass::HostTensor<Element_, Layout_> &tensor) 47 | { 48 | std::cout << "tensor size : " << tensor.size() << std::endl; 49 | std::cout << "tensor shape : " << tensor.extent() << std::endl; 50 | std::cout << "tensor data type : " << typeid(*tensor.host_data()).name() << std::endl; 51 | std::cout << "tensor view : " << std::endl; 52 | std::cout << tensor.host_view() << std::endl; 53 | } 54 | 55 | int main(int argc, char const **args) 56 | { 57 | // 0. cuda & device arch version check 58 | bool notSupported = false; 59 | check_cuda_version(notSupported); 60 | check_compute_capability(notSupported); 61 | if (notSupported) 62 | return 0; 63 | 64 | // 1. Allocate host-device tensors using the CUTLASS Utilities.
65 | cutlass::Tensor4DCoord input_size{1, 4, 4, 3}; // N, H, W, C 66 | using ElementInputA = cutlass::uint4b_t; // Data type of elements in input tensor 67 | using LayoutInputA = cutlass::layout::TensorNHWC; 68 | cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(input_size); 69 | 70 | cutlass::Tensor4DCoord filter_size{2, 3, 3, 3}; // K, KH, KW, C 71 | using ElementInputB = cutlass::half_t; // Data type of elements in input tensor 72 | using LayoutInputB = cutlass::layout::TensorNHWC; 73 | cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(filter_size); 74 | 75 | cutlass::Tensor4DCoord padding{0, 0, 0, 0}; // T, B, L, R 76 | cutlass::MatrixCoord conv_stride{1, 1}; 77 | cutlass::Tensor4DCoord output_size = calc_output_size(input_size, padding, filter_size, conv_stride); 78 | using ElementOutput = float; // Data type of elements in output tensor 79 | using LayoutOutput = cutlass::layout::TensorNHWC; 80 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(output_size); 81 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(output_size); 82 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(output_size); 83 | 84 | // Initialize tensors 85 | 86 | // Fill tensor A on host with Sequential data 87 | cutlass::reference::host::BlockFillSequential(tensor_a.host_data(), tensor_a.capacity()); 88 | show_tensor_view(tensor_a); 89 | 90 | // Fill tensor B on host with uniformly distributed random data 91 | cutlass::reference::host::TensorFillRandomUniform(tensor_b.host_view(), 1, ElementInputB(7), ElementInputB(-8), 0); 92 | show_tensor_view(tensor_b); 93 | 94 | // Fill tensor C on host with Sequential data 95 | cutlass::reference::host::BlockFillSequential(tensor_c.host_data(), tensor_c.capacity()); 96 | show_tensor_view(tensor_c); 97 | 98 | // Fill tensor D on host with zeros 99 | cutlass::reference::host::TensorFill(tensor_d.host_view()); 100 | show_tensor_view(tensor_d); 101 | 102 | // Fill tensor D for reference on host with zeros 103 | cutlass::reference::host::TensorFill(tensor_ref_d.host_view()); 104 | show_tensor_view(tensor_ref_d); 105 | 106 | // Copy data from host to
GPU 107 | tensor_a.sync_device(); 108 | tensor_b.sync_device(); 109 | tensor_c.sync_device(); 110 | tensor_d.sync_device(); 111 | tensor_ref_d.sync_device(); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /example_12/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_12/main.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <sstream> 4 | 5 | #include "cutlass/cutlass.h" 6 | #include "cutlass/gemm/device/gemm.h" 7 | #include "cutlass/conv/kernel/default_conv2d_fprop.h" 8 | #include "cutlass/conv/device/implicit_gemm_convolution.h" 9 | 10 | #include "cutlass/util/command_line.h" 11 | #include "cutlass/util/host_tensor.h" 12 | #include "cutlass/util/tensor_view_io.h" 13 | #include "cutlass/util/reference/device/gemm.h" 14 |
#include "cutlass/util/reference/host/tensor_compare.h" 15 | #include "cutlass/util/reference/host/tensor_copy.h" 16 | #include "cutlass/util/reference/host/tensor_fill.h" 17 | #include "cutlass/util/reference/host/convolution.h" 18 | #include "cutlass/util/tensor_view_io.h" 19 | 20 | #include "helper.h" 21 | 22 | void check_cuda_version(bool &notSupported) 23 | { 24 | // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. 25 | // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. 26 | if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) 27 | { 28 | std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; 29 | notSupported = true; 30 | } 31 | } 32 | 33 | void check_compute_capability(bool &notSupported) 34 | { 35 | cudaDeviceProp props; 36 | CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); 37 | if (!(props.major >= 8)) 38 | { 39 | std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
40 | << std::endl; 41 | notSupported = true; 42 | } 43 | } 44 | 45 | // Computes the output tensor size (NKPQ) 46 | cutlass::Tensor4DCoord calc_output_size(cutlass::Tensor4DCoord &input_size, cutlass::Tensor4DCoord &padding, cutlass::Tensor4DCoord &filter_size, cutlass::MatrixCoord &conv_stride) 47 | { 48 | return cutlass::Tensor4DCoord( 49 | input_size.n(), 50 | (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, 51 | (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, 52 | filter_size.n()); 53 | } 54 | 55 | template < 56 | typename Element_, // Data type of element stored within tensor (concept: NumericType) 57 | typename Layout_ // Defines a mapping from logical coordinate to linear memory (concept: Layout) 58 | > 59 | void show_tensor_view(cutlass::HostTensor<Element_, Layout_> &tensor) 60 | { 61 | std::cout << "tensor size : " << tensor.size() << std::endl; 62 | std::cout << "tensor shape : " << tensor.extent() << std::endl; 63 | std::cout << "tensor data type : " << typeid(*tensor.host_data()).name() << std::endl; 64 | std::cout << "tensor view : " << std::endl; 65 | std::cout << tensor.host_view() << std::endl; 66 | } 67 | 68 | // Compute performance in Gflop/s 69 | // Gflop/s stands for billions (10^9) of 70 | // floating-point operations per second. 71 | double gflops(double runtime_s, cutlass::Tensor4DCoord &output_size, cutlass::Tensor4DCoord &filter_size) 72 | { 73 | 74 | // Number of multiply-adds = NPQK * CRS 75 | int64_t fmas = output_size.product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); 76 | 77 | // Two flops per multiply-add 78 | return 2.0 * double(fmas) / double(1.0e9) / runtime_s; 79 | } 80 | 81 | int main(int argc, char const **args) 82 | { 83 | // 84 | // 0.
cuda & device arch version check 85 | // 86 | bool notSupported = false; 87 | check_cuda_version(notSupported); 88 | check_compute_capability(notSupported); 89 | if (notSupported) 90 | return 0; 91 | 92 | // 93 | // 1. Allocate host-device tensors using the CUTLASS Utilities. 94 | // 95 | using DataType = cutlass::half_t; // Data type of elements in output tensor 96 | using TensorLayout = cutlass::layout::TensorNHWC; 97 | using ElementAccumulator = float; // Data type of accumulator 98 | using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) 99 | 100 | cutlass::Tensor4DCoord input_size{1, 64, 64, 32}; // N, H, W, C 101 | cutlass::HostTensor<DataType, TensorLayout> input_tensor(input_size); 102 | 103 | cutlass::Tensor4DCoord filter_size{64, 3, 3, 32}; // K, KH, KW, C 104 | cutlass::HostTensor<DataType, TensorLayout> filter_tensor(filter_size); 105 | 106 | cutlass::Tensor4DCoord padding{0, 0, 0, 0}; // T, B, L, R 107 | cutlass::MatrixCoord conv_stride{1, 1}; 108 | cutlass::MatrixCoord dilation{1, 1}; 109 | cutlass::Tensor4DCoord output_size = calc_output_size(input_size, padding, filter_size, conv_stride); 110 | std::cout << "output_size : " << output_size << std::endl; 111 | cutlass::HostTensor<DataType, TensorLayout> output_tensor(output_size); 112 | cutlass::HostTensor<DataType, TensorLayout> tensor_d(output_size); 113 | cutlass::HostTensor<DataType, TensorLayout> tensor_ref_d(output_size); 114 | 115 | // 116 | // 2.
Initialize tensors 117 | // 118 | // Fill tensor input_tensor with the constant 2 (sequential fill kept for reference) 119 | // cutlass::reference::host::BlockFillSequential(input_tensor.host_data(), input_tensor.capacity()); 120 | cutlass::reference::host::TensorFill(input_tensor.host_view(), cutlass::half_t(2)); 121 | // show_tensor_view(input_tensor); 122 | 123 | // Fill tensor filter_tensor with ones 124 | cutlass::reference::host::TensorFill(filter_tensor.host_view(), cutlass::half_t(1)); 125 | // show_tensor_view(filter_tensor); 126 | 127 | // Fill the output tensors with zeros 128 | cutlass::reference::host::TensorFill(output_tensor.host_view()); 129 | // show_tensor_view(output_tensor); 130 | cutlass::reference::host::TensorFill(tensor_d.host_view()); 131 | // show_tensor_view(tensor_d); 132 | cutlass::reference::host::TensorFill(tensor_ref_d.host_view()); 133 | // show_tensor_view(tensor_ref_d); 134 | 135 | // 136 | // 3. Compute reference implementation on host side 137 | // 138 | // Construct Conv2dProblemSize with user defined output size 139 | cutlass::conv::Conv2dProblemSize problem_size( 140 | input_size, 141 | filter_size, 142 | padding, 143 | conv_stride, 144 | dilation, 145 | output_size, 146 | cutlass::conv::Mode::kCrossCorrelation, 147 | 1 // Split K dimension into 1 partition 148 | ); 149 | 150 | ElementComputeEpilogue alpha{1}; 151 | ElementComputeEpilogue beta{0}; 152 | 153 | std::cout << "Conv2d on host...\n"; 154 | cutlass::reference::host::Conv2dFprop< 155 | DataType, TensorLayout, 156 | DataType, TensorLayout, 157 | DataType, TensorLayout, 158 | ElementAccumulator, 159 | ElementComputeEpilogue>( 160 | problem_size, 161 | input_tensor.host_ref(), 162 | filter_tensor.host_ref(), 163 | output_tensor.host_ref(), 164 | tensor_ref_d.host_ref(), 165 | alpha, 166 | beta); 167 | 168 | // show_tensor_view(tensor_ref_d); 169 | 170 | // 171 | // 4.
Copy data from host to GPU 172 | // 173 | input_tensor.sync_device(); 174 | filter_tensor.sync_device(); 175 | output_tensor.sync_device(); 176 | tensor_d.sync_device(); 177 | 178 | // 179 | // 5. define kernel properties type 180 | // 181 | // Whether to use tensor cores or regular SIMT cores on GPU SM 182 | using MMAOp = cutlass::arch::OpClassTensorOp; 183 | // SM architecture number 184 | using SmArch = cutlass::arch::Sm86; 185 | // Threadblock tile shape 186 | using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>; 187 | // Warp tile shape 188 | using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; 189 | // MMA (Tensor Core instruction, in this case) tile shape 190 | using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; 191 | // How the kernel schedules threadblocks 192 | using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; 193 | // Number of pipeline stages to use 194 | constexpr int NumStages = 3; 195 | // Which iterator algorithm to use: Analytic or Optimized 196 | static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized; 197 | // The epilogue part of the kernel 198 | using EpilogueOp = cutlass::epilogue::thread::LinearCombination< 199 | DataType, // Data type of output matrix. 200 | 128 / cutlass::sizeof_bits<DataType>::value, // The number of elements per vectorized 201 | // memory access. This becomes the vector width of 202 | // math instructions in the epilogue too.
203 | ElementAccumulator, // Data type of accumulator 204 | ElementComputeEpilogue>; // Data type for alpha/beta in linear combination 205 | // Kernel properties type 206 | using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< 207 | DataType, TensorLayout, 208 | DataType, TensorLayout, 209 | DataType, TensorLayout, 210 | ElementAccumulator, 211 | MMAOp, 212 | SmArch, 213 | ThreadblockShape, 214 | WarpShape, 215 | InstructionShape, 216 | EpilogueOp, 217 | SwizzleThreadBlock, 218 | NumStages, 219 | cutlass::arch::OpMultiplyAdd, 220 | IteratorAlgorithm>::Kernel; 221 | 222 | // Type of the actual kernel 223 | using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; 224 | 225 | // define arguments for CUTLASS Convolution 226 | // Construct ImplicitGemm::Arguments structure with conv2d problem size, data pointers, and epilogue values 227 | typename ImplicitGemm::Arguments arguments{ 228 | problem_size, 229 | input_tensor.device_ref(), 230 | filter_tensor.device_ref(), 231 | output_tensor.device_ref(), 232 | tensor_d.device_ref(), 233 | {alpha, beta}, 234 | }; 235 | 236 | // 237 | // 6. Initialize CUTLASS Convolution 238 | // 239 | cutlass::Status status; 240 | ImplicitGemm implicit_gemm_op; 241 | size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); 242 | // Allocate workspace memory 243 | cutlass::device_memory::allocation<uint8_t> workspace(workspace_size); 244 | status = implicit_gemm_op.can_implement(arguments); 245 | CUTLASS_CHECK(status); 246 | status = implicit_gemm_op.initialize(arguments, workspace.get()); 247 | CUTLASS_CHECK(status); 248 | 249 | // 250 | // 7. Launch CUTLASS kernel 251 | // 252 | std::cout << "Conv2d on device using cutlass kernel...\n"; 253 | status = implicit_gemm_op(); 254 | 255 | CUTLASS_CHECK(status); 256 | 257 | // 258 | // 8.
Check if CUTLASS kernel and reference kernel produced the same output 259 | // 260 | tensor_d.sync_host(); // cutlass output 261 | 262 | bool passed = cutlass::reference::host::TensorEquals(tensor_d.host_view(), tensor_ref_d.host_view()); 263 | 264 | if (!passed) 265 | { 266 | status = cutlass::Status::kErrorInternal; 267 | std::cout << "ERROR - results miscompared.\n"; 268 | } 269 | else 270 | { 271 | status = cutlass::Status::kSuccess; 272 | std::cout << "Passed.\n"; 273 | } 274 | CUTLASS_CHECK(status); 275 | // show_tensor_view(tensor_d); 276 | 277 | // 278 | // 9. Performance measurement 279 | // 280 | cudaEvent_t events[2]; 281 | cudaError_t error; 282 | 283 | for (auto &event : events) 284 | { 285 | error = cudaEventCreate(&event); 286 | if (error != cudaSuccess) 287 | { 288 | std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(error) << std::endl; 289 | } 290 | } 291 | 292 | // Record an event at the start of a series of convolution operations. 293 | error = cudaEventRecord(events[0]); 294 | if (error != cudaSuccess) 295 | { 296 | std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(error) << std::endl; 297 | } 298 | 299 | // Launch a sequence of implicit GEMM operations on the device. 300 | int iterations = 20; 301 | for (int iteration = 0; iteration < iterations; ++iteration) 302 | { 303 | status = implicit_gemm_op(); 304 | CUTLASS_CHECK(status); 305 | } 306 | 307 | // Record an event when the convolutions have been launched. 308 | error = cudaEventRecord(events[1]); 309 | if (error != cudaSuccess) 310 | { 311 | std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(error) << std::endl; 312 | } 313 | 314 | // Wait for work on the device to complete. 315 | error = cudaEventSynchronize(events[1]); 316 | if (error != cudaSuccess) 317 | { 318 | std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(error) << std::endl; 319 | } 320 | 321 | // Measure elapsed runtime. 
322 | float runtime_ms = 0; 323 | error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); 324 | if (error != cudaSuccess) 325 | { 326 | std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(error) << std::endl; 327 | } 328 | 329 | // Print average run time and floating-point throughput (Gflop/s). 330 | runtime_ms = double(runtime_ms) / double(iterations); 331 | double gflops_v = gflops(runtime_ms / 1000.0, output_size, filter_size); 332 | std::cout << "runtime : " << runtime_ms << "[ms], gflops : " << gflops_v << std::endl; 333 | 334 | // Cleanup 335 | for (auto event : events) 336 | { 337 | (void)cudaEventDestroy(event); 338 | } 339 | 340 | return 0; 341 | } -------------------------------------------------------------------------------- /example_2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | 22 | add_executable(main main.cu) 23 | 24 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 25 | 26 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 27 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 28 | -------------------------------------------------------------------------------- /example_2/main.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <string> 3 | #include <cstdint> 4 | #include <cutlass/numeric_types.h> 5 | #include <cutlass/platform/platform.h> 6 | 7 | int main() 8 | {
9 | // 4bit data type 10 | using ui4bit = cutlass::uint4b_t; 11 | 12 | ui4bit ubit4_a{4}; 13 | ui4bit ubit4_b{2}; 14 | 15 | // address 16 | std::cout << "address of ubit4_a : " << &ubit4_a << std::endl; 17 | std::cout << "address of ubit4_b : " << &ubit4_b << std::endl; 18 | 19 | // array 20 | int arr0[]{1, 2, 3, 4, 5}; 21 | std::cout << "address of int arr0[0] : " << &arr0[0] << std::endl; 22 | std::cout << "address of int arr0[1] : " << &arr0[1] << std::endl; 23 | std::cout << "address of int arr0[2] : " << &arr0[2] << std::endl; 24 | std::cout << "address of int arr0[3] : " << &arr0[3] << std::endl; 25 | std::cout << "address of int arr0[4] : " << &arr0[4] << std::endl; 26 | 27 | cutlass::half_t arr1[]{1_hf, 2_hf, 3_hf, 4_hf, 5_hf}; 28 | std::cout << "address of half_t arr1[0] : " << &arr1[0] << std::endl; 29 | std::cout << "address of half_t arr1[1] : " << &arr1[1] << std::endl; 30 | std::cout << "address of half_t arr1[2] : " << &arr1[2] << std::endl; 31 | std::cout << "address of half_t arr1[3] : " << &arr1[3] << std::endl; 32 | std::cout << "address of half_t arr1[4] : " << &arr1[4] << std::endl; 33 | 34 | // std::uint8_t arr2[]{1, 2, 3, 4, 5}; 35 | // std::cout << "address of uint8_t arr2[0] : " << &arr2[0] << std::endl; 36 | // std::cout << "address of uint8_t arr2[1] : " << &arr2[1] << std::endl; 37 | // std::cout << "address of uint8_t arr2[2] : " << &arr2[2] << std::endl; 38 | // std::cout << "address of uint8_t arr2[3] : " << &arr2[3] << std::endl; 39 | // std::cout << "address of uint8_t arr2[4] : " << &arr2[4] << std::endl; 40 | 41 | ui4bit arr3[]{1, 2, 3, 4, 5}; 42 | std::cout << "address of uint4b_t arr3[0] : " << &arr3[0] << std::endl; 43 | std::cout << "address of uint4b_t arr3[1] : " << &arr3[1] << std::endl; 44 | std::cout << "address of uint4b_t arr3[2] : " << &arr3[2] << std::endl; 45 | std::cout << "address of uint4b_t arr3[3] : " << &arr3[3] << std::endl; 46 | std::cout << "address of uint4b_t arr3[4] : " << &arr3[4] << std::endl; 47 | 48 | // data size check 49
| cutlass::sizeof_bits<ui4bit> sbit4; 50 | std::cout << "uint4b_t bit size : " << sbit4.value << std::endl; 51 | std::cout << "real storage size : " << sizeof(sbit4) << std::endl; 52 | 53 | // numeric range 54 | cutlass::platform::numeric_limits<ui4bit> nlb4; 55 | std::cout << "uint4b_t min : " << nlb4.lowest() << std::endl; 56 | std::cout << "uint4b_t max : " << nlb4.max() << std::endl; 57 | std::cout << "uint4b_t is_integer : " << nlb4.is_integer << std::endl; 58 | 59 | // simple numeric calculate 60 | std::cout << "ubit4_a(" + std::to_string(ubit4_a) + ") + ubit4_b(" + std::to_string(ubit4_b) + ") : " << ubit4_a + ubit4_b << std::endl; 61 | std::cout << "ubit4_a(" + std::to_string(ubit4_a) + ") - ubit4_b(" + std::to_string(ubit4_b) + ") : " << ubit4_a - ubit4_b << std::endl; 62 | std::cout << "ubit4_a(" + std::to_string(ubit4_a) + ") * ubit4_b(" + std::to_string(ubit4_b) + ") : " << ubit4_a * ubit4_b << std::endl; 63 | std::cout << "ubit4_a(" + std::to_string(ubit4_a) + ") / ubit4_b(" + std::to_string(ubit4_b) + ") : " << ubit4_a / ubit4_b << std::endl; 64 | 65 | return 0; 66 | } -------------------------------------------------------------------------------- /example_3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 |
include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_3/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /* 33 | This example demonstrates how to call a CUTLASS GEMM kernel and provides a naive reference 34 | matrix multiply kernel to verify its correctness. 35 | 36 | The CUTLASS Gemm template is instantiated in the function CutlassSgemmNN. This kernel computes 37 | the general matrix product (GEMM) using single-precision floating-point arithmetic and assumes 38 | all matrices have column-major layout. 39 | 40 | The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices. 41 | See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available 42 | in CUTLASS. 43 | 44 | https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/ 45 | 46 | Aside from defining and launching the SGEMM kernel, this example does not use any other components 47 | or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are 48 | prevalent in the CUTLASS unit tests. 49 | 50 | This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to 51 | highlight the minimum amount of differences needed to transition to cutlass-2.0. 
52 | 53 | Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu 54 | */ 55 | 56 | // Standard Library includes 57 | #include <iostream> 58 | #include <sstream> 59 | #include <vector> 60 | 61 | // Helper methods to check for errors 62 | #include "helper.h" 63 | 64 | // 65 | // CUTLASS includes needed for single-precision GEMM kernel 66 | // 67 | 68 | // Defines cutlass::gemm::device::Gemm, the generic Gemm computation template class. 69 | #include "cutlass/gemm/device/gemm.h" 70 | 71 | /////////////////////////////////////////////////////////////////////////////////////////////////// 72 | // 73 | // This function defines a CUTLASS GEMM kernel instantiation, constructs its parameters object, 74 | // and launches it on the CUDA device. 75 | // 76 | /////////////////////////////////////////////////////////////////////////////////////////////////// 77 | 78 | /// Define a CUTLASS GEMM template and launch a GEMM kernel. 79 | cudaError_t CutlassSgemmNN( 80 | int M, 81 | int N, 82 | int K, 83 | float alpha, 84 | float const *A, 85 | int lda, 86 | float const *B, 87 | int ldb, 88 | float beta, 89 | float *C, 90 | int ldc) 91 | { 92 | 93 | // Define the type of a single-precision CUTLASS GEMM with column-major 94 | // input matrices and 128x128x8 threadblock tile size (chosen by default). 95 | // 96 | // To keep the interface manageable, several helpers are defined for plausible compositions 97 | // including the following example for single-precision GEMM. Typical values are used as 98 | // default template arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for more details. 
99 | // 100 | // To view the full gemm device API interface, see `cutlass/gemm/device/gemm.h` 101 | 102 | using ColumnMajor = cutlass::layout::ColumnMajor; 103 | 104 | using CutlassGemm = cutlass::gemm::device::Gemm<float, ColumnMajor, float, ColumnMajor, float, ColumnMajor>; // Data-type and layout of the A, B, and C matrices 110 | 111 | // Define a CUTLASS GEMM type 112 | CutlassGemm gemm_operator; 113 | 114 | // Construct the CUTLASS GEMM arguments object. 115 | // 116 | // One of CUTLASS's design patterns is to define gemm argument objects that are constructible 117 | // in host code and passed to kernels by value. These may include pointers, strides, scalars, 118 | // and other arguments needed by Gemm and its components. 119 | // 120 | // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible 121 | // arguments to kernels and (2.) minimized initialization overhead on kernel entry. 122 | // 123 | CutlassGemm::Arguments args({M, N, K}, // Gemm Problem dimensions 124 | {A, lda}, // Tensor-ref for source matrix A 125 | {B, ldb}, // Tensor-ref for source matrix B 126 | {C, ldc}, // Tensor-ref for source matrix C 127 | {C, ldc}, // Tensor-ref for destination matrix D (may be different memory than source C matrix) 128 | {alpha, beta}); // Scalars used in the Epilogue 129 | 130 | // 131 | // Launch the CUTLASS GEMM kernel. 132 | // 133 | 134 | cutlass::Status status = gemm_operator(args); 135 | 136 | // 137 | // Return a cudaError_t if the CUTLASS GEMM operator returned an error code. 138 | // 139 | 140 | if (status != cutlass::Status::kSuccess) 141 | { 142 | return cudaErrorUnknown; 143 | } 144 | 145 | // Return success, if no errors were encountered. 146 | return cudaSuccess; 147 | } 148 | 149 | /////////////////////////////////////////////////////////////////////////////////////////////////// 150 | // 151 | // The source code after this point in the file is generic CUDA using the CUDA Runtime API 152 | // and simple CUDA kernels to initialize matrices and compute the general matrix product. 
153 | // 154 | /////////////////////////////////////////////////////////////////////////////////////////////////// 155 | 156 | /// Kernel to initialize a matrix with small integers. 157 | __global__ void InitializeMatrix_kernel( 158 | float *matrix, 159 | int rows, 160 | int columns, 161 | int seed = 0) 162 | { 163 | 164 | int i = threadIdx.x + blockIdx.x * blockDim.x; 165 | int j = threadIdx.y + blockIdx.y * blockDim.y; 166 | 167 | if (i < rows && j < columns) 168 | { 169 | int offset = i + j * rows; 170 | 171 | // Generate arbitrary elements. 172 | int const k = 16807; 173 | int const m = 16; 174 | float value = float(((offset + seed) * k % m) - m / 2); 175 | 176 | matrix[offset] = value; 177 | } 178 | } 179 | 180 | /// Simple function to initialize a matrix to arbitrary small integers. 181 | cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) 182 | { 183 | 184 | dim3 block(16, 16); 185 | dim3 grid( 186 | (rows + block.x - 1) / block.x, 187 | (columns + block.y - 1) / block.y); 188 | 189 | InitializeMatrix_kernel<<<grid, block>>>(matrix, rows, columns, seed); 190 | 191 | return cudaGetLastError(); 192 | } 193 | 194 | /////////////////////////////////////////////////////////////////////////////////////////////////// 195 | 196 | /// Allocates device memory for a matrix then fills with arbitrary small integers. 197 | cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) 198 | { 199 | cudaError_t result; 200 | 201 | size_t sizeof_matrix = sizeof(float) * rows * columns; 202 | 203 | // Allocate device memory. 204 | result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix); 205 | 206 | if (result != cudaSuccess) 207 | { 208 | std::cerr << "Failed to allocate matrix: " 209 | << cudaGetErrorString(result) << std::endl; 210 | return result; 211 | } 212 | 213 | // Clear the allocation. 
214 | result = cudaMemset(*matrix, 0, sizeof_matrix); 215 | 216 | if (result != cudaSuccess) 217 | { 218 | std::cerr << "Failed to clear matrix device memory: " 219 | << cudaGetErrorString(result) << std::endl; 220 | return result; 221 | } 222 | 223 | // Initialize matrix elements to arbitrary small integers. 224 | result = InitializeMatrix(*matrix, rows, columns, seed); 225 | 226 | if (result != cudaSuccess) 227 | { 228 | std::cerr << "Failed to initialize matrix: " 229 | << cudaGetErrorString(result) << std::endl; 230 | return result; 231 | } 232 | 233 | return result; 234 | } 235 | 236 | /////////////////////////////////////////////////////////////////////////////////////////////////// 237 | 238 | /// Naive reference GEMM computation. 239 | __global__ void ReferenceGemm_kernel( 240 | int M, 241 | int N, 242 | int K, 243 | float alpha, 244 | float const *A, 245 | int lda, 246 | float const *B, 247 | int ldb, 248 | float beta, 249 | float *C, 250 | int ldc) 251 | { 252 | 253 | int i = threadIdx.x + blockIdx.x * blockDim.x; 254 | int j = threadIdx.y + blockIdx.y * blockDim.y; 255 | 256 | if (i < M && j < N) 257 | { 258 | float accumulator = 0; 259 | 260 | for (int k = 0; k < K; ++k) 261 | { 262 | accumulator += A[i + k * lda] * B[k + j * ldb]; 263 | } 264 | 265 | C[i + j * ldc] = alpha * accumulator + beta * C[i + j * ldc]; 266 | } 267 | } 268 | 269 | /// Reference GEMM computation. 
270 | cudaError_t ReferenceGemm( 271 | int M, 272 | int N, 273 | int K, 274 | float alpha, 275 | float const *A, 276 | int lda, 277 | float const *B, 278 | int ldb, 279 | float beta, 280 | float *C, 281 | int ldc) 282 | { 283 | 284 | dim3 block(16, 16); 285 | dim3 grid( 286 | (M + block.x - 1) / block.x, 287 | (N + block.y - 1) / block.y); 288 | 289 | ReferenceGemm_kernel<<<grid, block>>>(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); 290 | 291 | return cudaGetLastError(); 292 | } 293 | 294 | /////////////////////////////////////////////////////////////////////////////////////////////////// 295 | 296 | /// Allocate several matrices in GPU device memory and call a single-precision 297 | /// CUTLASS GEMM kernel. 298 | cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) 299 | { 300 | cudaError_t result; 301 | 302 | // 303 | // Define several matrices to be used as operands to GEMM kernels. 304 | // 305 | 306 | // Compute leading dimensions for each matrix. 307 | int lda = M; 308 | int ldb = K; 309 | int ldc = M; 310 | 311 | // Compute size in bytes of the C matrix. 312 | size_t sizeof_C = sizeof(float) * ldc * N; 313 | 314 | // Define pointers to matrices in GPU device memory. 315 | float *A; 316 | float *B; 317 | float *C_cutlass; 318 | float *C_reference; 319 | 320 | // 321 | // Allocate matrices in GPU device memory with arbitrary seeds. 
322 | // 323 | 324 | result = AllocateMatrix(&A, M, K, 0); 325 | 326 | if (result != cudaSuccess) 327 | { 328 | return result; 329 | } 330 | 331 | result = AllocateMatrix(&B, K, N, 17); 332 | 333 | if (result != cudaSuccess) 334 | { 335 | cudaFree(A); 336 | return result; 337 | } 338 | 339 | result = AllocateMatrix(&C_cutlass, M, N, 101); 340 | 341 | if (result != cudaSuccess) 342 | { 343 | cudaFree(A); 344 | cudaFree(B); 345 | return result; 346 | } 347 | 348 | result = AllocateMatrix(&C_reference, M, N, 101); 349 | 350 | if (result != cudaSuccess) 351 | { 352 | cudaFree(A); 353 | cudaFree(B); 354 | cudaFree(C_cutlass); 355 | return result; 356 | } 357 | 358 | result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice); 359 | 360 | if (result != cudaSuccess) 361 | { 362 | std::cerr << "Failed to copy C_cutlass matrix to C_reference: " 363 | << cudaGetErrorString(result) << std::endl; 364 | 365 | cudaFree(C_reference); 366 | cudaFree(C_cutlass); 367 | cudaFree(B); 368 | cudaFree(A); 369 | 370 | return result; 371 | } 372 | 373 | // 374 | // Launch CUTLASS GEMM. 375 | // 376 | 377 | result = CutlassSgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C_cutlass, ldc); 378 | 379 | if (result != cudaSuccess) 380 | { 381 | std::cerr << "CUTLASS GEMM kernel failed: " 382 | << cudaGetErrorString(result) << std::endl; 383 | 384 | cudaFree(C_reference); 385 | cudaFree(C_cutlass); 386 | cudaFree(B); 387 | cudaFree(A); 388 | 389 | return result; 390 | } 391 | 392 | // 393 | // Verify. 394 | // 395 | 396 | // Launch reference GEMM 397 | result = ReferenceGemm(M, N, K, alpha, A, lda, B, ldb, beta, C_reference, ldc); 398 | 399 | if (result != cudaSuccess) 400 | { 401 | std::cerr << "Reference GEMM kernel failed: " 402 | << cudaGetErrorString(result) << std::endl; 403 | 404 | cudaFree(C_reference); 405 | cudaFree(C_cutlass); 406 | cudaFree(B); 407 | cudaFree(A); 408 | 409 | return result; 410 | } 411 | 412 | // Copy to host and verify equivalence. 
413 | std::vector<float> host_cutlass(ldc * N, 0); 414 | std::vector<float> host_reference(ldc * N, 0); 415 | 416 | result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost); 417 | 418 | if (result != cudaSuccess) 419 | { 420 | std::cerr << "Failed to copy CUTLASS GEMM results: " 421 | << cudaGetErrorString(result) << std::endl; 422 | 423 | cudaFree(C_reference); 424 | cudaFree(C_cutlass); 425 | cudaFree(B); 426 | cudaFree(A); 427 | 428 | return result; 429 | } 430 | 431 | result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost); 432 | 433 | if (result != cudaSuccess) 434 | { 435 | std::cerr << "Failed to copy Reference GEMM results: " 436 | << cudaGetErrorString(result) << std::endl; 437 | 438 | cudaFree(C_reference); 439 | cudaFree(C_cutlass); 440 | cudaFree(B); 441 | cudaFree(A); 442 | 443 | return result; 444 | } 445 | 446 | // 447 | // Free device memory allocations. 448 | // 449 | 450 | cudaFree(C_reference); 451 | cudaFree(C_cutlass); 452 | cudaFree(B); 453 | cudaFree(A); 454 | 455 | // 456 | // Test for bit equivalence of results. 457 | // 458 | 459 | if (host_cutlass != host_reference) 460 | { 461 | std::cerr << "CUTLASS results incorrect." << std::endl; 462 | 463 | return cudaErrorUnknown; 464 | } 465 | 466 | return cudaSuccess; 467 | } 468 | 469 | /////////////////////////////////////////////////////////////////////////////////////////////////// 470 | 471 | /// Entry point to basic_gemm example. 472 | // 473 | // usage: 474 | // 475 | // 00_basic_gemm <M> <N> <K> <alpha> <beta> 476 | // 477 | int main(int argc, const char *arg[]) 478 | { 479 | 480 | // 481 | // Parse the command line to obtain GEMM dimensions and scalar values. 482 | // 483 | 484 | // GEMM problem dimensions. 485 | int problem[3] = {32, 128, 64}; 486 | 487 | for (int i = 1; i < argc && i < 4; ++i) 488 | { 489 | std::stringstream ss(arg[i]); 490 | ss >> problem[i - 1]; 491 | } 492 | 493 | // Scalars used for linear scaling the result of the matrix product. 
494 | float scalars[2] = {1, 0}; 495 | 496 | for (int i = 4; i < argc && i < 6; ++i) 497 | { 498 | std::stringstream ss(arg[i]); 499 | ss >> scalars[i - 4]; 500 | } 501 | 502 | // 503 | // Run the CUTLASS GEMM test. 504 | // 505 | 506 | cudaError_t result = TestCutlassGemm( 507 | problem[0], // GEMM M dimension 508 | problem[1], // GEMM N dimension 509 | problem[2], // GEMM K dimension 510 | scalars[0], // alpha 511 | scalars[1] // beta 512 | ); 513 | 514 | if (result == cudaSuccess) 515 | { 516 | std::cout << "Passed." << std::endl; 517 | } 518 | 519 | // Exit. 520 | return result == cudaSuccess ? 0 : -1; 521 | } 522 | 523 | /////////////////////////////////////////////////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /example_4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | 
-------------------------------------------------------------------------------- /example_4/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /* 33 | This example demonstrates several CUTLASS utilities in the context of a mixed-precision 34 | floating-point matrix product computation. 35 | 36 | These utilities are intended to be useful supporting components for managing tensor and matrix 37 | memory allocations, initializing and comparing results, and computing reference output. 38 | 39 | CUTLASS utilities are defined in the directory `tools/util`, and definitions appear in 40 | namespace `cutlass::` or an inner namespace therein. Operations in `cutlass::reference::` have 41 | both host-side and device-side implementations, and the choice to use device-side initialization 42 | and host-side verification in this example was arbitrary. 43 | 44 | 45 | cutlass::half_t 46 | 47 | This is a numeric type implementing IEEE half-precision quantities. It is functional in host 48 | and device code. In host-side code, CUTLASS_ENABLE_F16C optionally enables hardware-accelerated 49 | numeric conversion on x86-64 CPUs that support F16C extensions. In device code, all available 50 | hardware is used to implement conversion and numeric operations. 51 | 52 | 53 | cutlass::HostTensor<> 54 | 55 | This template class simplifies the creation of tensors for all supported layouts. It simplifies 56 | allocation and management of host- and device-side memory allocations. 
57 | 58 | This class offers methods device_view() and host_view() to provide TensorView objects for 59 | device- and host-side memory allocations. 60 | 61 | 62 | cutlass::reference::device::TensorFillRandomGaussian() 63 | 64 | This template function initializes elements of a tensor to a random Gaussian distribution. It 65 | uses cuRAND in device code to compute random numbers. 66 | 67 | 68 | cutlass::reference::host::Gemm<> 69 | 70 | This template function computes the general matrix product. This template supports unique 71 | data types for each matrix operand, the internal accumulation type, and the scalar parameters 72 | alpha and beta. 73 | 74 | 75 | cutlass::reference::host::TensorEquals() 76 | 77 | Compares two tensors of identical rank and returns true if values are bit equivalent. 78 | 79 | */ 80 | 81 | // Standard Library includes 82 | #include <fstream> 83 | #include <iostream> 84 | #include <sstream> 85 | #include <vector> 86 | 87 | // CUTLASS includes needed for half-precision GEMM kernel 88 | #include "cutlass/cutlass.h" 89 | #include "cutlass/core_io.h" 90 | #include "cutlass/layout/matrix.h" 91 | #include "cutlass/gemm/device/gemm.h" 92 | 93 | // 94 | // CUTLASS utility includes 95 | // 96 | 97 | // Defines operator<<() to write TensorView objects to std::ostream 98 | #include "cutlass/util/tensor_view_io.h" 99 | 100 | // Defines cutlass::HostTensor<> 101 | #include "cutlass/util/host_tensor.h" 102 | 103 | // Defines cutlass::half_t 104 | #include "cutlass/numeric_types.h" 105 | 106 | // Defines device_memory::copy_device_to_device() 107 | #include "cutlass/util/device_memory.h" 108 | 109 | // Defines cutlass::reference::device::TensorFillRandomGaussian() 110 | #include "cutlass/util/reference/device/tensor_fill.h" 111 | 112 | // Defines cutlass::reference::host::TensorEquals() 113 | #include "cutlass/util/reference/host/tensor_compare.h" 114 | 115 | // Defines cutlass::reference::host::Gemm() 116 | #include "cutlass/util/reference/host/gemm.h" 117 | 118 | #pragma warning(disable : 4503) 
119 | /////////////////////////////////////////////////////////////////////////////////////////////////// 120 | 121 | /// Define a CUTLASS GEMM template and launch a GEMM kernel. 122 | cudaError_t cutlass_hgemm_nn( 123 | int M, 124 | int N, 125 | int K, 126 | cutlass::half_t alpha, 127 | cutlass::half_t const *A, 128 | cutlass::layout::ColumnMajor::Stride::Index lda, 129 | cutlass::half_t const *B, 130 | cutlass::layout::ColumnMajor::Stride::Index ldb, 131 | cutlass::half_t beta, 132 | cutlass::half_t *C, 133 | cutlass::layout::ColumnMajor::Stride::Index ldc) 134 | { 135 | 136 | // Define the GEMM operation 137 | using Gemm = cutlass::gemm::device::Gemm< 138 | cutlass::half_t, // ElementA 139 | cutlass::layout::ColumnMajor, // LayoutA 140 | cutlass::half_t, // ElementB 141 | cutlass::layout::ColumnMajor, // LayoutB 142 | cutlass::half_t, // ElementOutput 143 | cutlass::layout::ColumnMajor // LayoutOutput 144 | >; 145 | 146 | Gemm gemm_op; 147 | 148 | cutlass::Status status = gemm_op({{M, N, K}, 149 | {A, lda}, 150 | {B, ldb}, 151 | {C, ldc}, 152 | {C, ldc}, 153 | {alpha, beta}}); 154 | 155 | if (status != cutlass::Status::kSuccess) 156 | { 157 | return cudaErrorUnknown; 158 | } 159 | 160 | return cudaSuccess; 161 | } 162 | 163 | /////////////////////////////////////////////////////////////////////////////////////////////////// 164 | 165 | /// Allocate several matrices in GPU device memory and call a single-precision 166 | /// CUTLASS GEMM kernel. 167 | cudaError_t TestCutlassGemm(int M, int N, int K, cutlass::half_t alpha, cutlass::half_t beta) 168 | { 169 | cudaError_t result; 170 | 171 | // 172 | // Construct cutlass::HostTensor<> using the half-precision host-side type. 173 | // 174 | // cutlass::HostTensor<> allocates memory on both the host and device corresponding to rank=2 175 | // tensors in column-major layout. Explicit synchronization methods are offered to copy the 176 | // tensor to the device or to the host. 
177 | // 178 | 179 | // M-by-K matrix of cutlass::half_t 180 | cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A(cutlass::MatrixCoord(M, K)); 181 | 182 | // K-by-N matrix of cutlass::half_t 183 | cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B(cutlass::MatrixCoord(K, N)); 184 | 185 | // M-by-N matrix of cutlass::half_t 186 | cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C_cutlass(cutlass::MatrixCoord(M, N)); 187 | 188 | // M-by-N matrix of cutlass::half_t 189 | cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C_reference(cutlass::MatrixCoord(M, N)); 190 | 191 | // 192 | // Initialize matrices with small, random integers. 193 | // 194 | 195 | // Arbitrary RNG seed value. Hard-coded for deterministic results. 196 | uint64_t seed = 2080; 197 | 198 | // Gaussian random distribution 199 | cutlass::half_t mean = 0.0_hf; 200 | cutlass::half_t stddev = 5.0_hf; 201 | 202 | // Specify the number of bits right of the binary decimal that are permitted 203 | // to be non-zero. A value of "0" here truncates random values to integers 204 | int bits_less_than_one = 0; 205 | 206 | cutlass::reference::device::TensorFillRandomGaussian( 207 | A.device_view(), 208 | seed, 209 | mean, 210 | stddev, 211 | bits_less_than_one); 212 | 213 | cutlass::reference::device::TensorFillRandomGaussian( 214 | B.device_view(), 215 | seed * 2019, 216 | mean, 217 | stddev, 218 | bits_less_than_one); 219 | 220 | cutlass::reference::device::TensorFillRandomGaussian( 221 | C_cutlass.device_view(), 222 | seed * 1993, 223 | mean, 224 | stddev, 225 | bits_less_than_one); 226 | 227 | // Copy C_cutlass into C_reference so the GEMM is correct when beta != 0. 
228 | cutlass::device_memory::copy_device_to_device( 229 | C_reference.device_data(), 230 | C_cutlass.device_data(), 231 | C_cutlass.capacity()); 232 | 233 | // Copy the device-side view into host memory 234 | C_reference.sync_host(); 235 | 236 | // 237 | // Launch the CUTLASS GEMM kernel 238 | // 239 | 240 | result = cutlass_hgemm_nn( 241 | M, 242 | N, 243 | K, 244 | alpha, 245 | A.device_data(), 246 | A.stride(0), 247 | B.device_data(), 248 | B.stride(0), 249 | beta, 250 | C_cutlass.device_data(), 251 | C_cutlass.stride(0)); 252 | 253 | if (result != cudaSuccess) 254 | { 255 | return result; 256 | } 257 | 258 | // 259 | // Verify the result using a host-side reference 260 | // 261 | 262 | // A and B were initialized using device-side procedures. The intent of this example is to 263 | // use the host-side reference GEMM, so we must perform a device-to-host copy. 264 | A.sync_host(); 265 | B.sync_host(); 266 | 267 | // Copy CUTLASS's GEMM results into host memory. 268 | C_cutlass.sync_host(); 269 | 270 | // Compute the reference result using the host-side GEMM reference implementation. 271 | cutlass::reference::host::Gemm< 272 | cutlass::half_t, // ElementA 273 | cutlass::layout::ColumnMajor, // LayoutA 274 | cutlass::half_t, // ElementB 275 | cutlass::layout::ColumnMajor, // LayoutB 276 | cutlass::half_t, // ElementOutput 277 | cutlass::layout::ColumnMajor, // LayoutOutput 278 | cutlass::half_t, 279 | cutlass::half_t> 280 | gemm_ref; 281 | 282 | gemm_ref( 283 | {M, N, K}, // problem size (type: cutlass::gemm::GemmCoord) 284 | alpha, // alpha (type: cutlass::half_t) 285 | A.host_ref(), // A (type: TensorRef) 286 | B.host_ref(), // B (type: TensorRef) 287 | beta, // beta (type: cutlass::half_t) 288 | C_reference.host_ref() // C (type: TensorRef) 289 | ); 290 | 291 | // Compare reference to computed results. 
292 | if (!cutlass::reference::host::TensorEquals( 293 | C_reference.host_view(), 294 | C_cutlass.host_view())) 295 | { 296 | 297 | char const *filename = "errors_01_cutlass_utilities.csv"; 298 | 299 | std::cerr << "Error - CUTLASS GEMM kernel differs from reference. Wrote computed and reference results to '" << filename << "'" << std::endl; 300 | 301 | // 302 | // On error, write C_cutlass and C_reference to the error file. 303 | // 304 | // Note, these are matrices of half-precision elements stored in host memory as 305 | // arrays of type cutlass::half_t. 306 | // 307 | 308 | std::ofstream file(filename); 309 | 310 | // Result of CUTLASS GEMM kernel 311 | file << "\n\nCUTLASS =\n" 312 | << C_cutlass.host_view() << std::endl; 313 | 314 | // Result of reference computation 315 | file << "\n\nReference =\n" 316 | << C_reference.host_view() << std::endl; 317 | 318 | // Return error code. 319 | return cudaErrorUnknown; 320 | } 321 | 322 | // Passed error check 323 | return cudaSuccess; 324 | } 325 | 326 | /////////////////////////////////////////////////////////////////////////////////////////////////// 327 | 328 | /// Entry point to cutlass_utilities example. 329 | // 330 | // usage: 331 | // 332 | // 01_cutlass_utilities <M> <N> <K> <alpha> <beta> 333 | // 334 | int main(int argc, const char *arg[]) 335 | { 336 | 337 | // 338 | // This example uses half-precision and is only suitable for devices with compute capability 5.3 or greater. 
339 | // 340 | 341 | cudaDeviceProp prop; 342 | cudaError_t result = cudaGetDeviceProperties(&prop, 0); 343 | 344 | if (result != cudaSuccess) 345 | { 346 | std::cerr << "Failed to query device properties with error " << cudaGetErrorString(result) << std::endl; 347 | return -1; 348 | } 349 | 350 | if (!(prop.major > 5 || (prop.major == 5 && prop.minor >= 3))) 351 | { 352 | std::cerr << "This example uses half precision and is only suitable for devices with compute capability 5.3 or greater.\n"; 353 | std::cerr << "You are using a CUDA device with compute capability " << prop.major << "." << prop.minor << std::endl; 354 | return -1; 355 | } 356 | 357 | // 358 | // Parse the command line to obtain GEMM dimensions and scalar values. 359 | // 360 | 361 | // GEMM problem dimensions: 362 | int problem[3] = {128, 128, 128}; 363 | 364 | for (int i = 1; i < argc && i < 4; ++i) 365 | { 366 | std::stringstream ss(arg[i]); 367 | ss >> problem[i - 1]; 368 | } 369 | 370 | // Linear scale factors in GEMM. Note, these are half-precision values stored as 371 | // cutlass::half_t. 372 | // 373 | // Values outside the range of IEEE FP16 will overflow to infinity or underflow to zero. 374 | // 375 | cutlass::half_t scalars[2] = {1.0_hf, 0.0_hf}; 376 | 377 | for (int i = 4; i < argc && i < 6; ++i) 378 | { 379 | std::stringstream ss(arg[i]); 380 | 381 | ss >> scalars[i - 4]; // lexical cast to cutlass::half_t 382 | } 383 | 384 | // 385 | // Run the CUTLASS GEMM test. 386 | // 387 | 388 | result = TestCutlassGemm( 389 | problem[0], // GEMM M dimension 390 | problem[1], // GEMM N dimension 391 | problem[2], // GEMM K dimension 392 | scalars[0], // alpha 393 | scalars[1] // beta 394 | ); 395 | 396 | if (result == cudaSuccess) 397 | { 398 | std::cout << "Passed." << std::endl; 399 | } 400 | 401 | // Exit. 402 | return result == cudaSuccess ? 
0 : -1; 403 | } 404 | 405 | /////////////////////////////////////////////////////////////////////////////////////////////////// 406 | -------------------------------------------------------------------------------- /example_5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_5/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /*! 
\file 33 | \brief Demonstrate CUTLASS debugging tool for dumping fragments and shared 34 | memory 35 | */ 36 | 37 | /////////////////////////////////////////////////////////////////////////////////////////////////// 38 | 39 | // Standard Library includes 40 | 41 | #include <iostream> 42 | 43 | // 44 | // CUTLASS includes 45 | // 46 | 47 | #include "cutlass/aligned_buffer.h" 48 | #include "cutlass/gemm/gemm.h" 49 | #include "cutlass/layout/matrix.h" 50 | #include "cutlass/matrix_shape.h" 51 | #include "cutlass/numeric_types.h" 52 | 53 | #include "cutlass/core_io.h" 54 | #include "cutlass/util/host_tensor.h" 55 | #include "cutlass/util/tensor_view_io.h" 56 | 57 | #include "cutlass/util/reference/host/gemm.h" 58 | #include "cutlass/util/reference/host/tensor_compare.h" 59 | #include "cutlass/util/reference/host/tensor_fill.h" 60 | 61 | #include "cutlass/transform/pitch_linear_thread_map.h" 62 | #include "cutlass/transform/threadblock/predicated_tile_iterator.h" 63 | #include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h" 64 | 65 | #include "cutlass/util/debug.h" 66 | #include "cutlass/util/device_dump.h" 67 | 68 | #define EXAMPLE_MATRIX_ROW 64 69 | #define EXAMPLE_MATRIX_COL 32 70 | 71 | /////////////////////////////////////////////////////////////////////////////////////////////////// 72 | 73 | template <typename Element, typename GmemIterator, typename SmemIterator> 74 | __global__ void kernel_dump(typename GmemIterator::Params params, 75 | typename GmemIterator::TensorRef ref) 76 | { 77 | extern __shared__ Element shared_storage[]; 78 | 79 | // Construct the global iterator and load the data to the fragments. 80 | int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x; 81 | 82 | GmemIterator gmem_iterator(params, ref.data(), 83 | {EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL}, 84 | tb_thread_id); 85 | 86 | typename GmemIterator::Fragment frag; 87 | 88 | frag.clear(); 89 | gmem_iterator.load(frag); 90 | 91 | // Call dump_fragment() with different parameters.
92 | if (threadIdx.x == 0 && blockIdx.x == 0) 93 | printf("\nAll threads dump all the elements:\n"); 94 | cutlass::debug::dump_fragment(frag); 95 | 96 | if (threadIdx.x == 0 && blockIdx.x == 0) 97 | printf("\nFirst thread dumps all the elements:\n"); 98 | cutlass::debug::dump_fragment(frag, /*N = */ 1); 99 | 100 | if (threadIdx.x == 0 && blockIdx.x == 0) 101 | printf("\nFirst thread dumps first 16 elements:\n"); 102 | cutlass::debug::dump_fragment(frag, /*N = */ 1, /*M = */ 16); 103 | 104 | if (threadIdx.x == 0 && blockIdx.x == 0) 105 | printf("\nFirst thread dumps first 16 elements with a stride of 8:\n"); 106 | cutlass::debug::dump_fragment(frag, /*N = */ 1, /*M = */ 16, /*S = */ 8); 107 | 108 | // Construct the shared iterator and store the data to the shared memory. 109 | SmemIterator smem_iterator( 110 | typename SmemIterator::TensorRef( 111 | {shared_storage, SmemIterator::Layout::packed( 112 | {EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL})}), 113 | tb_thread_id); 114 | 115 | smem_iterator.store(frag); 116 | 117 | // Call dump_shmem() with different parameters. 118 | if (threadIdx.x == 0 && blockIdx.x == 0) 119 | printf("\nDump all the elements:\n"); 120 | cutlass::debug::dump_shmem(shared_storage, 121 | EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL); 122 | 123 | if (threadIdx.x == 0 && blockIdx.x == 0) 124 | printf("\nDump all the elements with a stride of 8:\n"); 125 | cutlass::debug::dump_shmem( 126 | shared_storage, EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL, /*S = */ 8); 127 | } 128 | 129 | /////////////////////////////////////////////////////////////////////////////////////////////////// 130 | 131 | /// Entry point for dump_reg_shmem example. 132 | // 133 | // usage: 134 | // 135 | // 02_dump_reg_shmem 136 | // 137 | int main() 138 | { 139 | // Initialize a 64x32 column major matrix with sequential data (1,2,3...). 
140 | using Element = cutlass::half_t; 141 | using Layout = cutlass::layout::ColumnMajor; 142 | 143 | cutlass::HostTensor<Element, Layout> matrix({EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL}); 144 | cutlass::reference::host::BlockFillSequential(matrix.host_data(), matrix.capacity()); 145 | 146 | // Dump the matrix. 147 | std::cout << "Matrix:\n" 148 | << matrix.host_view() << "\n"; 149 | 150 | // Copy the matrix to the device. 151 | matrix.sync_device(); 152 | 153 | // Define a global iterator, a shared iterator and their thread map. 154 | using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap< 155 | cutlass::layout::PitchLinearShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>, 156 | 32, cutlass::layout::PitchLinearShape<8, 4>, 8>; 157 | 158 | using GmemIterator = cutlass::transform::threadblock::PredicatedTileIterator< 159 | cutlass::MatrixShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>, Element, Layout, 1, ThreadMap>; 160 | 161 | typename GmemIterator::Params params(matrix.layout()); 162 | 163 | using SmemIterator = cutlass::transform::threadblock::RegularTileIterator< 164 | cutlass::MatrixShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>, Element, 165 | cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<16, 64>, 1, 166 | ThreadMap>; 167 | 168 | dim3 grid(1, 1); 169 | dim3 block(32, 1, 1); 170 | 171 | int smem_size = 172 | int(sizeof(Element) * EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL); 173 | 174 | kernel_dump<Element, GmemIterator, SmemIterator> 175 | <<<grid, block, smem_size, 0>>>(params, matrix.device_ref()); 176 | 177 | cudaError_t result = cudaDeviceSynchronize(); 178 | 179 | if (result != cudaSuccess) 180 | { 181 | std::cout << "Failed" << std::endl; 182 | } 183 | 184 | return (result == cudaSuccess ?
0 : -1); 185 | } 186 | 187 | /////////////////////////////////////////////////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /example_6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_library(plugin SHARED register_layout.cu) 25 | target_link_libraries(plugin cudart) 26 | 27 | add_executable(main visualize_layout.cpp) 28 | 29 | target_link_libraries(main plugin) 30 | 31 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 32 | 33 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 34 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 35 | -------------------------------------------------------------------------------- /example_6/options.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | * 30 | **************************************************************************************************/ 31 | 32 | #pragma once 33 | 34 | #include <iostream> 35 | #include <vector> 36 | 37 | // Cutlass command line parser 38 | #include "cutlass/util/command_line.h" 39 | 40 | class Options { 41 | public: 42 | 43 | bool help; 44 | bool good; 45 | std::vector<int> extent; ///< extent of tile to fill 46 | std::vector<int> stride; ///< stride vector for layout function 47 | std::vector<int> output_shape; ///< output shape 48 | int vectorize; ///< sequences of consecutive output elements are concatenated into a vector 49 | /// if, and only if, they were consecutive in source memory 50 | 51 | public: 52 | 53 | /// Options 54 | Options(): 55 | help(false), 56 | good(true), 57 | extent({32, 8}), 58 | stride({32}), 59 | output_shape({16, 8}), 60 | vectorize(1) { 61 | 62 | } 63 | 64 | /// Constructs from command line parser 65 | Options(cutlass::CommandLine const & cmd_line): help(false), good(true) { 66 | 67 | if (cmd_line.check_cmd_line_flag("help") || 68 | cmd_line.check_cmd_line_flag("h")) { 69 | 70 | help = true; 71 | } 72 | 73 | if (cmd_line.check_cmd_line_flag("extent")) { 74 | cmd_line.get_cmd_line_arguments("extent", extent); 75 | } 76 | else { 77 | extent = {32, 8}; 78 | } 79 | 80 | if (cmd_line.check_cmd_line_flag("stride")) { 81 | cmd_line.get_cmd_line_arguments("stride", stride); 82 | } 83 | 84 | int default_output_shape[] = {16, 8}; 85 | 86 | if (cmd_line.check_cmd_line_flag("output-shape")) { 87 | cmd_line.get_cmd_line_arguments("output-shape", output_shape); 88 | } 89 | 90 | for (int i = int(output_shape.size()); i < 2; ++i) { 91 | output_shape.push_back(default_output_shape[i]); 92 | } 93 | 94 | if (cmd_line.check_cmd_line_flag("vectorize")) { 95 | cmd_line.get_cmd_line_argument("vectorize", vectorize); 96 | } 97 | else { 98 | vectorize = 1; 99 | } 100 | 101 | if (output_shape.front() % vectorize) { 102 | 103 | std::cerr << "Error: --vectorize=" << vectorize 104 | << " must divide
contiguous elements in --output-shape=" 105 | << output_shape.at(0) << "," << output_shape.at(1) << std::endl; 106 | 107 | good = false; 108 | } 109 | } 110 | 111 | /// Prints usage statement 112 | static void print_usage(std::ostream &out) { 113 | out 114 | << " Options:\n" 115 | << " --help Displays this help message.\n" 116 | << " --extent=<extent> Specifies the layout-specific extent (as comma-delimited array).\n" 117 | << " --stride=<stride> Specifies the layout-specific stride vector (comma-delimited array)\n" 118 | << " --output-shape=<shape> Specifies the dimensions of a row-major output matrix. \n" 119 | << " --vectorize=<vector length> If possible, vectorizes the output into vectors of consecutive elements\n"; 120 | } 121 | }; 122 | -------------------------------------------------------------------------------- /example_6/register_layout.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission.
18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /*! \file 33 | \brief CUTLASS layout visualization example 34 | */ 35 | 36 | #include <map> 37 | #include <memory> 38 | 39 | #include "cutlass/layout/matrix.h" 40 | #include "cutlass/layout/pitch_linear.h" 41 | #include "cutlass/layout/tensor_op_multiplicand_sm70.h" 42 | #include "cutlass/layout/tensor_op_multiplicand_sm75.h" 43 | #include "cutlass/layout/tensor_op_multiplicand_sm80.h" 44 | 45 | #include "visualize_layout.h" 46 | #include "register_layout.h" 47 | 48 | ///////////////////////////////////////////////////////////////////////////////////////////////// 49 | 50 | void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase>> &layouts) { 51 | 52 | struct { 53 | char const *name; 54 | VisualizeLayoutBase *ptr; 55 | } layout_pairs[] = { 56 | 57 | {"PitchLinear", new VisualizeLayout<cutlass::layout::PitchLinear>}, 58 | {"ColumnMajor", new VisualizeLayout<cutlass::layout::ColumnMajor>}, 59 | {"RowMajor", new VisualizeLayout<cutlass::layout::RowMajor>}, 60 | {"ColumnMajorInterleaved<4>", 61 | new VisualizeLayout<cutlass::layout::ColumnMajorInterleaved<4>>}, 62 | {"RowMajorInterleaved<4>", 63 | new VisualizeLayout<cutlass::layout::RowMajorInterleaved<4>>}, 64 | // All Ampere/Turing H/Integer matrix multiply tensor core kernels use the
same swizzling 65 | // layout implementation with different templates. 66 | // 67 | // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 Interleaved-256 68 | // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 Interleaved-256 69 | {"TensorOpMultiplicand<1,256>", 70 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 256>>}, 71 | // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 TN kblock512 72 | // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock512 73 | {"TensorOpMultiplicand<1,512>", 74 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 512>>}, 75 | // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock1024 76 | {"TensorOpMultiplicand<1,1024>", 77 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 1024>>}, 78 | // Integer matrix multiply.int4 8832 Interleaved-64 79 | // Integer matrix multiply.int4 16864 Interleaved-64 80 | {"TensorOpMultiplicand<4,64>", 81 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 64>>}, 82 | // Integer matrix multiply.int4 8832 TN kblock128 83 | // Integer matrix multiply.int4 16864 TN kblock128 84 | {"TensorOpMultiplicand<4,128>", 85 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 128>>}, 86 | // Integer matrix multiply.int4 16864 TN kblock256 87 | {"TensorOpMultiplicand<4,256>", 88 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 256>>}, 89 | // Integer matrix multiply 8816 Interleaved-32 90 | // Integer matrix multiply 16832 Interleaved-32 91 | {"TensorOpMultiplicand<8,32>", 92 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 32>>}, 93 | // Integer matrix multiply 8816 TN kblock64 94 | // Integer matrix multiply 16832 TN kblock64 95 | {"TensorOpMultiplicand<8,64>", 96 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 64>>}, 97 | // Integer matrix multiply 16832 TN kblock128 98 | {"TensorOpMultiplicand<8,128>", 99 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 128>>}, 100 | // Matrix Multiply 1688 TN kblock32 101 | // Matrix multiply 16816 TN kblock32 102 | {"TensorOpMultiplicand<16,32>", 103 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 32>>}, 104 | // Matrix multiply 1688 NT 105 | // Matrix multiply 16816 NT 106 | // Matrix multiply 16816 TN kblock64 107 | {"TensorOpMultiplicand<16,64>", 108 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 64>>}, 109 | // Matrix multiply 1688.TF32 TN kblock16 110 | {"TensorOpMultiplicand<32,16>", 111 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 16>>}, 112 | // Matrix multiply 1688.TF32 TN
kblock32 113 | {"TensorOpMultiplicand<32,32>", 114 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 32>>}, 115 | // Matrix multiply 1688 NT 116 | {"TensorOpMultiplicandCongruous<32,32>", 117 | new VisualizeLayout< 118 | cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>}, 119 | // Matrix multiply 884 NT 120 | {"TensorOpMultiplicandCongruous<64,16>", 121 | new VisualizeLayout< 122 | cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>}, 123 | // Matrix multiply 884 TN 124 | {"TensorOpMultiplicand64bCrosswise", 125 | new VisualizeLayout<cutlass::layout::TensorOpMultiplicand64bCrosswise>}, 126 | {"TensorOpMultiplicandCongruous<128,4>", 127 | new VisualizeLayout< 128 | cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>}, 129 | {"TensorOpMultiplicandCrosswise<128,4>", 130 | new VisualizeLayout< 131 | cutlass::layout::TensorOpMultiplicandCrosswise<128, 4>>}, 132 | {"VoltaTensorOpMultiplicandCongruous<16>", 133 | new VisualizeLayout< 134 | cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>}, 135 | {"VoltaTensorOpMultiplicandCrosswise<16,32>", 136 | new VisualizeLayout< 137 | cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>} 138 | }; 139 | 140 | for (auto layout : layout_pairs) { 141 | layouts.emplace(std::string(layout.name), std::unique_ptr<VisualizeLayoutBase>(layout.ptr)); 142 | } 143 | } 144 | 145 | ///////////////////////////////////////////////////////////////////////////////////////////////// 146 | -------------------------------------------------------------------------------- /example_6/register_layout.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1.
Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /*! 
\file 33 | \brief CUTLASS layout visualization example 34 | */ 35 | 36 | #pragma once 37 | 38 | #include <map> 39 | #include <memory> 40 | 41 | #include "options.h" 42 | 43 | ///////////////////////////////////////////////////////////////////////////////////////////////// 44 | 45 | struct VisualizeLayoutBase { 46 | virtual bool visualize(Options const &) = 0; 47 | virtual bool verify(bool verbose, std::ostream &out) = 0; 48 | virtual void print_csv(std::ostream &out, char delim = '|', char new_line = '\n') = 0; 49 | virtual std::ostream &print_help(std::ostream &out) { 50 | return out; 51 | } 52 | virtual ~VisualizeLayoutBase() { } 53 | }; 54 | 55 | ///////////////////////////////////////////////////////////////////////////////////////////////// 56 | 57 | void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase>> &layouts); 58 | 59 | ///////////////////////////////////////////////////////////////////////////////////////////////// 60 | -------------------------------------------------------------------------------- /example_6/visualize_layout.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3.
Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /*! 
\file 33 | \brief CUTLASS layout visualization tool 34 | */ 35 | 36 | #include <iostream> 37 | #include <map> 38 | #include <memory> 39 | #include <string> 40 | 41 | #include <cuda_runtime.h> 42 | 43 | #include "options.h" 44 | #include "register_layout.h" 45 | 46 | ///////////////////////////////////////////////////////////////////////////////////////////////// 47 | 48 | std::map<std::string, std::unique_ptr<VisualizeLayoutBase>> layouts; 49 | 50 | ///////////////////////////////////////////////////////////////////////////////////////////////// 51 | 52 | void print_usage(std::ostream &out) { 53 | 54 | out << "03_visualize_layout <layout> [options]" 55 | << "\n\n" 56 | << " Layouts:\n"; 57 | 58 | for (auto const & layout : layouts) { 59 | out << " " << layout.first << std::string(46 - layout.first.size(), ' '); 60 | layout.second->print_help(out); 61 | out << "\n"; 62 | } 63 | 64 | out << "\n"; 65 | 66 | Options::print_usage(out); 67 | 68 | out << "\nExamples:\n\n" 69 | << "$ 03_visualize_layout RowMajor --extent=16,16\n" 70 | << "$ 03_visualize_layout \"ColumnMajorInterleaved<4>\" --extent=32,8 " 71 | "--output-shape=16 --vectorize=4\n" 72 | << "$ 03_visualize_layout \"TensorOpMultiplicand<4,64>\" " 73 | "--extent=64,64 --vectorize=32 --output-shape=256,4\n" 74 | << "$ 03_visualize_layout \"TensorOpMultiplicand<4,128>\" " 75 | "--extent=128,32 --vectorize=32 --output-shape=256,4\n" 76 | << "$ 03_visualize_layout \"TensorOpMultiplicand<4,256>\" " 77 | "--extent=256,16 --vectorize=32 --output-shape=256,4\n" 78 | << "$ 03_visualize_layout \"TensorOpMultiplicand<8,32>\" " 79 | "--extent=32,64 --vectorize=16 --output-shape=128,4\n" 80 | << "$ 03_visualize_layout \"TensorOpMultiplicand<8,64>\" " 81 | "--extent=64,32 --vectorize=16 --output-shape=128,4\n" 82 | << "$ 03_visualize_layout \"TensorOpMultiplicand<8,128>\" " 83 | "--extent=128,16 --vectorize=16 --output-shape=128,4\n" 84 | << "$ 03_visualize_layout \"TensorOpMultiplicand<16,32>\" " 85 | "--extent=32,32 --vectorize=8 --output-shape=64,4\n" 86 | << "$ 03_visualize_layout \"TensorOpMultiplicand<16,64>\" " 87 |
"--extent=64,16 --vectorize=8 --output-shape=64,4\n" 88 | << "$ 03_visualize_layout \"TensorOpMultiplicand<32,16>\" " 89 | "--extent=16,32 --vectorize=4 --output-shape=32,4\n" 90 | << "$ 03_visualize_layout \"TensorOpMultiplicand<32,32>\" " 91 | "--extent=32,16 --vectorize=4 --output-shape=32,4\n" 92 | << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<32,32>\" " 93 | "--extent=32,16 --vectorize=4 --output-shape=32,4\n" 94 | << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<64, 16>\" " 95 | "--extent=16,16 --vectorize=2 --output-shape=16,4\n" 96 | << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" " 97 | "--extent=32,64 --vectorize=4 --output-shape=64,4\n" 98 | << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCongruous<16>\" " 99 | "--extent=64,32 --vectorize=8 --output-shape=64,4\n"; 100 | 101 | out << std::endl; 102 | } 103 | 104 | ///////////////////////////////////////////////////////////////////////////////////////////////// 105 | 106 | /// Entry point 107 | int main(int argc, char const *arg[]) { 108 | 109 | RegisterLayouts(layouts); 110 | 111 | if (argc == 1 || (std::string(arg[1]) == "-h" || std::string(arg[1]) == "--help")) { 112 | print_usage(std::cout); 113 | return 0; 114 | } 115 | 116 | // parse command line, skipping layout name 117 | cutlass::CommandLine cmd_line(argc - 1, arg + 1); 118 | Options options(cmd_line); 119 | 120 | if (options.help) { 121 | print_usage(std::cout); 122 | return 0; 123 | } 124 | 125 | if (!options.good) { 126 | return -1; 127 | } 128 | 129 | std::string layout_name = arg[1]; 130 | 131 | auto layout_it = layouts.find(layout_name); 132 | if (layout_it == layouts.end()) { 133 | std::cerr << "Layout '" << layout_name << "' not supported."
<< std::endl; 134 | return -1; 135 | } 136 | 137 | bool passed = layout_it->second->visualize(options); 138 | if (!passed) { 139 | return -1; 140 | } 141 | 142 | layout_it->second->print_csv(std::cout); 143 | 144 | cudaFree(0); // Ensure CUDA is available. 145 | 146 | return 0; 147 | } 148 | 149 | ///////////////////////////////////////////////////////////////////////////////////////////////// 150 | -------------------------------------------------------------------------------- /example_6/visualize_layout.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /*! \file 33 | \brief CUTLASS layout visualization example 34 | */ 35 | 36 | #pragma once 37 | 38 | #include <algorithm> 39 | #include <stdexcept> 40 | #include <vector> 41 | 42 | #include "cutlass/coord.h" 43 | #include "cutlass/util/reference/host/tensor_foreach.h" 44 | 45 | #include "register_layout.h" 46 | 47 | ///////////////////////////////////////////////////////////////////////////////////////////////// 48 | 49 | /// Permits copying dynamic vectors into static-length vectors 50 | template <typename TensorCoord, int Rank> 51 | struct vector_to_coord { 52 | 53 | vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) { 54 | 55 | coord[Rank - 1] = vec.at(Rank - 1); 56 | 57 | if (Rank > 1) { 58 | vector_to_coord<TensorCoord, Rank - 1>(coord, vec); 59 | } 60 | } 61 | }; 62 | 63 | /// Permits copying dynamic vectors into static-length vectors 64 | template <typename TensorCoord> 65 | struct vector_to_coord<TensorCoord, 1> { 66 | 67 | vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) { 68 | 69 | coord[0] = vec.at(0); 70 | } 71 | }; 72 | 73 | /// Permits copying dynamic vectors into static-length vectors 74 | template <typename TensorCoord> 75 | struct vector_to_coord<TensorCoord, 0> { 76 | 77 | vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) { 78 | 79 | } 80 | }; 81 | 82 | ///////////////////////////////////////////////////////////////////////////////////////////////// 83 | 84 | template <typename T> 85 | std::ostream &operator<<(std::ostream &out,
std::vector const &vec) { 86 | auto it = vec.begin(); 87 | if (it != vec.end()) { 88 | out << *it; 89 | for (++it; it != vec.end(); ++it) { 90 | out << ", " << *it; 91 | } 92 | } 93 | return out; 94 | } 95 | 96 | ///////////////////////////////////////////////////////////////////////////////////////////////// 97 | 98 | /// Permits copying static-length vectors into dynamic vectors 99 | template 100 | struct coord_to_vector { 101 | 102 | coord_to_vector(std::vector &vec, TensorCoord const &coord) { 103 | 104 | vec.at(Rank - 1) = coord[Rank - 1]; 105 | coord_to_vector(vec, coord); 106 | } 107 | }; 108 | 109 | /// Permits copying static-length vectors into dynamic vectors 110 | template 111 | struct coord_to_vector { 112 | 113 | coord_to_vector(std::vector &vec, TensorCoord const &coord) { 114 | 115 | vec.at(0) = coord[0]; 116 | } 117 | }; 118 | 119 | /// Permits copying static-length vectors into dynamic vectors 120 | template 121 | struct coord_to_vector { 122 | 123 | coord_to_vector(std::vector &vec, TensorCoord const &coord) { 124 | } 125 | }; 126 | 127 | ///////////////////////////////////////////////////////////////////////////////////////////////// 128 | 129 | /// Structure representing an element in source memory 130 | struct Element { 131 | 132 | std::vector coord; ///< logical coordinate of element (as vector) 133 | int offset; ///< linear offset from source memory 134 | int color; ///< enables coloring each element to indicate 135 | 136 | /// Default ctor 137 | inline Element(): offset(-1), color(0) { } 138 | 139 | /// Construct from logical coordinate and initial offset 140 | inline Element( 141 | std::vector const &coord_, 142 | int offset_, 143 | int color_ = 0 144 | ): 145 | coord(coord_), offset(offset_), color(color_) { } 146 | 147 | /// Returns true if element is in a defined state 148 | inline bool valid() const { 149 | return offset >= 0; 150 | } 151 | }; 152 | 153 | 
/////////////////////////////////////////////////////////////////////////////////////////////////
154 | 
155 | /// Visualizes memory layouts by constructing a 'shape'
156 | template <typename Layout_>
157 | class VisualizeLayout : public VisualizeLayoutBase {
158 | public:
159 | 
160 |   using Layout = Layout_;
161 |   using TensorCoord = typename Layout::TensorCoord;
162 |   using Stride = typename Layout::Stride;
163 | 
164 | public:
165 | 
166 |   Options options;
167 |   Layout layout;
168 |   TensorCoord extent;
169 |   std::vector<Element> elements;
170 | 
171 | public:
172 | 
173 |   /// Initializes the problem space
174 |   VisualizeLayout() {
175 | 
176 |   }
177 | 
178 |   /// visualization method
179 |   bool visualize(Options const &options_) {
180 | 
181 |     options = options_;
182 | 
183 |     if (options.extent.size() != TensorCoord::kRank) {
184 | 
185 |       std::cerr
186 |         << "--extent must have rank " << TensorCoord::kRank
187 |         << " (given: " << options.extent.size() << ")" << std::endl;
188 | 
189 |       return false;
190 |     }
191 | 
192 |     vector_to_coord<TensorCoord, TensorCoord::kRank>(extent, options.extent);
193 | 
194 |     // Construct the layout for a packed tensor
195 |     if (options.stride.empty()) {
196 | 
197 |       layout = Layout::packed(extent);
198 |     }
199 |     else if (options.stride.size() != Stride::kRank) {
200 | 
201 |       std::cerr
202 |         << "--stride must have rank " << Stride::kRank
203 |         << " (given: " << options.stride.size() << ")" << std::endl;
204 | 
205 |       return false;
206 |     }
207 |     else {
208 |       // Stride from
209 |       Stride stride;
210 |       vector_to_coord<Stride, Stride::kRank>(stride, options.stride);
211 | 
212 |       layout = Layout(stride);
213 |     }
214 | 
215 |     // Resize elements, setting elements to 'undefined' state
216 |     elements.resize(layout.capacity(extent));
217 | 
218 |     // enumerate points in tensor space and assign
219 |     cutlass::reference::host::TensorForEachLambda(
220 |       extent,
221 |       [&](TensorCoord coord) {
222 | 
223 |         std::vector<int> coord_vec(TensorCoord::kRank, 0);
224 |         coord_to_vector<TensorCoord, TensorCoord::kRank>(coord_vec, coord);
225 | 
226 |         int offset = int(layout(coord));
227 | 
228 |         if
(offset >= int(elements.size())) {
229 |           std::cerr
230 |             << "Layout error - " << coord_vec
231 |             << " is out of range (computed offset: " << offset
232 |             << ", capacity: " << elements.size() << ")" << std::endl;
233 | 
234 |           throw std::out_of_range("(TensorForEach) layout error - coordinate out of range");
235 |         }
236 | 
237 |         elements.at(offset) = Element(coord_vec, offset);
238 |       });
239 | 
240 |     return true;
241 |   }
242 | 
243 |   /// Verifies the layout satisfies vectorization requirements
244 |   bool verify(bool verbose, std::ostream &out) {
245 |     return true;
246 |   }
247 | 
248 | private:
249 | 
250 |   /// returns a pair (is_vectorizable, one_changing_rank) to determine if a
251 |   /// vector exists (consecutive logical coordinates or uniformly invalid)
252 |   /// at the given location.
253 |   std::pair<bool, int> _is_vectorizable(int i) const {
254 |     // (all elements are invalid) or
255 |     // (all elements are valid AND
256 |     //  exactly one rank is changing AND
257 |     //  elements are consecutive)
258 | 
259 |     // Don't need vectorization.
260 |     if (options.vectorize <= 2) return std::make_pair(false, -1);
261 | 
262 |     // Boundary check.
263 |     if (i >= int(elements.size()) || (i + options.vectorize) > int(elements.size()))
264 |       return std::make_pair(false, -1);
265 | 
266 |     // Check if either all elements are valid or invalid.
267 |     bool all_elements_invalid = std::all_of(
268 |         elements.begin() + i, elements.begin() + i + options.vectorize,
269 |         [](Element const &e) { return !e.valid(); });
270 | 
271 |     bool all_elements_valid = std::all_of(
272 |         elements.begin() + i, elements.begin() + i + options.vectorize,
273 |         [](Element const &e) { return e.valid(); });
274 | 
275 |     if (!all_elements_invalid && !all_elements_valid)
276 |       return std::make_pair(false, -1);
277 | 
278 |     // From here, it is vectorizable.
279 |     if (all_elements_invalid) return std::make_pair(true, -1);
280 | 
281 |     // Check if only exactly one rank is changing.
282 | int one_changing_rank = -1; 283 | for (int j = 0; j < options.vectorize; ++j) { 284 | for (int r = 0; r < TensorCoord::kRank; ++r) { 285 | if (elements.at(i + j).coord.at(r) != elements.at(i).coord.at(r)) { 286 | if (one_changing_rank == -1) { 287 | one_changing_rank = r; 288 | } else if (one_changing_rank != r) { 289 | return std::make_pair(false, -1); 290 | } 291 | } 292 | } 293 | } 294 | 295 | return std::make_pair(true, one_changing_rank); 296 | } 297 | 298 | /// Prints a vector of elements 299 | void _print_vector(std::ostream &out, int i, int one_changing_rank) { 300 | Element const &base_element = elements.at(i); 301 | if (base_element.valid()) { 302 | out << "("; 303 | for (int r = 0; r < TensorCoord::kRank; ++r) { 304 | if (r) { 305 | out << ", "; 306 | } 307 | 308 | if (r == one_changing_rank) { 309 | out 310 | << base_element.coord.at(r) 311 | << ".." 312 | << (base_element.coord.at(r) + options.vectorize - 1); 313 | } 314 | else { 315 | out << base_element.coord.at(r); 316 | } 317 | } 318 | out << ")"; 319 | } 320 | else { 321 | out << " "; 322 | } 323 | } 324 | 325 | /// Prints a single element 326 | void _print_element(std::ostream &out, int k) { 327 | Element const &element = elements.at(k); 328 | if (element.valid()) { 329 | out << "("; 330 | for (int v = 0; v < TensorCoord::kRank; ++v) { 331 | out << (v ? 
", " : "") << element.coord.at(v); 332 | } 333 | out << ")"; 334 | } 335 | else { 336 | out << " "; 337 | } 338 | } 339 | 340 | public: 341 | 342 | /// Pretty-prints the layout to the console 343 | void print_csv(std::ostream &out, char delim = '|', char new_line = '\n') { 344 | int row = -1; 345 | 346 | for (int i = 0; i < int(elements.size()); i += options.vectorize) { 347 | if (i % options.output_shape.at(0)) { 348 | out << delim; 349 | } 350 | else { 351 | if (row >= 0) { 352 | out << new_line; 353 | } 354 | ++row; 355 | if (row == options.output_shape.at(1)) { 356 | out << new_line; 357 | row = 0; 358 | } 359 | } 360 | 361 | auto is_vector = _is_vectorizable(i); 362 | 363 | if (is_vector.first) { 364 | _print_vector(out, i, is_vector.second); // print a vector starting at element i 365 | } 366 | else { 367 | for (int j = 0; j < options.vectorize; ++j) { // print individual elements [i..i+j) 368 | _print_element(out, i + j); 369 | } 370 | } 371 | } 372 | 373 | out << new_line << std::flush; 374 | } 375 | 376 | /// Help message 377 | virtual std::ostream &print_help(std::ostream &out) { 378 | out << "TensorCoord rank " << TensorCoord::kRank << ", Stride rank: " << Stride::kRank; 379 | return out; 380 | } 381 | }; 382 | 383 | ///////////////////////////////////////////////////////////////////////////////////////////////// 384 | -------------------------------------------------------------------------------- /example_7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 
17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_7/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 |  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 |  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |  *
30 |  **************************************************************************************************/
31 | 
32 | #include <iostream>
33 | #include <vector>
34 | 
35 | #include "cutlass/cutlass.h"
36 | #include "cutlass/layout/matrix.h"
37 | #include "cutlass/gemm/device/gemm_array.h"
38 | #include "cutlass/gemm/device/gemm_batched.h"
39 | 
40 | #pragma warning(disable : 4503)
41 | 
42 | /*
43 | This example demonstrates how to use cutlass to compute a batched strided gemm in two different ways:
44 |   1. By specifying pointers to the first matrices of the batch and the stride between the consecutive
45 |      matrices of the batch (this is called a strided batched gemm).
46 |   2. By copying pointers to all matrices of the batch to the device memory (this is called an array gemm).
47 | In this example, both A and B are non-transposed, column-major matrices, and
48 | batched_C = batched_A x batched_B
49 | As an example, matrix C can be seen as
50 | -----------------------------------------------------------
51 | (0,0,0) | (0,0,1) | (0,0,2) | (1,0,0) | (1,0,1) | (1,0,2) |
52 | -----------------------------------------------------------
53 | (0,1,0) | (0,1,1) | (0,1,2) | (1,1,0) | (1,1,1) | (1,1,2) |
54 | -----------------------------------------------------------
55 | (0,2,0) | (0,2,1) | (0,2,2) | (1,2,0) | (1,2,1) | (1,2,2) |
56 | -----------------------------------------------------------
57 | (0,3,0) | (0,3,1) | (0,3,2) | (1,3,0) | (1,3,1) | (1,3,2) |
58 | -----------------------------------------------------------
59 | (0,4,0) | (0,4,1) | (0,4,2) | (1,4,0) | (1,4,1) | (1,4,2) |
60 | -----------------------------------------------------------
61 | (0,5,0) | (0,5,1) | (0,5,2) | (1,5,0) | (1,5,1) | (1,5,2) |
62 | -----------------------------------------------------------
63 |           batch 0           |          batch 1
64 | where we denote each element with (batch_idx, row_idx, column_idx)
65 | In this example, the batch size is 2, M is 6 and N is 3
66 | The stride (batch_stride_C) between the first element of two batches is ldc * n
67 | 
68 | matrix A can be seen as
69 | ---------------------------------------
70 | (0,0,0) | (0,0,1) | (1,0,0) | (1,0,1) |
71 | ---------------------------------------
72 | (0,1,0) | (0,1,1) | (1,1,0) | (1,1,1) |
73 | ---------------------------------------
74 | (0,2,0) | (0,2,1) | (1,2,0) | (1,2,1) |
75 | ---------------------------------------
76 | (0,3,0) | (0,3,1) | (1,3,0) | (1,3,1) |
77 | ---------------------------------------
78 | (0,4,0) | (0,4,1) | (1,4,0) | (1,4,1) |
79 | ---------------------------------------
80 | (0,5,0) | (0,5,1) | (1,5,0) | (1,5,1) |
81 | ---------------------------------------
82 |       batch 0     |      batch 1
83 | , where the batch size is 2, M is 6 and K is 2
84 | The stride (batch_stride_A) between the first element
of two batches is lda * k 85 | 86 | matrix B can be seen as 87 | ----------------------------- 88 | (0,0,0) | (0,0,1) | (0,0,2) | 89 | ----------------------------- batch 0 90 | (0,1,0) | (0,1,1) | (0,1,2) | 91 | ------------------------------------- 92 | (1,0,0) | (1,0,1) | (1,0,2) | 93 | ----------------------------- batch 1 94 | (1,1,0) | (1,1,1) | (1,1,2) | 95 | ----------------------------- 96 | , where the batch size is 2, N is 3 and K is 2 97 | The stride (batch_stride_B) between the first element of two batches is k 98 | 99 | 100 | */ 101 | 102 | cudaError_t cutlass_array_sgemm( 103 | int m, 104 | int n, 105 | int k, 106 | float alpha, 107 | float const *const *A, 108 | int lda, 109 | float const *const *B, 110 | int ldb, 111 | float *const *C, 112 | int ldc, 113 | float beta, 114 | int batch_count) 115 | { 116 | 117 | using Gemm = cutlass::gemm::device::GemmArray< 118 | float, cutlass::layout::ColumnMajor, 119 | float, cutlass::layout::ColumnMajor, 120 | float, cutlass::layout::ColumnMajor>; 121 | 122 | Gemm gemm_op; 123 | 124 | cutlass::Status status = gemm_op({{m, n, k}, 125 | A, 126 | lda, 127 | B, 128 | ldb, 129 | C, 130 | ldc, 131 | C, 132 | ldc, 133 | {alpha, beta}, 134 | batch_count}); 135 | 136 | if (status != cutlass::Status::kSuccess) 137 | { 138 | return cudaErrorUnknown; 139 | } 140 | 141 | return cudaSuccess; 142 | } 143 | 144 | cudaError_t cutlass_strided_batched_sgemm( 145 | int m, 146 | int n, 147 | int k, 148 | float alpha, 149 | float const *A, 150 | int lda, 151 | long long int batch_stride_A, 152 | float const *B, 153 | int ldb, 154 | long long int batch_stride_B, 155 | float *C, 156 | int ldc, 157 | long long int batch_stride_C, 158 | float beta, 159 | int batch_count) 160 | { 161 | 162 | using Gemm = cutlass::gemm::device::GemmBatched< 163 | float, cutlass::layout::ColumnMajor, 164 | float, cutlass::layout::ColumnMajor, 165 | float, cutlass::layout::ColumnMajor>; 166 | 167 | Gemm gemm_op; 168 | 169 | cutlass::Status status = 
gemm_op({{m, n, k},
170 |                           {A, lda},
171 |                           batch_stride_A,
172 |                           {B, ldb},
173 |                           batch_stride_B,
174 |                           {C, ldc},
175 |                           batch_stride_C,
176 |                           {C, ldc},
177 |                           batch_stride_C,
178 |                           {alpha, beta},
179 |                           batch_count});
180 | 
181 |   if (status != cutlass::Status::kSuccess)
182 |   {
183 |     return cudaErrorUnknown;
184 |   }
185 | 
186 |   return cudaSuccess;
187 | }
188 | 
189 | template <typename T>
190 | cudaError_t strided_batched_gemm_nn_reference(
191 |     int m,
192 |     int n,
193 |     int k,
194 |     T alpha,
195 |     std::vector<T> const &A,
196 |     int lda,
197 |     long long int batch_stride_A,
198 |     std::vector<T> const &B,
199 |     int ldb,
200 |     long long int batch_stride_B,
201 |     std::vector<T> &C,
202 |     int ldc,
203 |     long long int batch_stride_C,
204 |     T beta,
205 |     int batch_count)
206 | {
207 |   /*
208 |   strided batched gemm NN
209 |   */
210 | 
211 |   cudaError_t result = cudaSuccess;
212 | 
213 |   if (A.size() < size_t(lda) * k * batch_count)
214 |   {
215 |     std::cout << "the size of A is too small" << std::endl;
216 |     return cudaErrorInvalidValue;
217 |   }
218 |   if (B.size() < size_t(ldb) * n)
219 |   {
220 |     std::cout << "the size of B is too small" << std::endl;
221 |     return cudaErrorInvalidValue;
222 |   }
223 |   if (C.size() < size_t(ldc) * n * batch_count)
224 |   {
225 |     std::cout << "the size of C is too small" << std::endl;
226 |     return cudaErrorInvalidValue;
227 |   }
228 | 
229 |   for (int batch_idx = 0; batch_idx < batch_count; batch_idx++)
230 |   {
231 |     for (int n_idx = 0; n_idx < n; n_idx++)
232 |     {
233 |       for (int m_idx = 0; m_idx < m; m_idx++)
234 |       {
235 |         T accum = beta * C[batch_idx * batch_stride_C + n_idx * ldc + m_idx];
236 |         for (int k_idx = 0; k_idx < k; k_idx++)
237 |         {
238 |           accum += alpha * A[batch_idx * batch_stride_A + k_idx * lda + m_idx] * B[batch_idx * batch_stride_B + n_idx * ldb + k_idx];
239 |         }
240 |         C[batch_idx * batch_stride_C + n_idx * ldc + m_idx] = accum;
241 |       }
242 |     }
243 |   }
244 | 
245 |   return result;
246 | }
247 | 
248 | cudaError_t run_batched_gemm(bool use_array)
249 | {
250 | 
251 |   const char
*gemm_desc = use_array ? "array" : "strided batched";
252 |   std::cout << "Running " << gemm_desc << " gemm" << std::endl;
253 | 
254 |   // Arbitrary problem size
255 |   int const m = 520;
256 |   int const n = 219;
257 |   int const k = 129;
258 |   int const batch_count = 17;
259 | 
260 |   // A, B are non-transpose, column major
261 |   int const lda = m;
262 |   int const ldb = k * batch_count;
263 |   int const ldc = m;
264 | 
265 |   int const count_A = batch_count * lda * k;
266 |   int const count_B = ldb * n;
267 |   int const count_C = batch_count * ldc * n;
268 | 
269 |   // the memory is batched along K dimension
270 |   long long int batch_stride_A = static_cast<long long int>(lda) * static_cast<long long int>(k);
271 |   long long int batch_stride_B = static_cast<long long int>(k);
272 |   long long int batch_stride_C = static_cast<long long int>(ldc) * static_cast<long long int>(n);
273 | 
274 |   // alpha and beta
275 |   float alpha = 1.0f;
276 |   float beta = 2.0f;
277 | 
278 |   cudaError_t result = cudaSuccess;
279 | 
280 |   // allocate the host memory
281 |   std::vector<float> host_A(count_A);
282 |   std::vector<float> host_B(count_B);
283 |   std::vector<float> host_C(count_C);
284 |   std::vector<float> result_C(count_C);
285 | 
286 |   // allocate the device memory
287 |   float *A;
288 |   float *B;
289 |   float *C;
290 | 
291 |   result = cudaMalloc(&A, count_A * sizeof(float));
292 |   if (result != cudaSuccess)
293 |   {
294 |     std::cerr << "cudaMalloc result = " << result << std::endl;
295 |     return result;
296 |   }
297 |   result = cudaMalloc(&B, count_B * sizeof(float));
298 |   if (result != cudaSuccess)
299 |   {
300 |     std::cerr << "cudaMalloc result = " << result << std::endl;
301 |     return result;
302 |   }
303 |   result = cudaMalloc(&C, count_C * sizeof(float));
304 |   if (result != cudaSuccess)
305 |   {
306 |     std::cerr << "cudaMalloc result = " << result << std::endl;
307 |     return result;
308 |   }
309 | 
310 |   // Limit range to avoid floating-point errors
311 |   int const kRange = 8;
312 | 
313 |   // fill A
314 |   for (int b_idx = 0; b_idx < batch_count; b_idx++)
315 |   {
316 |     for (int col_idx = 0; col_idx < k;
col_idx++)
317 |     {
318 |       for (int row_idx = 0; row_idx < m; row_idx++)
319 |       {
320 |         host_A[row_idx + col_idx * lda + b_idx * lda * k] = static_cast<float>((row_idx + col_idx * lda + b_idx * lda * k) % kRange);
321 |       }
322 |     }
323 |   }
324 |   // fill B
325 |   for (int b_idx = 0; b_idx < batch_count; b_idx++)
326 |   {
327 |     for (int col_idx = 0; col_idx < n; col_idx++)
328 |     {
329 |       for (int row_idx = 0; row_idx < k; row_idx++)
330 |       {
331 |         host_B[row_idx + col_idx * ldb + b_idx * k] = static_cast<float>(((n + k * ldb + batch_count * k) - (row_idx + col_idx * ldb + b_idx * k)) % kRange);
332 |       }
333 |     }
334 |   }
335 |   // fill C
336 |   for (int b_idx = 0; b_idx < batch_count; b_idx++)
337 |   {
338 |     for (int col_idx = 0; col_idx < n; col_idx++)
339 |     {
340 |       for (int row_idx = 0; row_idx < m; row_idx++)
341 |       {
342 |         host_C[row_idx + col_idx * ldc + b_idx * ldc * n] = 1.f;
343 |       }
344 |     }
345 |   }
346 | 
347 |   // ref memory
348 |   std::vector<float> ref_A(host_A);
349 |   std::vector<float> ref_B(host_B);
350 |   std::vector<float> ref_C(host_C);
351 |   // copy host memory to device
352 |   result = cudaMemcpy(A, host_A.data(), count_A * sizeof(float), cudaMemcpyHostToDevice);
353 |   if (result != cudaSuccess)
354 |   {
355 |     std::cerr << "cudaMemcpy result = " << result << std::endl;
356 |     return result;
357 |   }
358 |   result = cudaMemcpy(B, host_B.data(), count_B * sizeof(float), cudaMemcpyHostToDevice);
359 |   if (result != cudaSuccess)
360 |   {
361 |     std::cerr << "cudaMemcpy result = " << result << std::endl;
362 |     return result;
363 |   }
364 |   result = cudaMemcpy(C, host_C.data(), count_C * sizeof(float), cudaMemcpyHostToDevice);
365 |   if (result != cudaSuccess)
366 |   {
367 |     std::cerr << "cudaMemcpy result = " << result << std::endl;
368 |     return result;
369 |   }
370 | 
371 |   // run cutlass
372 |   if (use_array)
373 |   {
374 |     // allocate the host memory for the pointers to the matrices of the batch
375 |     std::vector<float const *> host_ptr_A(batch_count);
376 |     std::vector<float const *> host_ptr_B(batch_count);
377 |     std::vector<float *>
host_ptr_C(batch_count);
378 | 
379 |     // permute the batch elements to emphasize that GemmArray does not depend on matrices being separated by a fixed stride
380 |     std::vector<int> permutation = {14, 11, 3, 10, 1, 13, 9, 4, 6, 16, 8, 15, 7, 12, 0, 2, 5};
381 |     for (int b_idx = 0; b_idx < batch_count; b_idx++)
382 |     {
383 |       host_ptr_A[b_idx] = A + permutation[b_idx] * batch_stride_A;
384 |       host_ptr_B[b_idx] = B + permutation[b_idx] * batch_stride_B;
385 |       host_ptr_C[b_idx] = C + permutation[b_idx] * batch_stride_C;
386 |     }
387 | 
388 |     // allocate the corresponding device memory
389 |     float const **ptr_A;
390 |     float const **ptr_B;
391 |     float **ptr_C;
392 | 
393 |     result = cudaMalloc(&ptr_A, batch_count * sizeof(float *));
394 |     if (result != cudaSuccess)
395 |     {
396 |       std::cerr << "cudaMalloc result = " << result << std::endl;
397 |       return result;
398 |     }
399 |     result = cudaMalloc(&ptr_B, batch_count * sizeof(float *));
400 |     if (result != cudaSuccess)
401 |     {
402 |       std::cerr << "cudaMalloc result = " << result << std::endl;
403 |       return result;
404 |     }
405 |     result = cudaMalloc(&ptr_C, batch_count * sizeof(float *));
406 |     if (result != cudaSuccess)
407 |     {
408 |       std::cerr << "cudaMalloc result = " << result << std::endl;
409 |       return result;
410 |     }
411 | 
412 |     // copy the matrix pointers to the device
413 |     result = cudaMemcpy(ptr_A, host_ptr_A.data(), batch_count * sizeof(float *), cudaMemcpyHostToDevice);
414 |     if (result != cudaSuccess)
415 |     {
416 |       std::cerr << "cudaMemcpy result = " << result << std::endl;
417 |       return result;
418 |     }
419 |     result = cudaMemcpy(ptr_B, host_ptr_B.data(), batch_count * sizeof(float *), cudaMemcpyHostToDevice);
420 |     if (result != cudaSuccess)
421 |     {
422 |       std::cerr << "cudaMemcpy result = " << result << std::endl;
423 |       return result;
424 |     }
425 |     result = cudaMemcpy(ptr_C, host_ptr_C.data(), batch_count * sizeof(float *), cudaMemcpyHostToDevice);
426 |     if (result != cudaSuccess)
427 |     {
428 |       std::cerr << "cudaMemcpy result = 
" << result << std::endl; 429 | return result; 430 | } 431 | 432 | result = cutlass_array_sgemm(m, n, k, alpha, ptr_A, lda, ptr_B, ldb, ptr_C, ldc, beta, batch_count); 433 | 434 | if (result != cudaSuccess) 435 | return result; 436 | } 437 | else 438 | { 439 | result = cutlass_strided_batched_sgemm( 440 | m, n, k, alpha, A, lda, batch_stride_A, B, ldb, batch_stride_B, C, ldc, batch_stride_C, 441 | beta, batch_count); 442 | if (result != cudaSuccess) 443 | return result; 444 | } 445 | 446 | // copy device memory to host 447 | result = cudaMemcpy(result_C.data(), C, count_C * sizeof(float), cudaMemcpyDeviceToHost); 448 | if (result != cudaSuccess) 449 | { 450 | std::cerr << "cudaMemcpy result = " << result << std::endl; 451 | return result; 452 | } 453 | 454 | // compare with reference code 455 | result = strided_batched_gemm_nn_reference(m, n, k, alpha, ref_A, lda, batch_stride_A, ref_B, ldb, batch_stride_B, ref_C, ldc, batch_stride_C, 456 | beta, batch_count); 457 | if (result != 0) 458 | return result; 459 | 460 | // Expect bit-level accuracy for this simple example 461 | if (ref_C != result_C) 462 | { 463 | std::cout << "CUTLASS " << gemm_desc << " gemm does not run correctly" << std::endl; 464 | return cudaErrorUnknown; 465 | } 466 | 467 | // free memory 468 | result = cudaFree(A); 469 | if (result != cudaSuccess) 470 | { 471 | std::cerr << "cudaFree result = " << result << std::endl; 472 | return result; 473 | } 474 | result = cudaFree(B); 475 | if (result != cudaSuccess) 476 | { 477 | std::cerr << "cudaFree result = " << result << std::endl; 478 | return result; 479 | } 480 | result = cudaFree(C); 481 | if (result != cudaSuccess) 482 | { 483 | std::cerr << "cudaFree result = " << result << std::endl; 484 | return result; 485 | } 486 | 487 | return result; 488 | } 489 | 490 | int main() 491 | { 492 | 493 | cudaError_t result = cudaSuccess; 494 | for (bool use_array : {false, true}) 495 | { 496 | result = run_batched_gemm(use_array); 497 | if (result == 
cudaSuccess) 498 | { 499 | std::cout << "Passed." << std::endl; 500 | } 501 | else 502 | { 503 | break; 504 | } 505 | } 506 | 507 | // Exit. 508 | return result == cudaSuccess ? 0 : -1; 509 | } -------------------------------------------------------------------------------- /example_8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | -------------------------------------------------------------------------------- /example_8/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above copyright notice, this
9 |  *    list of conditions and the following disclaimer.
10 |  * 
11 |  * 2. Redistributions in binary form must reproduce the above copyright notice,
12 |  *    this list of conditions and the following disclaimer in the documentation
13 |  *    and/or other materials provided with the distribution.
14 |  * 
15 |  * 3. Neither the name of the copyright holder nor the names of its
16 |  *    contributors may be used to endorse or promote products derived from
17 |  *    this software without specific prior written permission.
18 |  * 
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 |  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 |  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |  *
30 |  **************************************************************************************************/
31 | 
32 | /**
33 | This example shows how to run matrix multiplication kernels using functions and data structures
34 | provided by CUTLASS using tensor cores, which we run on an NVIDIA Turing GPU.
35 | 
36 | Writing a single high-performance matrix multiplication kernel is hard but doable. Writing
37 | high-performance kernels at scale that work for multiple problem sizes with good abstractions is
CUTLASS solves this problem by providing simplified abstractions to compose 39 | multiple sections of gemm kernel. When used properly, the kernels can hit peak performance of GPU 40 | easily. 41 | 42 | CUTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp 43 | and thread-block level, they compute on their own tile-size with higher level of tile sizes being 44 | composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used 45 | to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute 46 | threadblock-tile (tile size computed by a threadblock). 47 | 48 | In thie example, we split variable initialization into 49 | 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel 50 | can view them (logical to physical mapping) 51 | 2. Setting up computation properties : describes how the above set matrices will be used to compute 52 | output of matrix multiplication. 53 | 54 | First, we setup the data types of matrices A, B, C and D along with alpha, beta as the equation for 55 | GEMM is D = alpha * A * B + beta * C. In CUTLASS, the kernels first compute A * B and leaves the 56 | rest of the computation to end of the kernel as alpha * X + beta * C is a simple element-wise 57 | operation on X (A * B) and C. We call this as epilogue of kernel. Hence, we setup data types for 58 | alpha and beta to be equal to ElementComputeEpilogue = int32_t. As we want to use MMA instructions 59 | on Turing and they support 8-bit signed integer (int8_t), we use data type for elements in input 60 | matrix A and B as int8_t. Volta also supports accumulation of partial dot product to int32_t, which 61 | can store wider range of numbers, we use it as data type of output matrix elements and accumulation. 
62 | We convey this to the CUTLASS kernel by initializing the template variables ElementAccumulator (int32_t), 63 | ElementComputeEpilogue (int32_t), ElementInputA (int8_t), ElementInputB (int8_t), ElementOutput 64 | (int32_t). Communicating just the data type is not enough. As the data is laid out linearly in 65 | memory, we also have to convey the layout of the matrices. We do that by initializing the template variable 66 | LayoutInputA to the cutlass row-major layout, LayoutInputB to column-major and LayoutOutput to row 67 | major. Next, we set up the rules to compute alpha * X + beta * C, which is called the epilogue of the kernel. 68 | We initialize the template variable EpilogueOp, which takes the data type of the output ElementOutput 69 | (int32_t), the number of elements per vector memory access (128 / 32 = 4 for int32_t), the data type of the accumulator (int32_t) 70 | and the data type of the computation of the linear combination (alpha * X + beta * C). 71 | 72 | Now that we have set up the properties of the data, we have to set up the properties of the computation. 73 | 74 | Second, we create template variables of tile sizes for the thread-block, warp and mma-op of 128x256x64, 75 | 64x64x64, 8x8x16 (MxNxK) respectively. When these are passed to instantiate the CUTLASS GEMM kernel, it internally 76 | deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in a 77 | bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a 78 | high performance GEMM kernel. This is the beauty of CUTLASS; it relieves the developer from 79 | understanding and coding complicated hardware optimizations which can easily go wrong. 80 | 81 | CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines 82 | constitute the whole process of loading input data from global memory to shared memory, loading data 83 | from shared memory to registers, doing the matrix multiplication, and storing to global memory. The below flow 84 | sequence shows a typical mma pipeline.
85 | 86 | matrix in global memory -> registers -> tile in shared memory -> registers -> mma -> registers -> 87 | output to global memory 88 | 89 | The problem with a single pipeline is that each stage is synchronous, which means each stage has to wait 90 | until the previous one has finished executing. There are stages in the pipeline which do not have a fixed 91 | latency, for example, the loads from global memory and shared memory. Therefore, we can add one more 92 | pipeline with a phase shift in the mma kernel to hide the latency of the global and shared memory loads. 93 | Finally, the pipeline in a kernel looks like 94 | 95 | (1) matrix in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5) 96 | mma -> (6) registers -> (7) output to global memory (1) -> (2) -> (3) matrix in global 97 | memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers -> 98 | (9) output to global memory 99 | 100 | This way, you can hide the second global memory load latency by doing computation on already loaded 101 | input data. 102 | 103 | A few more template variables are initialized, such as the mapping that decides which threadblock tile of the output matrix 104 | is computed by which threadblock launched on an SM, and the CUDA SM architecture of the GPU you want to run on. 105 | 106 | These are all put together to create a template variable which describes the CUTLASS GEMM kernel, using the 107 | cutlass::gemm::device::Gemm template. 108 | 109 | The next step is to initialize the physical data, instantiate and initialize the CUTLASS kernel, and run it. 110 | We use the CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get 111 | in the way of learning CUTLASS. 112 | 113 | Once all the matrices are initialized and filled with data, we create an arguments tuple to launch the CUTLASS 114 | kernel, which takes the problem size (M = 5120, N = 4096 and K = 4096), the matrices, alpha, beta and the 115 | important one, the split k-dimension factor.
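The split k-dimension factor mentioned above partitions the K dimension so that several slices each accumulate a partial result into scratch space, and the partials are reduced afterwards. A minimal scalar sketch of the idea (an illustrative helper, not the CUTLASS split-K kernel):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Split-K sketch: partition the K dimension into `slices`, accumulate a
// partial dot product per slice into a per-slice workspace, then reduce.
int32_t dot_split_k(const std::vector<int8_t>& a,
                    const std::vector<int8_t>& b,
                    int slices) {
    int K = int(a.size());
    std::vector<int32_t> partials(slices, 0);  // scratch space, one partial per slice
    for (int s = 0; s < slices; ++s) {
        int begin = s * K / slices;
        int end = (s + 1) * K / slices;
        for (int k = begin; k < end; ++k)
            partials[s] += int32_t(a[k]) * int32_t(b[k]);
    }
    int32_t acc = 0;  // final reduction over the k-slices
    for (int32_t p : partials) acc += p;
    return acc;
}
```

The per-slice workspace is why a split factor greater than 1 may require the extra scratch memory that is queried and allocated below.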
Along with that, we query CUTLASS whether any scratch-space 116 | memory is required by the kernel we instantiated. If so, we create it and pass it along with the other 117 | arguments to initialize the CUTLASS kernel; then the kernel is launched. 118 | 119 | In this example, we later on launch a reference gemm kernel (from the CUTLASS utilities) to 120 | check whether the output from the CUTLASS kernel is the same as that of the reference GEMM kernel. 121 | */ 122 | 123 | #include <iostream> 124 | 125 | #include "cutlass/cutlass.h" 126 | #include "cutlass/gemm/device/gemm.h" 127 | #include "cutlass/util/host_tensor.h" 128 | #include "cutlass/util/reference/device/gemm.h" 129 | #include "cutlass/util/reference/host/tensor_compare.h" 130 | #include "cutlass/util/reference/host/tensor_copy.h" 131 | #include "cutlass/util/reference/host/tensor_fill.h" 132 | #include "cutlass/util/tensor_view_io.h" 133 | #include "helper.h" 134 | 135 | // The code section below describes datatype for input, output matrices and computation between 136 | // elements in input matrices. 137 | using ElementAccumulator = int32_t; // <- data type of accumulator 138 | using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations 139 | using ElementInputA = int8_t; // <- data type of elements in input matrix A 140 | using ElementInputB = int8_t; // <- data type of elements in input matrix B 141 | using ElementOutput = int32_t; // <- data type of elements in output matrix D 142 | 143 | // The code section below describes matrix layout of input and output matrices.
Row Major for 144 | // Matrix A, Column Major for Matrix B and Row Major for Matrix C 145 | using LayoutInputA = cutlass::layout::RowMajor; 146 | using LayoutInputB = cutlass::layout::ColumnMajor; 147 | using LayoutOutput = cutlass::layout::RowMajor; 148 | 149 | // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM 150 | using MMAOp = cutlass::arch::OpClassTensorOp; 151 | 152 | // This code section describes CUDA SM architecture number 153 | using SmArch = cutlass::arch::Sm75; 154 | 155 | // This code section describes the tile size a thread block will compute 156 | using ShapeMMAThreadBlock = 157 | cutlass::gemm::GemmShape<128, 256, 64>; // <- threadblock tile M = 128, N = 256, K = 64 158 | // This code section describes tile size a warp will compute 159 | using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 64 160 | // This code section describes the size of MMA op 161 | using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 16>; // <- MMA Op tile M = 8, N = 8, K = 16 162 | 163 | // This code section describes how threadblocks are scheduled on GPU 164 | using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- identity swizzle: maps each threadblock to one output tile 165 | 166 | // This code section describes the epilogue part of the kernel 167 | using EpilogueOp = cutlass::epilogue::thread::LinearCombination< 168 | ElementOutput, // <- data type of output matrix 169 | 128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized 170 | // memory access. For int32_t output, this is 171 | // 128 / 32 = 4 elements.
This becomes the vector width of 172 | // math instructions in the epilogue too 173 | ElementAccumulator, // <- data type of accumulator 174 | ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function 175 | 176 | // Number of pipelines you want to use 177 | constexpr int NumStages = 2; 178 | 179 | using Gemm = cutlass::gemm::device::Gemm<ElementInputA, 180 | LayoutInputA, 181 | ElementInputB, 182 | LayoutInputB, 183 | ElementOutput, 184 | LayoutOutput, 185 | ElementAccumulator, 186 | MMAOp, 187 | SmArch, 188 | ShapeMMAThreadBlock, 189 | ShapeMMAWarp, 190 | ShapeMMAOp, 191 | EpilogueOp, 192 | SwizzleThreadBlock, 193 | NumStages>; 194 | 195 | int run() 196 | { 197 | 198 | const int length_m = 5120; 199 | const int length_n = 4096; 200 | const int length_k = 4096; 201 | 202 | // Create a tuple of problem size for matrix multiplication 203 | cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); 204 | 205 | // Initialize tensors using CUTLASS helper functions 206 | cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a( 207 | problem_size.mk()); // <- Create matrix A with dimensions M x K 208 | cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b( 209 | problem_size.kn()); // <- Create matrix B with dimensions K x N 210 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c( 211 | problem_size.mn()); // <- Create matrix C with dimensions M x N 212 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d( 213 | problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from 214 | // CUTLASS kernel 215 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d( 216 | problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from 217 | // reference kernel 218 | 219 | // Fill input and output matrices on host using CUTLASS helper functions 220 | cutlass::reference::host::TensorFillRandomUniform( 221 | tensor_a.host_view(), 222 | 1, 223 | ElementInputA(4), 224 | ElementInputA(-4), 225 | 0); // <- Fill matrix A on host with uniform-distribution random data 226 | cutlass::reference::host::TensorFillRandomUniform( 227 | tensor_b.host_view(), 228 | 1, 229 | ElementInputB(4), 230 | ElementInputB(-4), 231 | 0); // <- Fill matrix B on host with uniform-distribution random data 232 | cutlass::reference::host::TensorFillRandomUniform( 233 | tensor_c.host_view(), 234 | 1, 235 | ElementOutput(4), 236 |
ElementOutput(-4), 237 | 0); // <- Fill matrix C on host with uniform-distribution random data 238 | cutlass::reference::host::TensorFill( 239 | tensor_d.host_view()); // <- fill matrix D on host with zeros 240 | cutlass::reference::host::TensorFill( 241 | tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros 242 | 243 | // Copy data from host to GPU 244 | tensor_a.sync_device(); 245 | tensor_b.sync_device(); 246 | tensor_c.sync_device(); 247 | tensor_d.sync_device(); 248 | tensor_ref_d.sync_device(); 249 | 250 | // Initialize alpha and beta for dot product computation 251 | ElementComputeEpilogue alpha = ElementComputeEpilogue(1); 252 | ElementComputeEpilogue beta = ElementComputeEpilogue(0); 253 | 254 | // Split K dimension into 1 partitions 255 | int split_k_slices = 1; 256 | 257 | // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch 258 | // instantiated CUTLASS kernel 259 | typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication 260 | tensor_a.device_ref(), // <- reference to matrix A on device 261 | tensor_b.device_ref(), // <- reference to matrix B on device 262 | tensor_c.device_ref(), // <- reference to matrix C on device 263 | tensor_d.device_ref(), // <- reference to matrix D on device 264 | {alpha, beta}, // <- tuple of alpha and beta 265 | split_k_slices}; // <- k-dimension split factor 266 | 267 | // Using the arguments, query for extra workspace required for matrix multiplication computation 268 | size_t workspace_size = Gemm::get_workspace_size(arguments); 269 | 270 | // Allocate workspace memory 271 | cutlass::device_memory::allocation workspace(workspace_size); 272 | 273 | // Instantiate CUTLASS kernel depending on templates 274 | Gemm gemm_op; 275 | 276 | // Check the problem size is supported or not 277 | cutlass::Status status = gemm_op.can_implement(arguments); 278 | CUTLASS_CHECK(status); 279 | 280 | // Initialize CUTLASS kernel with arguments 
and workspace pointer 281 | status = gemm_op.initialize(arguments, workspace.get()); 282 | CUTLASS_CHECK(status); 283 | 284 | // Launch initialized CUTLASS kernel 285 | status = gemm_op(); 286 | CUTLASS_CHECK(status); 287 | 288 | // Create instantiation for device reference gemm kernel 289 | cutlass::reference::device::Gemm 297 | gemm_device; 298 | 299 | // Launch device reference gemm kernel 300 | gemm_device(problem_size, 301 | alpha, 302 | tensor_a.device_ref(), 303 | tensor_b.device_ref(), 304 | beta, 305 | tensor_c.device_ref(), 306 | tensor_ref_d.device_ref()); 307 | 308 | // Wait for kernels to finish 309 | cudaDeviceSynchronize(); 310 | 311 | // Copy output data from CUTLASS and reference kernel to host for comparison 312 | tensor_d.sync_host(); 313 | tensor_ref_d.sync_host(); 314 | 315 | // Check if output from CUTLASS kernel and reference kernel are equal or not 316 | bool passed = cutlass::reference::host::TensorEquals( 317 | tensor_d.host_view(), 318 | tensor_ref_d.host_view()); 319 | 320 | std::cout << (passed ? "Passed" : "Failed") << std::endl; 321 | 322 | return (passed ? 0 : -1); 323 | } 324 | 325 | int main() 326 | { 327 | bool notSupported = false; 328 | 329 | // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available 330 | // in CUDA 10.2. 331 | // 332 | // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. 333 | if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) 334 | { 335 | std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." 
<< std::endl; 336 | notSupported = true; 337 | } 338 | 339 | cudaDeviceProp props; 340 | 341 | cudaError_t error = cudaGetDeviceProperties(&props, 0); 342 | if (error != cudaSuccess) 343 | { 344 | std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; 345 | return -1; 346 | } 347 | 348 | if (!((props.major * 10 + props.minor) >= 75)) 349 | { 350 | std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75." 351 | << std::endl; 352 | 353 | notSupported = true; 354 | } 355 | 356 | if (notSupported) 357 | { 358 | // Returning zero so this test passes on older Toolkits. Its actions are no-op. 359 | return 0; 360 | } 361 | 362 | return run(); 363 | } -------------------------------------------------------------------------------- /example_9/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/yhpark/workspace/cutlass/include) 21 | include_directories(/home/yhpark/workspace/cutlass/tools/util/include) 22 | include_directories(/home/yhpark/workspace/cutlass/examples/common) 23 | 24 | add_executable(main main.cu) 25 | 26 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 27 | 28 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 29 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 30 | 
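The architecture gate in `main()` above boils down to one comparison on the device's compute capability, and the `CUDA_ARCHITECTURES "86"` setting in the CMakeLists satisfies the same `>= 75` (Turing) requirement. As a tiny sketch of that check:

```cpp
#include <cassert>

// Reduce a device's (major, minor) compute capability to the single number
// used by the runtime gate in main(), and require at least 75 (Turing).
bool turing_tensor_ops_supported(int major, int minor) {
    return (major * 10 + minor) >= 75;
}
```

So an SM 8.6 (Ampere) GPU, as targeted by the CMakeLists, passes the Turing check, while SM 7.0 (Volta) does not.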
-------------------------------------------------------------------------------- /example_9/main.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the copyright holder nor the names of its 16 | * contributors may be used to endorse or promote products derived from 17 | * this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | * 30 | **************************************************************************************************/ 31 | 32 | /** 33 | 34 | 35 | This example shows how to run convolution kernels using functions and data structures 36 | provided by CUTLASS using tensor cores, which we run on an NVIDIA Turing GPU. 37 | 38 | Writing a single high performance convolution kernel is hard but doable, whereas writing 39 | high performance kernels at scale that work for multiple problem sizes with good abstractions is 40 | really hard. CUTLASS solves this problem by providing simplified abstractions to compose 41 | multiple sections of an implicit gemm kernel. When used properly, the kernels can hit peak performance 42 | of the GPU easily. 43 | 44 | CUTLASS divides a kernel into hierarchical composable sections. This means that at each thread, warp 45 | and thread-block level, they compute on their own tile-size, with higher-level tile sizes being 46 | composed from lower-level ones. Multiple thread-tiles (the tile size each thread computes) can be used 47 | to form warp-tiles (the tile size each warp computes), and multiple warp tiles can be used to compute a 48 | threadblock-tile (the tile size computed by a threadblock). 49 | 50 | In this example, we split variable initialization into 51 | 1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel 52 | can view them (logical to physical mapping) 53 | 2.
Setting up computation properties : describes how the above set tensors will be used to compute the 54 | output of the convolution. 55 | 56 | First, we set up the data types of the input tensor A, the weights' tensor B and the output tensor C along 57 | with alpha and beta, as the equation for convolution is C = alpha * Conv(A, B) + beta * C. In CUTLASS, 58 | the kernels first compute Conv(A, B) and leave the rest of the computation to the end of the kernel, as 59 | alpha * X + beta * C is a simple element-wise operation on X (Conv(A, B)) and C. We call this the 60 | epilogue of the kernel. Hence, we set up the data types for alpha and beta to be equal to 61 | ElementComputeEpilogue = float. We want to use MMA instructions on Turing, and they support 4-bit 62 | signed integers. But a 4-bit integer type is not fully supported by the Nvidia software stack, so CUTLASS introduces 63 | cutlass::int4b_t. We use cutlass::int4b_t as the data type for elements in input tensors A and B. We 64 | convey this to the CUTLASS kernel by initializing the template variables ElementAccumulator (int32_t), 65 | ElementComputeEpilogue (float), ElementInputA (cutlass::int4b_t), ElementInputB (cutlass::int4b_t), 66 | ElementOutput (cutlass::int4b_t). Communicating just the data type is not enough. As the data is laid out 67 | linearly in memory, we have to convey the layout of the tensors. We do that by initializing the template 68 | variables LayoutInputA, LayoutInputB and LayoutOutput to the TensorNHWC cutlass variable. Next, we set up the 69 | rules to compute alpha * X + beta * C, which is called the epilogue of the kernel. We initialize the template 70 | variable EpilogueOp, which takes the data type of the output ElementOutput (cutlass::int4b_t), the number of 71 | elements per vector memory access (8), the data type of the accumulator (int32_t) and the data type of the 72 | computation of the linear combination (alpha * X + beta * C). 73 | 74 | Now that we have set up the properties of the data, we have to set up the properties of the computation.
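The clamping epilogue used in this example scales the int32_t accumulator by float alpha/beta and then saturates into the range a 4-bit signed output can represent, [-8, 7]. A minimal host-side sketch of that saturating behavior (the helper name and the exact rounding mode are illustrative assumptions, not the CUTLASS implementation):

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Epilogue sketch: compute alpha * accumulator + beta * source in float,
// round, and saturate into the signed 4-bit range [-8, 7].
int8_t epilogue_clamp_s4(int32_t acc, int32_t src, float alpha, float beta) {
    float x = alpha * float(acc) + beta * float(src);          // linear combination
    long r = std::lround(x);                                   // round to nearest
    return int8_t(std::max<long>(-8, std::min<long>(7, r)));   // saturate to int4 range
}
```

Without this clamp, int32_t accumulators would silently wrap when narrowed to 4 bits; saturation keeps out-of-range results pinned at the extremes instead.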
75 | 76 | Second, we create template variables of tile sizes for the thread-block, warp and mma-op of 128x128x128, 77 | 64x64x128, 8x8x32 (MxNxK) respectively. When these are passed to instantiate the CUTLASS Implicit GEMM kernel, it 78 | internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store 79 | data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and 80 | launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS; it relieves the developer 81 | from understanding and coding complicated hardware optimizations which can easily go wrong. 82 | 83 | CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines 84 | constitute the whole process of loading input data from global memory to shared memory, loading data 85 | from shared memory to registers, doing the matrix multiplication, and storing to global memory. The below flow 86 | sequence shows a typical mma pipeline. 87 | 88 | tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers -> 89 | output to global memory 90 | 91 | The problem with a single pipeline is that each stage is synchronous, which means each stage has to wait 92 | until the previous one has finished executing. There are stages in the pipeline which do not have a fixed 93 | latency, for example, the loads from global memory and shared memory. Therefore, we can add one more 94 | pipeline with a phase shift in the mma kernel to hide the latency of the global and shared memory loads.
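The phase-shifted second pipeline described above is, in effect, double buffering (NumStages = 2 below). A toy host-side model of the structure, purely illustrative; the real kernel overlaps the stages asynchronously on the GPU rather than interleaving them sequentially:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Toy model of a two-stage pipeline: while the "mma" consumes the tile in one
// buffer, the next tile is already staged into the other buffer, so the load
// of tile i+1 is issued before compute on tile i finishes.
int64_t pipelined_sum(const std::vector<int>& data, int tile) {
    int stages = (int(data.size()) + tile - 1) / tile;
    if (stages == 0) return 0;
    std::vector<int> buf[2];                 // the two "shared memory" buffers
    auto load = [&](int s, int which) {      // stand-in for global -> shared copy
        size_t lo = size_t(s) * tile;
        size_t hi = std::min(data.size(), lo + tile);
        buf[which].assign(data.begin() + lo, data.begin() + hi);
    };
    int64_t acc = 0;
    load(0, 0);                              // prologue: fill the first buffer
    for (int s = 0; s < stages; ++s) {
        if (s + 1 < stages) load(s + 1, (s + 1) % 2);  // prefetch next tile
        for (int v : buf[s % 2]) acc += v;             // "mma" on current tile
    }
    return acc;
}
```

The key point carried over from the prose: the prefetch of stage s+1 targets the buffer not currently being consumed, which is what allows the two stages to overlap.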
95 | Finally, the pipeline in a kernel looks like 96 | 97 | (1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5) 98 | mma -> (6) registers -> (7) output to global memory (1) -> (2) -> (3) tensor in global 99 | memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers -> 100 | (9) output to global memory 101 | 102 | This way, you can hide the second global memory load latency by doing computation on already loaded 103 | input data. 104 | 105 | A few more template variables are initialized, such as the mapping that decides which threadblock tile of the output matrix 106 | is computed by which threadblock launched on an SM, and the CUDA SM architecture of the GPU you want to run on. 107 | 108 | These are all put together to create a template variable which describes the CUTLASS Implicit GEMM 109 | kernel, using the cutlass::conv::device::ImplicitGemm template. 110 | 111 | The next step is to initialize the physical data, instantiate and initialize the CUTLASS kernel, and run it. 112 | We use the CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and don't get 113 | in the way of learning CUTLASS. 114 | 115 | Once all the tensors are initialized and filled with data, we create an arguments tuple to launch the CUTLASS 116 | kernel, which takes the problem size (N = 1, H = 64, W = 64, C = 128), the filter size (K = 64, 117 | R = 3, S = 3, C = 128), padding, strides, dilation, the tensors, alpha, beta and the 118 | important one, the split k-dimension factor. Along with that, we query CUTLASS whether any scratch-space 119 | memory is required by the kernel we instantiated. If so, we create it and pass it along with the other 120 | arguments to initialize the CUTLASS kernel; then the kernel is launched. 121 | 122 | In this example, we later on launch a reference convolution kernel (from the CUTLASS utilities) to 123 | check whether the output from the CUTLASS kernel is the same as that of the reference implicit GEMM kernel.
124 | */ 125 | 126 | #include 127 | #include 128 | #include 129 | 130 | #include "cutlass/cutlass.h" 131 | #include "cutlass/gemm/device/gemm.h" 132 | #include "cutlass/conv/kernel/default_conv2d_fprop.h" 133 | #include "cutlass/conv/device/implicit_gemm_convolution.h" 134 | 135 | #include "cutlass/util/command_line.h" 136 | #include "cutlass/util/host_tensor.h" 137 | #include "cutlass/util/tensor_view_io.h" 138 | #include "cutlass/util/reference/device/gemm.h" 139 | #include "cutlass/util/reference/host/tensor_compare.h" 140 | #include "cutlass/util/reference/host/tensor_copy.h" 141 | #include "cutlass/util/reference/host/tensor_fill.h" 142 | #include "cutlass/util/reference/host/convolution.h" 143 | #include "cutlass/util/tensor_view_io.h" 144 | 145 | #include "helper.h" 146 | 147 | // The code section below describes datatype for input, output tensors and computation between 148 | // elements 149 | using ElementAccumulator = int32_t; // Data type of accumulator 150 | using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) 151 | using ElementInputA = cutlass::int4b_t; // Data type of elements in input tensor 152 | using ElementInputB = cutlass::int4b_t; // Data type of elements in input tensor 153 | using ElementOutput = cutlass::int4b_t; // Data type of elements in output tensor 154 | 155 | using LayoutInputA = cutlass::layout::TensorNHWC; 156 | using LayoutInputB = cutlass::layout::TensorNHWC; 157 | using LayoutOutput = cutlass::layout::TensorNHWC; 158 | 159 | // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM 160 | using MMAOp = cutlass::arch::OpClassTensorOp; 161 | 162 | // This code section describes CUDA SM architecture number 163 | using SmArch = cutlass::arch::Sm75; 164 | 165 | // This code section describes the tile size a thread block will compute 166 | using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>; // Threadblock tile shape 167 | 168 | // This code 
section describes tile size a warp will compute 169 | using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; // Warp tile shape 170 | 171 | // This code section describes the size of MMA op 172 | using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; // TensorCore instruction shape 173 | 174 | // This code section describes how threadblocks are scheduled on GPU 175 | using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; 176 | 177 | // Number of pipelines you want to use 178 | constexpr int NumStages = 2; 179 | 180 | // This code section describes the epilogue part of the kernel, we use default value 181 | using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp< 182 | ElementOutput, // Data type of output matrix. 183 | 8, // The number of elements per vectorized. 184 | // memory access. This becomes the vector width of 185 | // math instructions in the epilogue too. 186 | ElementAccumulator, // Data type of accumulator 187 | ElementComputeEpilogue>; // Data type for alpha/beta in linear combination 188 | 189 | using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< 190 | ElementInputA, LayoutInputA, 191 | ElementInputB, LayoutInputB, 192 | ElementOutput, LayoutOutput, 193 | ElementAccumulator, 194 | MMAOp, 195 | SmArch, 196 | ThreadblockShape, 197 | WarpShape, 198 | InstructionShape, 199 | EpilogueOp, 200 | SwizzleThreadBlock, 201 | NumStages, 202 | cutlass::arch::OpMultiplyAddSaturate, 203 | cutlass::conv::IteratorAlgorithm::kAnalytic>::Kernel; 204 | 205 | using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; 206 | 207 | ///////////////////////////////////////////////////////////////////////////////////////////////// 208 | 209 | // Command line options parsing 210 | struct Options 211 | { 212 | 213 | bool help; 214 | cutlass::Tensor4DCoord input_size; 215 | cutlass::Tensor4DCoord filter_size; 216 | cutlass::Tensor4DCoord padding; 217 | cutlass::MatrixCoord conv_stride; 218 
| cutlass::MatrixCoord dilation; 219 | bool reference_check; 220 | bool measure_performance; 221 | int iterations; 222 | bool save_workspace; 223 | ElementComputeEpilogue alpha; 224 | ElementComputeEpilogue beta; 225 | bool benchmark; 226 | std::string tag; 227 | 228 | Options() : help(false), 229 | input_size(1, 32, 32, 32), 230 | filter_size(32, 3, 3, 32), 231 | padding(1, 1, 1, 1), 232 | conv_stride(1, 1), 233 | dilation(1, 1), 234 | reference_check(false), 235 | measure_performance(true), 236 | iterations(20), 237 | save_workspace(false), 238 | alpha(1), 239 | beta(0), 240 | benchmark(false) {} 241 | 242 | // Verify the problem size is compatible with the CUTLASS Convolution implementation. 243 | bool valid() 244 | { 245 | 246 | // 247 | // CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently, 248 | // all pointers, strides, and tensor extents must be divisible by 32 elements. 249 | // 250 | int const kAlignment = 32; 251 | 252 | if ((input_size.c() % kAlignment) || 253 | (filter_size.n() % kAlignment)) 254 | { 255 | 256 | // misaligned tensors 257 | return false; 258 | } 259 | 260 | // Invalid padding 261 | if ((padding.h() != filter_size.h() / 2) || 262 | (padding.w() != filter_size.w() / 2)) 263 | { 264 | 265 | return false; 266 | } 267 | 268 | return true; 269 | } 270 | 271 | /// Updates input and filter sizes 272 | void update( 273 | cutlass::Tensor4DCoord input_size, 274 | cutlass::Tensor4DCoord filter_size) 275 | { 276 | 277 | this->input_size = input_size; 278 | this->filter_size = filter_size; 279 | 280 | padding.n() = filter_size.h() / 2; 281 | padding.h() = filter_size.h() / 2; 282 | padding.w() = filter_size.w() / 2; 283 | padding.c() = filter_size.w() / 2; 284 | } 285 | 286 | // Parses the command line 287 | void parse(int argc, char const **args) 288 | { 289 | cutlass::CommandLine cmd(argc, args); 290 | 291 | if (cmd.check_cmd_line_flag("help")) 292 | { 293 | help = true; 294 | } 295 | 296 | if 
(cmd.check_cmd_line_flag("ref-check")) 297 | { 298 | reference_check = true; 299 | } 300 | 301 | if (cmd.check_cmd_line_flag("perf-check")) 302 | { 303 | measure_performance = true; 304 | } 305 | 306 | if (cmd.check_cmd_line_flag("save-workspace")) 307 | { 308 | save_workspace = true; 309 | } 310 | 311 | if (cmd.check_cmd_line_flag("benchmark")) 312 | { 313 | benchmark = true; 314 | } 315 | 316 | cmd.get_cmd_line_argument("n", input_size.n()); 317 | cmd.get_cmd_line_argument("h", input_size.h()); 318 | cmd.get_cmd_line_argument("w", input_size.w()); 319 | cmd.get_cmd_line_argument("c", input_size.c()); 320 | 321 | cmd.get_cmd_line_argument("k", filter_size.n()); 322 | cmd.get_cmd_line_argument("r", filter_size.h()); 323 | cmd.get_cmd_line_argument("s", filter_size.w()); 324 | filter_size.c() = input_size.c(); 325 | 326 | cmd.get_cmd_line_argument("alpha", alpha); 327 | cmd.get_cmd_line_argument("beta", beta); 328 | 329 | cmd.get_cmd_line_argument("iterations", iterations); 330 | cmd.get_cmd_line_argument("tag", tag); 331 | 332 | if (filter_size.h() == 3 && filter_size.w() == 3) 333 | { 334 | padding = {1, 1, 1, 1}; 335 | } 336 | else 337 | { 338 | filter_size.h() = 1; 339 | filter_size.w() = 1; 340 | padding = {0, 0, 0, 0}; 341 | } 342 | } 343 | 344 | /// Prints the usage statement. 
345 | std::ostream &print_usage(std::ostream &out) const 346 | { 347 | 348 | out << "09_turing_tensorop_conv2dfprop example\n\n" 349 | << " This example uses Turing's Tensor Core operators on int4 data types to compute\n" 350 | << " forward convolution on tensors of layout NHWC.\n\n" 351 | << "Options:\n\n" 352 | << " --help If specified, displays this usage statement.\n\n" 353 | << " --n= Input tensor extent N\n" 354 | << " --h= Input tensor extent H\n" 355 | << " --w= Input tensor extent W\n" 356 | << " --c= Input tensor extent C\n" 357 | << " --k= Filter extent K\n" 358 | << " --r= Filter extent R\n" 359 | << " --s= Filter extent S\n\n" 360 | << " --alpha= Epilogue scalar alpha\n" 361 | << " --beta= Epilogue scalar beta\n\n" 362 | << " --ref-check If set (true), reference check on the host is computed\n" 363 | << " --perf-check If set (true), performance is measured.\n" 364 | << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" 365 | << " --iterations= Number of profiling iterations to perform.\n" 366 | << " --save-workspace If set, workspace is written to a text file.\n" 367 | << " --tag= String to replicate across the first column in the results table\n"; 368 | 369 | out << "\n\nExamples:\n\n" 370 | << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" 371 | << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; 372 | 373 | return out; 374 | } 375 | 376 | /// Computes the output tensor size (NPQK) 377 | cutlass::Tensor4DCoord output_size() const 378 | { 379 | return cutlass::Tensor4DCoord( 380 | input_size.n(), 381 | (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, 382 | (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, 383 | filter_size.n()); 384 | } 385 | 386 | /// 
Compute performance in GFLOP/s 387 | double gflops(double runtime_s) const 388 | { 389 | 390 | // Number of multiply-adds = NPQK * CRS 391 | int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); 392 | 393 | // Two flops per multiply-add 394 | return 2.0 * double(fmas) / double(1.0e9) / runtime_s; 395 | } 396 | }; 397 | 398 | ///////////////////////////////////////////////////////////////////////////////////////////////// 399 | 400 | struct Result 401 | { 402 | double runtime_ms; 403 | double gflops; 404 | cutlass::Status status; 405 | cutlass::Status reference_check; 406 | cudaError_t error; 407 | 408 | Result() : runtime_ms(0), 409 | gflops(0), 410 | status(cutlass::Status::kSuccess), 411 | reference_check(cutlass::Status::kInvalid), 412 | error(cudaSuccess) {} 413 | 414 | static std::ostream &print_header(std::ostream &out, Options const &options) 415 | { 416 | 417 | if (!options.tag.empty()) 418 | { 419 | out << "Name,"; 420 | } 421 | 422 | out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; 423 | 424 | return out; 425 | } 426 | 427 | std::ostream &print(std::ostream &out, int idx, Options const &options) 428 | { 429 | 430 | if (!options.tag.empty()) 431 | { 432 | out << options.tag << ","; 433 | } 434 | 435 | out 436 | << "conv_" << idx << "," 437 | << options.input_size.n() << "," 438 | << options.input_size.h() << "," 439 | << options.input_size.w() << "," 440 | << options.input_size.c() << "," 441 | << options.filter_size.n() << "," 442 | << options.filter_size.h() << "," 443 | << options.filter_size.w() << "," 444 | << runtime_ms << "," 445 | << gflops; 446 | 447 | return out; 448 | } 449 | }; 450 | 451 | ///////////////////////////////////////////////////////////////////////////////////////////////// 452 | 453 | /// Runs one benchmark 454 | Result profile_convolution(Options const &options) 455 | { 456 | 457 | Result result; 458 | 459 | // 460 | // Allocate host-device tensors using the CUTLASS Utilities. 
461 | // 462 | 463 | cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size); 464 | cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size); 465 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size()); 466 | cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size()); 467 | 468 | // 469 | // Initialize tensors 470 | // 471 | 472 | // Fill tensor A on host with uniform-distribution random data 473 | cutlass::reference::host::TensorFillRandomUniform( 474 | tensor_a.host_view(), 475 | 1, 476 | ElementInputA(7), 477 | ElementInputA(-8), 478 | 0); 479 | 480 | // Fill tensor B on host with uniform-distribution random data 481 | cutlass::reference::host::TensorFillRandomUniform( 482 | tensor_b.host_view(), 483 | 1, 484 | ElementInputB(7), 485 | ElementInputB(-8), 486 | 0); 487 | 488 | // Fill tensor C on host with zeros 489 | cutlass::reference::host::TensorFill( 490 | tensor_c.host_view()); 491 | 492 | // Fill tensor C for reference on host with zeros 493 | cutlass::reference::host::TensorFill( 494 | tensor_ref_c.host_view()); 495 | 496 | // Copy data from host to GPU 497 | tensor_a.sync_device(); 498 | tensor_b.sync_device(); 499 | tensor_c.sync_device(); 500 | tensor_ref_c.sync_device(); 501 | 502 | // 503 | // Define arguments for CUTLASS Convolution 504 | // 505 | 506 | // mode (kCrossCorrelation or kConvolution) 507 | cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; 508 | 509 | // Split K dimension into 1 partition 510 | int split_k_slices = 1; 511 | 512 | // Construct Conv2dProblemSize with user defined output size 513 | cutlass::conv::Conv2dProblemSize problem_size( 514 | options.input_size, 515 | options.filter_size, 516 | options.padding, 517 | options.conv_stride, 518 | options.dilation, 519 | options.output_size(), 520 | mode, 521 | split_k_slices); 522 | 523 | // Construct ImplicitGemm::Argument structure with conv2d 524 | // problem size, data pointers, and epilogue values 525 | typename ImplicitGemm::Arguments arguments{ 526 | problem_size, 527 |
tensor_a.device_ref(), 528 | tensor_b.device_ref(), 529 | tensor_c.device_ref(), 530 | tensor_c.device_ref(), 531 | {options.alpha, options.beta}, 532 | }; 533 | 534 | // 535 | // Initialize CUTLASS Convolution 536 | // 537 | 538 | ImplicitGemm implicit_gemm_op; 539 | 540 | size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); 541 | 542 | // Allocate workspace memory 543 | cutlass::device_memory::allocation<uint8_t> workspace(workspace_size); 544 | 545 | result.status = implicit_gemm_op.can_implement(arguments); 546 | CUTLASS_CHECK(result.status); 547 | 548 | result.status = implicit_gemm_op.initialize(arguments, workspace.get()); 549 | CUTLASS_CHECK(result.status); 550 | 551 | // 552 | // Launch initialized CUTLASS kernel 553 | // 554 | result.status = implicit_gemm_op(); 555 | 556 | CUTLASS_CHECK(result.status); 557 | 558 | // 559 | // Optional reference check 560 | // 561 | 562 | if (options.reference_check) 563 | { 564 | std::cout << "Verification on host...\n"; 565 | 566 | // Compute with reference implementation 567 | cutlass::reference::host::Conv2dFprop< 568 | ElementInputA, 569 | LayoutInputA, 570 | ElementInputB, 571 | LayoutInputB, 572 | ElementOutput, 573 | LayoutOutput, 574 | ElementComputeEpilogue, 575 | ElementAccumulator, 576 | ElementOutput, 577 | cutlass::NumericConverterClamp<ElementOutput, ElementComputeEpilogue>>( 578 | problem_size, 579 | tensor_a.host_ref(), 580 | tensor_b.host_ref(), 581 | tensor_c.host_ref(), 582 | tensor_ref_c.host_ref(), 583 | options.alpha, 584 | options.beta); 585 | 586 | // Check if output from CUTLASS kernel and reference kernel are equal or not 587 | tensor_c.sync_host(); 588 | 589 | bool passed = cutlass::reference::host::TensorEquals( 590 | tensor_c.host_view(), 591 | tensor_ref_c.host_view()); 592 | 593 | if (!passed) 594 | { 595 | result.reference_check = cutlass::Status::kErrorInternal; 596 | std::cout << "ERROR - results miscompared.\n"; 597 | } 598 | else 599 | { 600 | result.reference_check = cutlass::Status::kSuccess; 601 | std::cout <<
"Passed.\n"; 602 | } 603 | } 604 | else 605 | { 606 | result.reference_check = cutlass::Status::kInvalid; 607 | } 608 | 609 | if (options.save_workspace) 610 | { 611 | 612 | std::stringstream ss; 613 | 614 | ss << "09_tensor_conv_workspace_conv2dfprop_" 615 | << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 616 | << "_" 617 | << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() 618 | << ".dat"; 619 | 620 | std::ofstream output_workspace(ss.str()); 621 | 622 | output_workspace 623 | << "Input = \n" 624 | << tensor_a.host_view() << "\n\n" 625 | << "Filters = \n" 626 | << tensor_b.host_view() << "\n\n"; 627 | 628 | if (options.reference_check) 629 | { 630 | output_workspace << "Reference = \n" 631 | << tensor_ref_c.host_view() << "\n\n"; 632 | } 633 | 634 | output_workspace << "Computed = \n" 635 | << tensor_c.host_view() << std::endl; 636 | 637 | std::cout << "Results written to '" << ss.str() << "'." << std::endl; 638 | } 639 | 640 | // 641 | // Performance measurement 642 | // 643 | 644 | if (options.measure_performance) 645 | { 646 | 647 | cudaEvent_t events[2]; 648 | 649 | for (auto &event : events) 650 | { 651 | result.error = cudaEventCreate(&event); 652 | if (result.error != cudaSuccess) 653 | { 654 | std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; 655 | return result; 656 | } 657 | } 658 | 659 | // Record an event at the start of a series of convolution operations. 
660 | result.error = cudaEventRecord(events[0]); 661 | if (result.error != cudaSuccess) 662 | { 663 | std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; 664 | return result; 665 | } 666 | 667 | // Launch a sequence of implicit GEMM operations on the device 668 | for (int iteration = 0; iteration < options.iterations; ++iteration) 669 | { 670 | result.status = implicit_gemm_op(); 671 | CUTLASS_CHECK(result.status); 672 | } 673 | 674 | // Record an event when the convolutions have been launched. 675 | result.error = cudaEventRecord(events[1]); 676 | if (result.error != cudaSuccess) 677 | { 678 | std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; 679 | return result; 680 | } 681 | 682 | // Wait for work on the device to complete. 683 | result.error = cudaEventSynchronize(events[1]); 684 | if (result.error != cudaSuccess) 685 | { 686 | std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; 687 | return result; 688 | } 689 | 690 | // Measure elapsed runtime 691 | float runtime_ms = 0; 692 | result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); 693 | if (result.error != cudaSuccess) 694 | { 695 | std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(result.error) << std::endl; 696 | return result; 697 | } 698 | 699 | // Print average runtime and GFLOPs. 700 | result.runtime_ms = double(runtime_ms) / double(options.iterations); 701 | result.gflops = options.gflops(result.runtime_ms / 1000.0); 702 | 703 | // Cleanup 704 | for (auto event : events) 705 | { 706 | (void)cudaEventDestroy(event); 707 | } 708 | } 709 | 710 | return result; 711 | } 712 | 713 | ///////////////////////////////////////////////////////////////////////////////////////////////// 714 | 715 | int main(int argc, char const **args) 716 | { 717 | 718 | // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
719 | // 720 | // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. 721 | if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) 722 | { 723 | std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; 724 | return 0; 725 | } 726 | 727 | cudaDeviceProp props; 728 | CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); 729 | 730 | if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) 731 | { 732 | std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." 733 | << std::endl; 734 | return 0; 735 | } 736 | 737 | Options options; 738 | 739 | options.parse(argc, args); 740 | 741 | if (options.help) 742 | { 743 | options.print_usage(std::cout) << std::endl; 744 | return 0; 745 | } 746 | 747 | // options.benchmark = true; 748 | 749 | if (options.benchmark) 750 | { 751 | // Benchmark several layers 752 | 753 | int batch_sizes[] = {1, 32, 64, 128, 256, 512}; 754 | 755 | struct Benchmark 756 | { 757 | int h, w, c, k, r, s; 758 | } layers[] = { 759 | {56, 56, 64, 256, 1, 1}, 760 | {56, 56, 64, 64, 1, 1}, 761 | {56, 56, 64, 64, 3, 3}, 762 | {56, 56, 256, 64, 1, 1}, 763 | {56, 56, 256, 512, 1, 1}, 764 | {56, 56, 256, 128, 1, 1}, 765 | {28, 28, 128, 128, 3, 3}, 766 | {28, 28, 128, 512, 1, 1}, 767 | {28, 28, 512, 128, 1, 1}, 768 | {28, 28, 512, 1024, 1, 1}, 769 | {28, 28, 512, 256, 1, 1}, 770 | {14, 14, 256, 256, 3, 3}, 771 | {14, 14, 256, 1024, 1, 1}, 772 | {14, 14, 1024, 256, 1, 1}, 773 | {14, 14, 1024, 2048, 1, 1}, 774 | {14, 14, 1024, 512, 1, 1}, 775 | {7, 7, 512, 512, 3, 3}, 776 | }; 777 | 778 | Result::print_header(std::cout, options) << std::endl; 779 | 780 | int idx = 1; 781 | 782 | for (auto const &layer : layers) 783 | { 784 | for (auto N : batch_sizes) 785 | { 786 | 787 | options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c}); 788 | 789 | Result result = profile_convolution(options); 790 | 
result.print(std::cout, idx, options) << std::endl; 791 | } 792 | 793 | ++idx; 794 | } 795 | } 796 | else 797 | { 798 | 799 | // Execute one problem size 800 | if (!options.valid()) 801 | { 802 | std::cerr << "Invalid problem." << std::endl; 803 | return -1; 804 | } 805 | 806 | Result result = profile_convolution(options); 807 | 808 | Result::print_header(std::cout, options) << std::endl; 809 | result.print(std::cout, 1, options) << std::endl; 810 | } 811 | 812 | return 0; 813 | } 814 | 815 | ///////////////////////////////////////////////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /layout_0/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) 2 | project(main LANGUAGES CXX CUDA) 3 | 4 | # 5 | # CUTLASS 3.x requires C++17 6 | # 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 13 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | 19 | # cutlass 20 | include_directories(/home/pyh/playground/cutlass/include) 21 | 22 | add_executable(main main.cu) 23 | 24 | set_property(TARGET main PROPERTY CUDA_ARCHITECTURES "86") 25 | 26 | message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") 27 | message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") 28 | -------------------------------------------------------------------------------- /layout_0/README.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \gdef \@abspage@last{1} 3 | -------------------------------------------------------------------------------- /layout_0/README.log: 
-------------------------------------------------------------------------------- 1 | This is pdfTeX, Version 3.141592653-2.6-1.40.22 (TeX Live 2022/dev/Debian) (preloaded format=pdflatex 2024.11.10) 10 NOV 2024 20:55 [... ~430 lines of pdfTeX package-loading and font log omitted ...] 433 | Output written on README.pdf (1 page, 16110 bytes). -------------------------------------------------------------------------------- /layout_0/README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yester31/Cutlass_EX/ae173cd69f984b441508ae4d3024adea7d1bdb79/layout_0/README.pdf -------------------------------------------------------------------------------- /layout_0/README.tex: -------------------------------------------------------------------------------- 1 | % Layout: (4,(2,2)):(2,(1,8)) 2 | \documentclass[convert]{standalone} 3 | \usepackage{tikz} 4 | 5 | \begin{document} 6 | \begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}] 7 | 8 | \node[fill=black!00] at (0,0) {0}; 9 | \node[fill=black!40] at (0,1) {1}; 10 | \node[fill=black!00] at (0,2) {8}; 11 | \node[fill=black!40] at (0,3) {9}; 12 | \node[fill=black!20] at (1,0) {2}; 13 | \node[fill=black!60] at (1,1) {3}; 14 | \node[fill=black!20] at (1,2) {10}; 15 | \node[fill=black!60] at (1,3) {11}; 16 | \node[fill=black!10] at (2,0) {4}; 17 | \node[fill=black!50] at (2,1) {5}; 18 | \node[fill=black!10] at (2,2) {12}; 19 | \node[fill=black!50] at (2,3) {13}; 20 | \node[fill=black!30] at (3,0) {6}; 21 | \node[fill=black!70] at (3,1) {7}; 22 | \node[fill=black!30] at (3,2) {14}; 23 | \node[fill=black!70] at (3,3) {15}; 24 | \draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (4,4); 25 | 26 | \node at (0,-1) {\Large{\texttt{0}}}; 27 | \node at (1,-1) {\Large{\texttt{1}}}; 28 | \node at (2,-1) {\Large{\texttt{2}}}; 29 | \node at (3,-1) {\Large{\texttt{3}}}; 30 | \node at (-1,0) {\Large{\texttt{0}}}; 31 | \node at (-1,1) {\Large{\texttt{1}}}; 32 | \node at (-1,2) {\Large{\texttt{2}}}; 33 | \node at (-1,3) {\Large{\texttt{3}}}; 34 | \end{tikzpicture} 35 | \end{document} 36 | --------------------------------------------------------------------------------
/layout_0/main.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <iostream> 3 | #include <cute/tensor.hpp> 4 | 5 | using namespace cute; 6 | 7 | template <typename Layout> 8 | void print2D(Layout const &layout) 9 | { 10 | for (int m = 0; m < size<0>(layout); ++m) 11 | { 12 | for (int n = 0; n < size<1>(layout); ++n) 13 | { 14 | printf("%3d ", layout(m, n)); 15 | } 16 | printf("\n"); 17 | } 18 | printf("======================\n"); 19 | } 20 | 21 | template <typename Layout> 22 | void print1D(Layout const &layout) 23 | { 24 | for (int i = 0; i < size(layout); ++i) 25 | { 26 | printf("%3d ", layout(i)); 27 | } 28 | printf("\n======================\n"); 29 | } 30 | 31 | int main() 32 | { 33 | Layout s8 = make_layout(Int<8>{}); 34 | print1D(s8); 35 | // 0 1 2 3 4 5 6 7 36 | 37 | Layout d8 = make_layout(8); 38 | print1D(d8); 39 | // 0 1 2 3 4 5 6 7 40 | 41 | Layout s2xs4 = make_layout(make_shape(Int<2>{}, Int<4>{})); 42 | print1D(s2xs4); 43 | // 0 1 2 3 4 5 6 7 44 | print2D(s2xs4); 45 | // 0 2 4 6 46 | // 1 3 5 7 47 | print_layout(s2xs4); 48 | // (_2,_4):(_1,_2) 49 | // 0 1 2 3 50 | // +---+---+---+---+ 51 | // 0 | 0 | 2 | 4 | 6 | 52 | // +---+---+---+---+ 53 | // 1 | 1 | 3 | 5 | 7 | 54 | // +---+---+---+---+ 55 | 56 | Layout s2xd4 = make_layout(make_shape(Int<2>{}, 4)); 57 | std::cout << "-> s2xd4" << std::endl; 58 | print_layout(s2xd4); 59 | // (_2,4):(_1,_2) 60 | // 0 1 2 3 61 | // +---+---+---+---+ 62 | // 0 | 0 | 2 | 4 | 6 | 63 | // +---+---+---+---+ 64 | // 1 | 1 | 3 | 5 | 7 | 65 | // +---+---+---+---+ 66 | 67 | Layout s2xd4_col = make_layout(make_shape(Int<2>{}, 4), LayoutLeft{}); 68 | std::cout << "-> s2xd4_col" << std::endl; 69 | print_layout(s2xd4_col); 70 | // (_2,4):(_1,_2) 71 | // 0 1 2 3 72 | // +---+---+---+---+ 73 | // 0 | 0 | 2 | 4 | 6 | 74 | // +---+---+---+---+ 75 | // 1 | 1 | 3 | 5 | 7 | 76 | // +---+---+---+---+ 77 | 78 | Layout s2xd4_row = make_layout(make_shape(Int<2>{}, 4), LayoutRight{}); 79 | std::cout << "-> s2xd4_row" << std::endl; 80 |
print_layout(s2xd4_row); 81 | // (_2,4):(4,_1) 82 | // 0 1 2 3 83 | // +---+---+---+---+ 84 | // 0 | 0 | 1 | 2 | 3 | 85 | // +---+---+---+---+ 86 | // 1 | 4 | 5 | 6 | 7 | 87 | // +---+---+---+---+ 88 | 89 | Layout s2xd4_a = make_layout(make_shape(Int<2>{}, 4), make_stride(Int<12>{}, Int<1>{})); 90 | std::cout << "-> s2xd4_a" << std::endl; 91 | print1D(s2xd4_a); 92 | // 0 12 1 13 2 14 3 15 93 | print_layout(s2xd4_a); 94 | // (_2,4):(_12,_1) 95 | // 0 1 2 3 96 | // +----+----+----+----+ 97 | // 0 | 0 | 1 | 2 | 3 | 98 | // +----+----+----+----+ 99 | // 1 | 12 | 13 | 14 | 15 | 100 | // +----+----+----+----+ 101 | 102 | Layout s2xh4 = make_layout(make_shape(2, make_shape(2, 2)), make_stride(4, make_stride(2, 1))); 103 | std::cout << "-> s2xh4" << std::endl; 104 | print1D(s2xh4); 105 | // 0 4 2 6 1 5 3 7 106 | print_layout(s2xh4); 107 | // (2,(2,2)):(4,(2,1)) 108 | // 0 1 2 3 109 | // +---+---+---+---+ 110 | // 0 | 0 | 2 | 1 | 3 | 111 | // +---+---+---+---+ 112 | // 1 | 4 | 6 | 5 | 7 | 113 | // +---+---+---+---+ 114 | 115 | Layout s2xh4_col = make_layout(shape(s2xh4), LayoutLeft{}); 116 | std::cout << "-> s2xh4_col" << std::endl; 117 | print1D(s2xh4_col); 118 | // 0 1 2 3 4 5 6 7 119 | print_layout(s2xh4_col); 120 | 121 | Layout tt = make_layout(make_shape(4, 2), make_stride(2, 3)); 122 | print1D(tt); 123 | // 0 2 4 6 3 5 7 9 124 | print2D(tt); 125 | // 0 3 126 | // 2 5 127 | // 4 7 128 | // 6 9 129 | print_layout(tt); 130 | // (4,2):(2,3) 131 | // 0 1 132 | // +----+----+ 133 | // 0 | 0 | 3 | 134 | // +----+----+ 135 | // 1 | 2 | 5 | 136 | // +----+----+ 137 | // 2 | 4 | 7 | 138 | // +----+----+ 139 | // 3 | 6 | 9 | 140 | // +----+----+ 141 | 142 | Layout tt2 = make_layout(make_shape(4, make_shape(2, 2)), make_stride(2, make_stride(1, 8))); 143 | print1D(tt2); 144 | // 0 2 4 6 1 3 5 7 8 10 12 14 9 11 13 15 145 | print2D(tt2); 146 | // 0 1 8 9 147 | // 2 3 10 11 148 | // 4 5 12 13 149 | // 6 7 14 15 150 | print_layout(tt2); 151 | // (4,(2,2)):(2,(1,8)) <== (i,(j,k)) 152 
| // 0 1 2 3 <== 1-D col coord 153 | // (0,0) (1,0) (0,1) (1,1) <== 2-D col coord (j,k) 154 | // +----+----+----+----+ 155 | // 0 | 0 | 1 | 8 | 9 | 156 | // +----+----+----+----+ 157 | // 1 | 2 | 3 | 10 | 11 | 158 | // +----+----+----+----+ 159 | // 2 | 4 | 5 | 12 | 13 | 160 | // +----+----+----+----+ 161 | // 3 | 6 | 7 | 14 | 15 | 162 | // +----+----+----+----+ 163 | // (i) 164 | print_latex(tt2); 165 | 166 | std::cout << rank(s8) << std::endl; 167 | std::cout << rank(tt) << std::endl; 168 | std::cout << rank(tt2) << std::endl; 169 | std::cout << depth(tt2) << std::endl; 170 | std::cout << shape(tt2) << std::endl; 171 | std::cout << stride(tt2) << std::endl; 172 | std::cout << size(tt2) << std::endl; 173 | std::cout << cosize(tt2) << std::endl; 174 | 175 | Layout tt3 = make_layout(make_shape(make_shape(2, 2), 2), make_stride(make_stride(4, 1), 2)); 176 | print1D(tt3); 177 | // 0 4 1 5 2 6 3 7 178 | print_layout(tt3); 179 | // ((2,2),2):((4,1),2) <== ((i,j),k) 180 | // 0 1 <- (k) 181 | // +---+---+ 182 | // 0 | 0 | 2 | (0,0) 183 | // +---+---+ 184 | // 1 | 4 | 6 | (1,0) 185 | // +---+---+ 186 | // 2 | 1 | 3 | (0,1) 187 | // +---+---+ 188 | // 3 | 5 | 7 | (1,1) 189 | // +---+---+ 190 | // (i,j) 191 | 192 | Layout tt4 = make_layout(make_shape(3, make_shape(2, 3)), make_stride(3, make_stride(12, 1))); 193 | print1D(tt4); 194 | // 0 3 6 12 15 18 1 4 7 13 16 19 2 5 8 14 17 20 195 | print_layout(tt4); 196 | // (3,(2,3)):(3,(12,1)) 197 | // 0 1 2 3 4 5 <== 1-D col coord 198 | // (0,0) (1,0) (0,1) (1,1) (0,2) (1,2) <== 2-D col coord (j,k) 199 | // +-----+-----+-----+-----+-----+-----+ 200 | // 0 | 0 | 12 | 1 | 13 | 2 | 14 | 201 | // +-----+-----+-----+-----+-----+-----+ 202 | // 1 | 3 | 15 | 4 | 16 | 5 | 17 | 203 | // +-----+-----+-----+-----+-----+-----+ 204 | // 2 | 6 | 18 | 7 | 19 | 8 | 20 | 205 | // +-----+-----+-----+-----+-----+-----+ 206 | return 0; 207 | } --------------------------------------------------------------------------------