├── .gitignore
├── 01-basic.md
├── 02-ptx.md
├── 03-blas.md
├── LICENSE
├── README.md
├── example.sh
├── index.md
└── src
    ├── basic
    │   ├── CMakeLists.txt
    │   └── basic.cu
    ├── blas
    │   ├── CMakeLists.txt
    │   └── blas.cu
    └── ptx
        ├── CMakeLists.txt
        └── ptx.cu

/.gitignore:
--------------------------------------------------------------------------------
**/build/

--------------------------------------------------------------------------------
/01-basic.md:
--------------------------------------------------------------------------------
# Example: Basic

This example covers the basic features of the CUDA language:

- defining a kernel (`__global__ basicSum()`)
- using CUDA APIs such as `cudaMalloc()`, `cudaFree()` and `cudaMemcpy()`
- launching a kernel

The example consists of:

- Generating test data on the host
- Sending data to the device
- Launching a kernel on the device
- Receiving data back from the device
- Checking that the data we received is what we expect

Build and run the example by following the [general instructions](./index.md).

## Example source code

```cpp
---8<--- "public/examples/src/basic/basic.cu"
```

## `CMakeLists.txt` used

```cmake
---8<--- "public/examples/src/basic/CMakeLists.txt"
```

--------------------------------------------------------------------------------
/02-ptx.md:
--------------------------------------------------------------------------------
# Example: PTX

This example covers the following features on top of what was shown in the [basic example](./01-basic.md):

- defining `__device__` functions
    - `ptx_add()`
    - `ptx_lop3()`
- using C++ templates with `__device__` and `__global__` functions
    - `ptx_lop3()`
    - `kernelLop3()`
- using inline PTX Assembly `asm(...);` blocks
    - `ptx_add()`
    - `ptx_lop3()`

Build and run the example by following the [general instructions](./index.md).

## Extra info

- [Using inline PTX Assembly](https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
- [PTX ISA reference](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html)

PTX instructions used:

- [`add`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add)
- [`lop3`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3)

## Example source code

```cpp
---8<--- "public/examples/src/ptx/ptx.cu"
```

## `CMakeLists.txt` used

```cmake
---8<--- "public/examples/src/ptx/CMakeLists.txt"
```

--------------------------------------------------------------------------------
/03-blas.md:
--------------------------------------------------------------------------------
# Example: BLAS

This example shows how a BLAS math wrapper library can be used.

The example uses the `cublasDdot()` function to calculate the dot product of two vectors.

cuBLAS API calls are forwarded to the relevant ROCm APIs.
Note that the example links to `cublas` in its [`CMakeLists.txt`](#cmakeliststxt-used).
As in the other examples, this allows a seamless transition of projects to SCALE without code modification.
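
Before reading the full listing below, here is a minimal sketch of the call the example is built around.
It is illustrative only: error checking is omitted and the variable names are placeholders; the complete, buildable version is in the source code section below.

```cpp
// Minimal sketch (not the full example): a dot product with cublasDdot().
cublasHandle_t handle;
cublasCreate(&handle);              // every cuBLAS call needs a library context

// devX and devY are device pointers to n doubles; the 1s are the element strides.
double result = 0.0;
cublasDdot(handle, n, devX, 1, devY, 1, &result);

cublasDestroy(handle);
```

The same code compiles unchanged against NVIDIA's cuBLAS; under SCALE the call is dispatched to the corresponding ROCm implementation.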

## Example source code

```cpp
---8<--- "public/examples/src/blas/blas.cu"
```

## `CMakeLists.txt` used

```cmake
---8<--- "public/examples/src/blas/CMakeLists.txt"
```

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SCALE by example

This repository is included as part of the SCALE documentation.

The documentation root is [`index.md`](./index.md).
--------------------------------------------------------------------------------
/example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

set -e

SCALE_DIR="$1"
EXAMPLE="$2"

export PATH="${SCALE_DIR}/bin:${PATH}"
export LD_LIBRARY_PATH="${SCALE_DIR}/lib:${LD_LIBRARY_PATH}"


case "${EXAMPLE}" in

    "basic" | "blas" | "ptx")
        rm -rf "src/${EXAMPLE}/build"

        cmake \
            -DCUDAToolkit_ROOT="${SCALE_DIR}" \
            -DCMAKE_CUDA_COMPILER="${SCALE_DIR}/bin/nvcc" \
            -DCMAKE_CUDA_ARCHITECTURES="86" \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -B "src/${EXAMPLE}/build" \
            "src/${EXAMPLE}"

        make \
            -C "src/${EXAMPLE}/build"

        export REDSCALE_EXCEPTIONS=1

        "src/${EXAMPLE}/build/example_${EXAMPLE}"
        ;;

    *)
        echo "Usage: $0 {PATH_TO_SCALE} {basic|blas|ptx}"
        ;;

esac
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
# SCALE by example

Whether you are starting a new project with SCALE from scratch or adding it to an existing project, these usage examples can be useful to you.

The examples don't aim to cover everything available in SCALE.
Instead, they highlight individual features in isolation from each other.
This way, they can be used as a reference in your development process.

Additionally, you are welcome to use these examples as a starting point for your project.

## List of examples

Here is the list of examples that are currently available.
Read more about them on their corresponding pages.

| Example                   | What it is about           |
| ------------------------- | -------------------------- |
| [Basic](./01-basic.md)    | Usage in its simplest form |
| [PTX](./02-ptx.md)        | Using PTX Assembly         |
| [BLAS](./03-blas.md)      | Using the BLAS maths wrapper |

## Accessing the examples

The examples are hosted in a public repository.
You can clone it using git:

```sh
git clone https://github.com/spectral-compute/scale-examples.git
cd scale-examples
```

You can also download it as a ZIP file:

```sh
wget -O scale-examples.zip https://github.com/spectral-compute/scale-examples/archive/refs/heads/main.zip
unzip scale-examples.zip
cd scale-examples-main
```

## Using the examples

To build and run the examples, you should have SCALE [installed on your machine](../manual/01-installing.md).
You should also determine which [path to SCALE](../manual/02-how-to-use.md#identifying-gpu-target) to use, as it depends on your target GPU.

The example repository includes a helper script, `example.sh`, that configures, builds and runs the example of your choice.

Here is how you can use it for the [Basic](./01-basic.md) example:

```sh
# You should be in the root directory of the repository when running this
./example.sh {SCALE_DIR} basic
```

For the specified example, this will:

1. Remove its build directory if it already exists
2. Configure CMake for that example in a freshly-created build directory
3. Build the example in that directory using Make
4. Set the `REDSCALE_EXCEPTIONS=1` environment variable for better error reporting (read more [in the manual][exceptions])
5. Run the example

[exceptions]: ../manual/03-troubleshooting.md#exceptions

---

For accessibility, the SCALE documentation portal includes the source code of the examples in its pages.
This is the source code of `example.sh` referenced above:

```sh
---8<--- "public/examples/example.sh"
```

--------------------------------------------------------------------------------
/src/basic/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_basic LANGUAGES CUDA)

add_executable(example_basic basic.cu)
--------------------------------------------------------------------------------
/src/basic/basic.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>


// The kernel we are going to launch
__global__ void basicSum(const int * a, const int * b, size_t n, int * out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < n)
    {
        out[idx] = a[idx] + b[idx];
    }
}


// A generic helper function to simplify error handling.
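// Most CUDA runtime API calls return a cudaError_t status; passing each call's
// return value through this helper turns a failure into an immediate, readable
// error message instead of a silent wrong result.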
void check(cudaError_t error, const char * file, size_t line) {
    if (error != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}


// A wrapper for the helper function above to include the filename and line number
// where the error occurs into the output.
#define CHECK(error) check(error, __FILE__, __LINE__)


int main(int argc, char ** argv) {

    const size_t N = 4096;
    const size_t BYTES = N * sizeof(int);

    std::vector<int> a(N);
    std::vector<int> b(N);
    std::vector<int> out(N);

    // Generate input data
    for (size_t i = 0; i < N; i++) {
        a[i] = i * 2;
        b[i] = N - i;
    }

    int * devA;
    int * devB;
    int * devOut;

    // Allocate memory for the inputs and the output
    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));
    CHECK(cudaMalloc(&devOut, BYTES));

    // Copy the input data to the device
    CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice));

    // Launch the kernel
    // (256 threads per block is an arbitrary but reasonable choice; the grid is sized to cover all N elements)
    basicSum<<<(N + 255) / 256, 256>>>(devA, devB, N, devOut);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());

    // Copy the output data back to host
    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    // Free up the memory we allocated for the inputs and the output
    CHECK(cudaFree(devA));
    CHECK(cudaFree(devB));
    CHECK(cudaFree(devOut));

    // Test that the output matches our expectations
    for (size_t i = 0; i < N; i++) {
        if (a[i] + b[i] != out[i]) {
            std::cout << "Incorrect sum: " << a[i] << " + " << b[i] << " = " << out[i] << " ?\n";
        }
    }

    std::cout << "Example finished" << std::endl;

    return 0;
}
--------------------------------------------------------------------------------
/src/blas/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_blas LANGUAGES CUDA)

add_executable(example_blas blas.cu)
target_link_libraries(example_blas PRIVATE cublas redscale)
--------------------------------------------------------------------------------
/src/blas/blas.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>

#include <cublas_v2.h>


void check(cudaError_t error, const char * file, size_t line) {
    if (error != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}


void checkCublas(cublasStatus_t error, const char * file, size_t line) {
    if (error != CUBLAS_STATUS_SUCCESS) {
        std::cout << "cublas error: " << cublasGetStatusString(error) << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}


#define CHECK(error) check(error, __FILE__, __LINE__)
#define CHECK_CUBLAS(error) checkCublas(error, __FILE__, __LINE__)


int main(int argc, char ** argv) {
    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    const size_t N = 10;
    const size_t BYTES = N * sizeof(double);
    const double E = 1e-5;

    /* Prepare the data */

    std::vector<double> A(N);
    std::vector<double> B(N);

    for (size_t i = 0; i < N; i++) {
        A[i] = i;
        B[i] = i + N;
    }

    /* Send the data */

    double * devA;
    double * devB;

    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));

    CHECK(cudaMemcpy(devA, A.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, B.data(), BYTES, cudaMemcpyHostToDevice));

    /* Calculate */

    const int strideA = 1;
    const int strideB = 1;
    double result = 0;

    // cublasDdot(handle, n, x, incx, y, incy, result) computes the dot product of two double-precision vectors
    CHECK_CUBLAS(cublasDdot(handle, A.size(), devA, strideA, devB, strideB, &result));

    CHECK(cudaDeviceSynchronize());

    double expected = 0;
    for (size_t i = 0; i < N; i++) {
        expected += A[i] * B[i];
    }

    if (std::abs(result - expected) > E) {
        std::cout << "Result " << result << " is different from expected " << expected << std::endl;
    }

    CHECK_CUBLAS(cublasDestroy(handle));

    std::cout << "Example finished." << std::endl;

    return 0;
}
--------------------------------------------------------------------------------
/src/ptx/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_ptx LANGUAGES CUDA)

add_executable(example_ptx ptx.cu)
--------------------------------------------------------------------------------
/src/ptx/ptx.cu:
--------------------------------------------------------------------------------
#include <bitset>
#include <cstdint>
#include <iostream>
#include <vector>


__device__ inline uint32_t ptx_add(uint32_t x, uint32_t y) {
    // Calculate a sum of `x` and `y`, put the result into `x`
    asm(
        "add.u32 %0, %0, %1;"
        : "+r"(x)
        : "r"(y)
    );
    return x;
}


__global__ void kernelAdd(const uint32_t * a, const uint32_t * b, size_t n, uint32_t * out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < n)
    {
        out[idx] = ptx_add(a[idx], b[idx]);
    }
}


template <uint8_t Op>
__device__ inline uint32_t ptx_lop3(uint32_t x, uint32_t y, uint32_t z) {
    // Compute operator `Op` on `x`, `y`, `z`, put the result into `x`

    asm(
        "lop3.b32 %0, %0, %1, %2, %3;"
        : "+r"(x)
        : "r"(y), "r"(z), "n"(Op)
    );
    return x;
}


template <uint8_t Op>
__global__ void kernelLop3(const uint32_t * a, const uint32_t * b, const uint32_t * c, size_t n, uint32_t * out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < n)
    {
        out[idx] = ptx_lop3<Op>(a[idx], b[idx], c[idx]);
    }
}


void check(cudaError_t error, const char * file, size_t line) {
    if (error != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}


#define CHECK(error) check(error, __FILE__, __LINE__)


template <typename T>
constexpr T lop3op(T a, T b, T c) {
    return a & b ^ (~c);
}


int main(int argc, char ** argv) {

    const size_t N = 4096;
    const size_t BYTES = N * sizeof(uint32_t);

    std::vector<uint32_t> a(N);
    std::vector<uint32_t> b(N);
    std::vector<uint32_t> c(N);
    std::vector<uint32_t> out(N);

    for (size_t i = 0; i < N; i++) {
        a[i] = i * 2;
        b[i] = N - i;
        c[i] = i * i;
    }

    uint32_t * devA;
    uint32_t * devB;
    uint32_t * devC;
    uint32_t * devOut;

    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));
    CHECK(cudaMalloc(&devC, BYTES));
    CHECK(cudaMalloc(&devOut, BYTES));

    CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devC, c.data(), BYTES, cudaMemcpyHostToDevice));

    // Test "add"

    // 256-thread blocks, with enough blocks to cover all N elements
    kernelAdd<<<(N + 255) / 256, 256>>>(devA, devB, N, devOut);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < N; i++) {
        if (a[i] + b[i] != out[i]) {
            std::cout << "Incorrect add: " << a[i] << " + " << b[i] << " = " << out[i] << " ?\n";
        }
    }

    // Test "lop3"

    // The lop3 immediate operand is the truth table of the desired boolean function,
    // obtained by applying that function to the canonical constants 0xF0, 0xCC and 0xAA.
    constexpr uint8_t TA = 0xF0;
    constexpr uint8_t TB = 0xCC;
    constexpr uint8_t TC = 0xAA;
    constexpr uint8_t Op = lop3op(TA, TB, TC);

    kernelLop3<Op><<<(N + 255) / 256, 256>>>(devA, devB, devC, N, devOut);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < N; i++) {
        if (lop3op(a[i], b[i], c[i]) != out[i]) {
            std::cout << "Incorrect lop3: \n"
                << "     " << std::bitset<32>{a[i]} << "\n"
                << "   & " << std::bitset<32>{b[i]} << "\n"
                << "  ^ ~" << std::bitset<32>{c[i]} << "\n"
                << "   = " << std::bitset<32>{out[i]} << " ?\n\n";
        }
    }

    CHECK(cudaFree(devA));
    CHECK(cudaFree(devB));
    CHECK(cudaFree(devC));
    CHECK(cudaFree(devOut));

    // Finish

    std::cout << "Example finished" << std::endl;

    return 0;
}
--------------------------------------------------------------------------------