├── assets ├── redme.txt └── triton.png ├── matmul ├── images │ ├── images.txt │ ├── m_001.jpg │ └── m_001-mask.jpg └── readme.md ├── daily_challange ├── day0 │ ├── add_constant.py │ └── readme.md ├── day1 │ ├── vector_add.py │ └── readme.md ├── day3 │ ├── relu.py │ └── readme.md ├── day2 │ ├── vector_add_benchmark.py │ └── readme.md └── day4 │ ├── relu.py │ └── readme.md └── README.md /assets/redme.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /matmul/images/images.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/triton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/assets/triton.png -------------------------------------------------------------------------------- /matmul/images/m_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/matmul/images/m_001.jpg -------------------------------------------------------------------------------- /matmul/images/m_001-mask.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/matmul/images/m_001-mask.jpg -------------------------------------------------------------------------------- /daily_challange/day0/add_constant.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for Constant Addition 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def constant_add_kernel( 12 | x_ptr, # Pointer to the input vector x 13 | constant, # The constant value to add 14 | y_ptr, # Pointer to the output vector y 15 | N0: tl.constexpr, # Total number of elements in vector x (and y) 16 | BLOCK_SIZE: tl.constexpr # Block size, set equal to N0 17 | ): 18 | # Each kernel instance processes a block of elements. 19 | # With BLOCK_SIZE equal to N0, only one instance is launched. 20 | pid = tl.program_id(0) 21 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 22 | mask = offsets < N0 # Ensure we don't access out-of-bound indices 23 | 24 | # Load x values, add the constant, and store the result in y 25 | x = tl.load(x_ptr + offsets, mask=mask) 26 | y = x + constant 27 | tl.store(y_ptr + offsets, y, mask=mask) 28 | 29 | # ------------------------------------------------------------------------------ 30 | # Python Wrapper Function for the Triton Kernel 31 | # ------------------------------------------------------------------------------ 32 | 33 | def constant_add_triton(x: torch.Tensor, constant: float) -> torch.Tensor: 34 | """ 35 | Adds a constant to each element of the input vector x using a Triton kernel. 36 | 37 | The block size is set equal to the vector length (N0), meaning that only one 38 | kernel instance is launched. 39 | 40 | Args: 41 | x (torch.Tensor): Input vector on CUDA. 42 | constant (float): The constant to add to each element. 43 | 44 | Returns: 45 | torch.Tensor: Output vector with the constant added. 
46 | """ 47 | N0 = x.numel() 48 | BLOCK_SIZE = N0 # Block size equals the vector length 49 | y = torch.empty_like(x) 50 | 51 | # With BLOCK_SIZE = N0, our grid consists of a single block. 52 | grid = lambda meta: (1,) 53 | 54 | # Launch the Triton kernel 55 | constant_add_kernel[grid](x, constant, y, N0, BLOCK_SIZE=BLOCK_SIZE) 56 | return y 57 | 58 | # ------------------------------------------------------------------------------ 59 | # Main: Test Constant Add Kernel 60 | # ------------------------------------------------------------------------------ 61 | 62 | if __name__ == '__main__': 63 | # Create an example vector on the GPU. 64 | N0 = 1024 # Length of the vector 65 | x = torch.arange(0, N0, device='cuda', dtype=torch.float32) 66 | constant = 3.0 # The constant value to add 67 | 68 | # Compute the result using the Triton kernel. 69 | y_triton = constant_add_triton(x, constant) 70 | 71 | # Compute the result using PyTorch for verification. 72 | y_torch = x + constant 73 | 74 | # Verify correctness. 75 | if torch.allclose(y_triton, y_torch): 76 | print("Success: Triton kernel result matches PyTorch result!") 77 | else: 78 | print("Error: The results do not match.") 79 | -------------------------------------------------------------------------------- /daily_challange/day1/vector_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Triton Kernel for Vector Addition 7 | # ------------------------------------------------------------------------------ 8 | 9 | @triton.jit 10 | def vector_add_kernel( 11 | A_ptr, # Pointer to first input vector A 12 | B_ptr, # Pointer to second input vector B 13 | C_ptr, # Pointer to output vector C 14 | n_elements: tl.constexpr, # Number of elements in the vectors 15 | BLOCK_SIZE: tl.constexpr # Block size (number of elements per program instance) 16 | ): 17 | # Each program instance (kernel instance) computes a block of elements. 18 | pid = tl.program_id(0) # 1D grid: get the program id (i.e. block index) 19 | # Compute the offsets for the current block 20 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 21 | # Create a mask to avoid out-of-bound accesses 22 | mask = offsets < n_elements 23 | 24 | # Load the corresponding elements from A and B 25 | a = tl.load(A_ptr + offsets, mask=mask) 26 | b = tl.load(B_ptr + offsets, mask=mask) 27 | 28 | # Perform element-wise addition 29 | c = a + b 30 | 31 | # Store the result into the output pointer C 32 | tl.store(C_ptr + offsets, c, mask=mask) 33 | 34 | # ------------------------------------------------------------------------------ 35 | # Python Wrapper Function for the Triton Kernel 36 | # ------------------------------------------------------------------------------ 37 | 38 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 39 | """ 40 | Adds two vectors A and B using the Triton kernel. 41 | 42 | Args: 43 | A (torch.Tensor): First input vector (on CUDA). 44 | B (torch.Tensor): Second input vector (on CUDA). 45 | BLOCK_SIZE (int): Number of elements per block for the kernel. 46 | 47 | Returns: 48 | torch.Tensor: Output vector containing the element-wise sum. 49 | """ 50 | assert A.numel() == B.numel(), "Input vectors must have the same number of elements." 
51 | n_elements = A.numel() 52 | # Create an empty tensor for the result (same size and device as A) 53 | C = torch.empty_like(A) 54 | 55 | # Define grid: number of blocks needed to cover all elements 56 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the kernel 59 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 60 | return C 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Main: Compare Triton Kernel with PyTorch Implementation 64 | # ------------------------------------------------------------------------------ 65 | 66 | if __name__ == '__main__': 67 | # Create two example vectors on the GPU 68 | n = 1024 * 10 # total number of elements 69 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 70 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 71 | 72 | # Add the vectors using the Triton kernel 73 | C_triton = vector_add_triton(A, B) 74 | 75 | # Add the vectors using PyTorch (for verification) 76 | C_pytorch = A + B 77 | 78 | # Verify that the results are the same 79 | if torch.allclose(C_triton, C_pytorch): 80 | print("Success: The Triton kernel result matches the PyTorch result!") 81 | else: 82 | print("Error: The results do not match.") 83 | 84 | # Print part of the result for inspection 85 | print("Result (first 10 elements):", C_triton[:10]) 86 | -------------------------------------------------------------------------------- /matmul/readme.md: -------------------------------------------------------------------------------- 1 | # Matrix Multiplication with Triton 2 | 3 | This repository demonstrates matrix multiplication using the Triton framework. We provide two examples: 4 | 5 | 1. **Naive Matrix Multiplication (8x8)**: A straightforward implementation of matrix multiplication without masking. 6 | 2. **Masked Matrix Multiplication (8x6)**: An enhanced implementation showcasing the use of masking to handle uneven matrix sizes. 7 | 8 | ## Visualization 9 | 10 | ### Naive Matrix Multiplication (8x8) 11 | [View Visualization](https://claude.site/artifacts/1f66d58b-5c1e-4a88-8bb4-ddb47ed9bda1) 12 | 13 | ![Naive Matrix Multiplication](./images/m_001.jpg) 14 | 15 | ### Masked Matrix Multiplication (8x6) 16 | [View Visualization](https://claude.site/artifacts/cc485433-bc54-4af4-830f-50ac4f3eefca) 17 | 18 | ![Masked Matrix Multiplication](./images/m_001-mask.jpg) 19 | 20 | ## Implementation 21 | 22 | The examples are implemented using the Triton framework. 
Below is a clean and concise implementation of the naive matrix multiplication kernel and its corresponding Python integration: 23 | 24 | ```python 25 | import torch 26 | import triton 27 | import triton.language as tl 28 | from functools import partial 29 | 30 | DEVICE = 'cuda' 31 | 32 | @triton.jit 33 | def naive_matmul_kernel( 34 | a_ptr, b_ptr, c_ptr, 35 | m, n, k, 36 | stride_am, stride_ak, 37 | stride_bk, stride_bn, 38 | stride_cm, stride_cn, 39 | bm: tl.constexpr, bn: tl.constexpr, bk: tl.constexpr 40 | ): 41 | # Program IDs 42 | pid_m, pid_n = tl.program_id(0), tl.program_id(1) 43 | 44 | # Block-level starting indices 45 | rm = pid_m * bm + tl.arange(0, bm) 46 | rn = pid_n * bn + tl.arange(0, bn) 47 | rk = tl.arange(0, bk) 48 | 49 | # Boundary masks 50 | rm_mask = rm < m 51 | rn_mask = rn < n 52 | 53 | # Offsets 54 | offs_a = a_ptr + rm[:, None] * stride_am + rk[None, :] * stride_ak 55 | offs_b = b_ptr + rk[:, None] * stride_bk + rn[None, :] * stride_bn 56 | 57 | # Accumulator 58 | acc = tl.zeros((bm, bn), dtype=tl.float32) 59 | 60 | # Loop over the k dimension 61 | for k_idx in range(0, k, bk): 62 | k_mask = k_idx + rk < k 63 | a = tl.load(offs_a, mask=rm_mask[:, None] & k_mask[None, :], other=0.0) 64 | b = tl.load(offs_b, mask=k_mask[:, None] & rn_mask[None, :], other=0.0) 65 | acc += tl.dot(a, b) 66 | 67 | # Increment offsets 68 | offs_a += bk * stride_ak 69 | offs_b += bk * stride_bk 70 | 71 | # Write back results 72 | c = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn 73 | tl.store(c, acc, mask=rm_mask[:, None] & rn_mask[None, :]) 74 | 75 | # Python interface 76 | def matmul(a, b, kernel, block_size=32): 77 | m, k = a.shape 78 | _, n = b.shape 79 | 80 | c = torch.empty((m, n), device=a.device, dtype=a.dtype) 81 | 82 | grid = lambda meta: (triton.cdiv(m, meta['bm']), triton.cdiv(n, meta['bn'])) 83 | 84 | kernel[grid]( 85 | a, b, c, 86 | m, n, k, 87 | a.stride(0), a.stride(1), 88 | b.stride(0), b.stride(1), 89 | c.stride(0), c.stride(1), 90 | bm=block_size, bn=block_size, bk=block_size 91 | ) 92 | 93 | return c 94 | 95 | naive_matmul = partial(matmul, kernel=naive_matmul_kernel) 96 | 97 | # Example usage 98 | a = torch.ones((8, 8), dtype=torch.float32, device=DEVICE) 99 | b = torch.ones((8, 8), dtype=torch.float32, device=DEVICE) 100 | 101 | result = naive_matmul(a, b, block_size=8) 102 | 103 | expected = torch.matmul(a, b) 104 | assert torch.allclose(result, expected, rtol=1e-3, atol=1e-3) 105 | print("Test passed!") 106 | ``` 107 | 108 | ## Key Features 109 | 110 | - **Naive Implementation**: Demonstrates the basics of kernel programming with Triton. 111 | - **Masked Implementation**: Illustrates handling of uneven matrices using boundary masks. 
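For the masked (8x6) case pictured above, the same kernel can be reused unchanged: the `rm`/`rn` masks guard the row and column edges, while `k_mask` covers the final partial tile along the reduction dimension. Below is a minimal usage sketch under the same assumptions as the 8x8 example (the shapes are chosen purely for illustration):

```python
# Uneven shapes: (8 x 6) @ (6 x 8) -> (8 x 8). Reuses DEVICE and naive_matmul
# from the listing above; with block_size=8, k=6 is handled entirely by k_mask.
a = torch.randn((8, 6), dtype=torch.float32, device=DEVICE)
b = torch.randn((6, 8), dtype=torch.float32, device=DEVICE)

result = naive_matmul(a, b, block_size=8)
expected = torch.matmul(a, b)
assert torch.allclose(result, expected, rtol=1e-3, atol=1e-3)
print("Masked test passed!")
```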
112 | 113 | -------------------------------------------------------------------------------- /daily_challange/day3/relu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for ReLU Activation 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def relu_kernel( 12 | x_ptr, # Pointer to the input vector x 13 | y_ptr, # Pointer to the output vector y 14 | N: tl.constexpr, # Total number of elements in the input vector 15 | BLOCK_SIZE: tl.constexpr # Block size: number of elements processed per kernel instance 16 | ): 17 | # Each kernel instance processes a block of elements. 18 | # Get the current program ID along the 1D grid. 19 | pid = tl.program_id(0) 20 | 21 | # Compute the offsets for the block of elements this kernel instance will process. 22 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 23 | 24 | # Create a mask to ensure we do not access out-of-bound memory. 25 | mask = offsets < N 26 | 27 | # Load elements from the input pointer. 28 | x = tl.load(x_ptr + offsets, mask=mask) 29 | 30 | # Compute the ReLU activation: y = max(0, x) 31 | y = tl.maximum(x, 0.0) 32 | 33 | # Store the result back to the output pointer. 34 | tl.store(y_ptr + offsets, y, mask=mask) 35 | 36 | # ------------------------------------------------------------------------------ 37 | # Python Wrapper Function for the Triton ReLU Kernel 38 | # ------------------------------------------------------------------------------ 39 | 40 | def relu_triton(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 41 | """ 42 | Applies the ReLU activation function on the input vector x using a Triton kernel. 43 | 44 | Args: 45 | x (torch.Tensor): Input tensor on CUDA. 46 | BLOCK_SIZE (int): Number of elements processed per kernel instance. 47 | 48 | Returns: 49 | torch.Tensor: Output tensor after applying ReLU activation. 50 | """ 51 | N = x.numel() 52 | # Allocate the output tensor with the same shape and device as the input. 53 | y = torch.empty_like(x) 54 | 55 | # Configure the grid: number of blocks required to cover all N elements. 56 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the Triton kernel. 59 | relu_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 60 | return y 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Benchmarking Function 64 | # ------------------------------------------------------------------------------ 65 | 66 | def benchmark(func, *args, n_warmup=10, n_iters=100): 67 | """ 68 | Benchmarks a function by performing warm-up iterations followed by timed iterations. 69 | 70 | Args: 71 | func (callable): The function to benchmark. 72 | *args: Arguments to pass to the function. 73 | n_warmup (int): Number of warm-up iterations. 74 | n_iters (int): Number of iterations for timing. 75 | 76 | Returns: 77 | float: Average execution time per iteration in milliseconds. 78 | """ 79 | # Warm-up: execute the function several times to mitigate initial overhead. 80 | for _ in range(n_warmup): 81 | func(*args) 82 | torch.cuda.synchronize() # Wait for all GPU operations to finish. 83 | 84 | # Timing the execution. 
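    # Synchronizing once after the loop (rather than per call) amortizes the
    # synchronization cost; the value returned below is therefore an average
    # wall-clock time per call, including kernel-launch overhead.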
85 | start = time.perf_counter() 86 | for _ in range(n_iters): 87 | func(*args) 88 | torch.cuda.synchronize() # Ensure all GPU operations are complete. 89 | end = time.perf_counter() 90 | 91 | avg_time_ms = (end - start) / n_iters * 1000 92 | return avg_time_ms 93 | 94 | # ------------------------------------------------------------------------------ 95 | # Main: Test and Benchmark the Triton ReLU Kernel 96 | # ------------------------------------------------------------------------------ 97 | 98 | if __name__ == '__main__': 99 | # Create an example input vector on the GPU. 100 | N = 1024 * 1024 # For instance, 1 million elements. 101 | x = torch.randn(N, device='cuda', dtype=torch.float32) 102 | 103 | # Apply ReLU using the Triton kernel. 104 | y_triton = relu_triton(x) 105 | 106 | # Apply ReLU using PyTorch for validation. 107 | y_torch = torch.relu(x) 108 | 109 | # Verify that both outputs are the same. 110 | if torch.allclose(y_triton, y_torch): 111 | print("Success: Triton ReLU matches PyTorch ReLU!") 112 | else: 113 | print("Error: The Triton ReLU output does not match PyTorch.") 114 | 115 | # Benchmark the Triton kernel. 116 | triton_time = benchmark(relu_triton, x) 117 | print(f"Average execution time (Triton ReLU): {triton_time:.3f} ms") 118 | 119 | # Benchmark PyTorch’s built-in ReLU. 120 | torch_time = benchmark(torch.relu, x) 121 | print(f"Average execution time (PyTorch ReLU): {torch_time:.3f} ms") 122 | -------------------------------------------------------------------------------- /daily_challange/day2/vector_add_benchmark.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for Vector Addition 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def vector_add_kernel( 12 | A_ptr, # Pointer to the first input vector A 13 | B_ptr, # Pointer to the second input vector B 14 | C_ptr, # Pointer to the output vector C 15 | n_elements: tl.constexpr, # Total number of elements in the vectors 16 | BLOCK_SIZE: tl.constexpr # Block size (number of elements processed per kernel instance) 17 | ): 18 | # Get the current program (block) ID 19 | pid = tl.program_id(0) 20 | # Compute the offsets for the current block 21 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 22 | # Create a mask to avoid accessing out-of-bound indices 23 | mask = offsets < n_elements 24 | 25 | # Load the elements from A and B with the computed offsets 26 | a = tl.load(A_ptr + offsets, mask=mask) 27 | b = tl.load(B_ptr + offsets, mask=mask) 28 | 29 | # Perform element-wise addition 30 | c = a + b 31 | 32 | # Store the result in C using the mask to ensure only valid writes 33 | tl.store(C_ptr + offsets, c, mask=mask) 34 | 35 | # ------------------------------------------------------------------------------ 36 | # Python Wrapper for the Triton Kernel 37 | # ------------------------------------------------------------------------------ 38 | 39 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 40 | """ 41 | Adds two vectors using the Triton kernel. 42 | 43 | Args: 44 | A (torch.Tensor): First input vector (on CUDA). 45 | B (torch.Tensor): Second input vector (on CUDA). 46 | BLOCK_SIZE (int): Number of elements per block for the kernel. 
47 | 48 | Returns: 49 | torch.Tensor: Output vector with the element-wise sum. 50 | """ 51 | n_elements = A.numel() 52 | # Allocate the output tensor (same shape and device as A) 53 | C = torch.empty_like(A) 54 | 55 | # Define the grid (number of blocks) required to cover all elements 56 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the Triton kernel 59 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 60 | return C 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Benchmarking Function 64 | # ------------------------------------------------------------------------------ 65 | 66 | def benchmark(func, *args, n_warmup=10, n_iters=100): 67 | """ 68 | Benchmarks a function by running warm-up iterations followed by timed iterations. 69 | 70 | Args: 71 | func (callable): The function to benchmark. 72 | *args: Arguments to pass to the function. 73 | n_warmup (int): Number of warm-up iterations (to exclude startup overhead). 74 | n_iters (int): Number of iterations for timing. 75 | 76 | Returns: 77 | float: Average execution time per iteration in milliseconds. 78 | """ 79 | # Warm-up runs to ensure any one-time setup is complete (e.g. CUDA context) 80 | for _ in range(n_warmup): 81 | func(*args) 82 | torch.cuda.synchronize() # Ensure warm-up kernels have finished 83 | 84 | # Start timing 85 | start = time.perf_counter() 86 | for _ in range(n_iters): 87 | func(*args) 88 | torch.cuda.synchronize() # Wait for all GPU operations to finish 89 | end = time.perf_counter() 90 | 91 | # Calculate the average execution time (in milliseconds) 92 | avg_time_ms = (end - start) / n_iters * 1000 93 | return avg_time_ms 94 | 95 | # ------------------------------------------------------------------------------ 96 | # Main: Compare and Benchmark Triton Kernel vs. PyTorch Implementation 97 | # ------------------------------------------------------------------------------ 98 | 99 | if __name__ == '__main__': 100 | # Create two example vectors on the GPU (stress test with a large number of elements) 101 | n = 1024 * 1024 * 10 # e.g., 10 million elements 102 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 103 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 104 | 105 | # Validate correctness by comparing results from Triton and PyTorch 106 | C_triton = vector_add_triton(A, B) 107 | C_pytorch = A + B 108 | 109 | if torch.allclose(C_triton, C_pytorch): 110 | print("Success: The Triton result matches the PyTorch result!") 111 | else: 112 | print("Error: The results do not match.") 113 | 114 | # Benchmark the Triton kernel 115 | triton_time = benchmark(vector_add_triton, A, B, n_warmup=10, n_iters=100) 116 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 117 | 118 | # Benchmark the PyTorch implementation 119 | def pytorch_add(A, B): 120 | return A + B 121 | 122 | pytorch_time = benchmark(pytorch_add, A, B, n_warmup=10, n_iters=100) 123 | print(f"Average execution time (PyTorch): {pytorch_time:.3f} ms") 124 | -------------------------------------------------------------------------------- /daily_challange/day0/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 1: constant add 2 | 3 | This challenge is the first puzzle in our Daily Triton Challenge series. The goal is to write a Triton kernel that adds a constant value to each element of a vector. 
The key aspects of this puzzle are: 4 | 5 | - **One program ID axis:** we use a 1D grid, with a single kernel instance. 6 | - **Block size \(B_0\):** the block size is set equal to the length of the vector \(N_0\), so the kernel processes the entire vector in one go. 7 | - **Verification:** the result is compared against a simple PyTorch implementation. 8 | 9 | ## Full code example 10 | 11 | ```python 12 | import time 13 | import torch 14 | import triton 15 | import triton.language as tl 16 | 17 | # ------------------------------------------------------------------------------ 18 | # Triton Kernel for Constant Addition 19 | # ------------------------------------------------------------------------------ 20 | 21 | @triton.jit 22 | def constant_add_kernel( 23 | x_ptr, # Pointer to the input vector x 24 | constant, # The constant value to add 25 | y_ptr, # Pointer to the output vector y 26 | N0: tl.constexpr, # Total number of elements in vector x (and y) 27 | BLOCK_SIZE: tl.constexpr # Block size, set equal to N0 28 | ): 29 | # Each kernel instance processes a block of elements. 30 | # With BLOCK_SIZE equal to N0, only one instance is launched. 31 | pid = tl.program_id(0) 32 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 33 | mask = offsets < N0 # Ensure we don't access out-of-bound indices 34 | 35 | # Load x values, add the constant, and store the result in y 36 | x = tl.load(x_ptr + offsets, mask=mask) 37 | y = x + constant 38 | tl.store(y_ptr + offsets, y, mask=mask) 39 | 40 | # ------------------------------------------------------------------------------ 41 | # Python Wrapper Function for the Triton Kernel 42 | # ------------------------------------------------------------------------------ 43 | 44 | def constant_add_triton(x: torch.Tensor, constant: float) -> torch.Tensor: 45 | """ 46 | Adds a constant to each element of the input vector x using a Triton kernel. 47 | 48 | The block size is set equal to the vector length (N0), meaning that only one 49 | kernel instance is launched. 50 | 51 | Args: 52 | x (torch.Tensor): Input vector on CUDA. 53 | constant (float): The constant to add to each element. 54 | 55 | Returns: 56 | torch.Tensor: Output vector with the constant added. 57 | """ 58 | N0 = x.numel() 59 | BLOCK_SIZE = N0 # Block size equals the vector length 60 | y = torch.empty_like(x) 61 | 62 | # With BLOCK_SIZE = N0, our grid consists of a single block. 63 | grid = lambda meta: (1,) 64 | 65 | # Launch the Triton kernel 66 | constant_add_kernel[grid](x, constant, y, N0, BLOCK_SIZE=BLOCK_SIZE) 67 | return y 68 | 69 | # ------------------------------------------------------------------------------ 70 | # Main: Test the constant add kernel 71 | # ------------------------------------------------------------------------------ 72 | 73 | if __name__ == '__main__': 74 | # Create an example vector on the GPU. 75 | N0 = 1024 # Length of the vector 76 | x = torch.arange(0, N0, device='cuda', dtype=torch.float32) 77 | constant = 3.0 # The constant value to add 78 | 79 | # Compute the result using the Triton kernel. 80 | y_triton = constant_add_triton(x, constant) 81 | 82 | # Compute the result using PyTorch for verification. 83 | y_torch = x + constant 84 | 85 | # Verify correctness. 86 | if torch.allclose(y_triton, y_torch): 87 | print("Success: Triton kernel result matches PyTorch result!") 88 | else: 89 | print("Error: The results do not match.") 90 | 91 | # Benchmark the Triton kernel. 
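    # Note: `benchmark` is not defined in this snippet; below is a minimal sketch
    # of the timing helper used in the later puzzles (warm-up, then a timed loop).
    def benchmark(func, *args, n_warmup=10, n_iters=100):
        for _ in range(n_warmup):
            func(*args)
        torch.cuda.synchronize()  # Make sure warm-up work has finished.
        start = time.perf_counter()
        for _ in range(n_iters):
            func(*args)
        torch.cuda.synchronize()  # Wait for all timed kernels to complete.
        end = time.perf_counter()
        return (end - start) / n_iters * 1000  # Average time per call in ms.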
92 | triton_time = benchmark(constant_add_triton, x, constant) 93 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 94 | ``` 95 | 96 | ## Code explanation 97 | 98 | ### 1. The Triton kernel (`constant_add_kernel`) 99 | - **Kernel signature:** 100 | the kernel receives pointers for the input vector `x`, the constant value to add, and the output vector `y`. It also gets the total number of elements `N0` and a compile-time constant `BLOCK_SIZE`. 101 | 102 | - **Program ID and offsets:** 103 | `pid = tl.program_id(0)` obtains the current program ID along the single grid axis. Using this, the kernel calculates the offsets for each element in the block. Since `BLOCK_SIZE` is set equal to `N0`, only one block (one kernel instance) is needed. 104 | 105 | - **Boundary mask:** 106 | a mask (`mask = offsets < N0`) ensures safe memory accesses. 107 | 108 | - **Addition operation:** 109 | the kernel loads the data from `x`, adds the provided constant, and stores the result into `y`. 110 | 111 | ### 2. Python wrapper function (`constant_add_triton`) 112 | - **Purpose:** 113 | this function allocates the output tensor and configures the grid for launching the Triton kernel. 114 | 115 | - **Grid configuration:** 116 | with `BLOCK_SIZE = N0`, the grid is defined as `(1,)` since the entire vector is processed by a single kernel instance. 117 | 118 | ### 3. Main routine 119 | - **Setup:** 120 | a vector `x` of length 1024 is created on the GPU, and a constant value of 3.0 is chosen. 121 | 122 | - **Validation:** 123 | the Triton kernel’s output is compared to PyTorch's built-in addition to ensure correctness. 124 | 125 | ## Conclusion 126 | 127 | Puzzle 1: Constant Add is the first step in our Daily Triton Challenge. This simple yet effective exercise helps you grasp the basic structure of writing a Triton kernel, setting up the grid, and ensuring correct memory operations. As you progress, you'll build on these fundamentals to explore more advanced topics in GPU kernel programming with Triton. 128 | -------------------------------------------------------------------------------- /daily_challange/day4/relu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for ReLU Forward Pass 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def relu_forward_kernel( 12 | x_ptr, # Pointer to input tensor x 13 | y_ptr, # Pointer to output tensor y 14 | N: tl.constexpr, # Total number of elements in x 15 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 16 | ): 17 | # Get the current program (block) ID. 18 | pid = tl.program_id(0) 19 | # Compute offsets for this block. 20 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 21 | # Create a mask for out-of-bound indices. 22 | mask = offsets < N 23 | # Load input values. 24 | x = tl.load(x_ptr + offsets, mask=mask) 25 | # Compute ReLU: y = max(0, x) 26 | y = tl.maximum(x, 0.0) 27 | # Store the result. 
28 | tl.store(y_ptr + offsets, y, mask=mask) 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Triton Kernel for ReLU Backward Pass 32 | # ------------------------------------------------------------------------------ 33 | 34 | @triton.jit 35 | def relu_backward_kernel( 36 | x_ptr, # Pointer to saved input tensor x (from forward pass) 37 | grad_output_ptr, # Pointer to gradient of the output 38 | grad_input_ptr, # Pointer to store computed gradient with respect to x 39 | N: tl.constexpr, # Total number of elements in x 40 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 41 | ): 42 | pid = tl.program_id(0) 43 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 44 | mask = offsets < N 45 | # Load input values and gradient of output. 46 | x = tl.load(x_ptr + offsets, mask=mask) 47 | grad_out = tl.load(grad_output_ptr + offsets, mask=mask) 48 | # Compute gradient of ReLU: 49 | # For each element, if x > 0, gradient is grad_out; otherwise, it is 0. 50 | grad_in = tl.where(x > 0, grad_out, 0.0) 51 | tl.store(grad_input_ptr + offsets, grad_in, mask=mask) 52 | 53 | # ------------------------------------------------------------------------------ 54 | # Custom Autograd Function Using Triton Kernels 55 | # ------------------------------------------------------------------------------ 56 | 57 | class TritonReLUFunction(torch.autograd.Function): 58 | @staticmethod 59 | def forward(ctx, x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 60 | """ 61 | Forward pass of the ReLU activation using the Triton kernel. 62 | Saves the input tensor for use in the backward pass. 63 | """ 64 | N = x.numel() 65 | y = torch.empty_like(x) 66 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 67 | # Launch the forward kernel. 68 | relu_forward_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 69 | # Save input tensor for the backward pass. 70 | ctx.save_for_backward(x) 71 | ctx.BLOCK_SIZE = BLOCK_SIZE 72 | return y 73 | 74 | @staticmethod 75 | def backward(ctx, grad_output: torch.Tensor) -> tuple: 76 | """ 77 | Backward pass computes the gradient of the ReLU activation. 78 | """ 79 | x, = ctx.saved_tensors 80 | N = x.numel() 81 | grad_input = torch.empty_like(x) 82 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 83 | BLOCK_SIZE = ctx.BLOCK_SIZE 84 | # Launch the backward kernel. 85 | relu_backward_kernel[grid](x, grad_output, grad_input, N, BLOCK_SIZE=BLOCK_SIZE) 86 | # Return the gradient for x and None for BLOCK_SIZE (not a tensor). 87 | return grad_input, None 88 | 89 | # Convenience function to call our custom autograd ReLU. 90 | def triton_relu(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 91 | return TritonReLUFunction.apply(x, BLOCK_SIZE) 92 | 93 | # ------------------------------------------------------------------------------ 94 | # Benchmarking Function 95 | # ------------------------------------------------------------------------------ 96 | 97 | def benchmark(func, *args, n_warmup=10, n_iters=100): 98 | """ 99 | Benchmarks a function by running warm-up iterations followed by timed iterations. 100 | 101 | Args: 102 | func (callable): The function to benchmark. 103 | *args: Arguments to pass to the function. 104 | n_warmup (int): Number of warm-up iterations. 105 | n_iters (int): Number of iterations for timing. 106 | 107 | Returns: 108 | float: Average execution time per iteration in milliseconds. 109 | """ 110 | # Warm-up iterations. 
111 | for _ in range(n_warmup): 112 | func(*args) 113 | torch.cuda.synchronize() 114 | 115 | start = time.perf_counter() 116 | for _ in range(n_iters): 117 | func(*args) 118 | torch.cuda.synchronize() 119 | end = time.perf_counter() 120 | return (end - start) / n_iters * 1000 121 | 122 | # ------------------------------------------------------------------------------ 123 | # Main: Test and Benchmark the Autograd-Compatible ReLU 124 | # ------------------------------------------------------------------------------ 125 | 126 | if __name__ == '__main__': 127 | # Create a random input tensor on the GPU with gradient tracking. 128 | N = 1024 * 1024 # 1 million elements 129 | x = torch.randn(N, device='cuda', dtype=torch.float32, requires_grad=True) 130 | BLOCK_SIZE = 1024 131 | 132 | # Forward pass using our custom Triton ReLU. 133 | y_triton = triton_relu(x, BLOCK_SIZE) 134 | # Define a dummy loss (sum of outputs) and perform backward pass. 135 | loss_triton = y_triton.sum() 136 | loss_triton.backward() 137 | 138 | # For validation, compare against PyTorch's built-in ReLU. 139 | x_torch = x.detach().clone().requires_grad_() 140 | y_torch = torch.relu(x_torch) 141 | loss_torch = y_torch.sum() 142 | loss_torch.backward() 143 | 144 | # Check if the gradients match. 145 | if torch.allclose(x.grad, x_torch.grad, atol=1e-4): 146 | print("Success: Triton autograd ReLU backward matches PyTorch!") 147 | else: 148 | print("Error: The gradients do not match.") 149 | 150 | # Benchmark the forward pass. 151 | triton_time = benchmark(lambda: triton_relu(x, BLOCK_SIZE)) 152 | torch_time = benchmark(lambda: torch.relu(x)) 153 | print(f"Average execution time (Forward Pass):") 154 | print(f" Triton ReLU = {triton_time:.3f} ms") 155 | print(f" PyTorch ReLU = {torch_time:.3f} ms") 156 | -------------------------------------------------------------------------------- /daily_challange/day1/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle2: Vector Addition with Triton and PyTorch 2 | 3 | This repository contains a simple example of how to add two vectors using a custom GPU kernel written in [Triton](https://github.com/openai/triton) and compares the result to a standard PyTorch implementation. The result of both implementations is the same. 4 | 5 | ## Overview 6 | 7 | - **Triton Kernel:** small GPU kernel that divides the input vectors into blocks. Each kernel instance computes the addition for a block of elements. 8 | - **PyTorch Implementation:** simple element‑wise addition using PyTorch’s built-in tensor operations. 9 | 10 | This example demonstrates how to write a Triton kernel, launch it from Python, and verify that the computed result is identical to that of PyTorch. 
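Concretely, the launch grid is sized with `triton.cdiv`. For the vector length used later in this example (`n = 1024 * 10`) and the default `BLOCK_SIZE = 1024`, the arithmetic works out as follows (a small standalone sketch):

```python
import triton

n, BLOCK_SIZE = 1024 * 10, 1024
num_programs = triton.cdiv(n, BLOCK_SIZE)  # ceil(10240 / 1024) = 10
grid = (num_programs,)                     # ten program instances, one block of 1024 elements each
print(grid)                                # (10,)
```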
11 | 12 | ## Requirements 13 | 14 | - Python 3.8+ 15 | - [PyTorch](https://pytorch.org/) (with CUDA support) 16 | - [Triton](https://github.com/openai/triton) 17 | Install via pip: 18 | 19 | ```bash 20 | pip install triton 21 | ``` 22 | 23 | ## Code 24 | 25 | Below is the full code example: 26 | 27 | ```python 28 | import torch 29 | import triton 30 | import triton.language as tl 31 | 32 | # ------------------------------------------------------------------------------ 33 | # Triton Kernel for Vector Addition 34 | # ------------------------------------------------------------------------------ 35 | 36 | @triton.jit 37 | def vector_add_kernel( 38 | A_ptr, # Pointer to first input vector A 39 | B_ptr, # Pointer to second input vector B 40 | C_ptr, # Pointer to output vector C 41 | n_elements: tl.constexpr, # Number of elements in the vectors 42 | BLOCK_SIZE: tl.constexpr # Block size (number of elements per program instance) 43 | ): 44 | # Each program instance (kernel instance) computes a block of elements. 45 | pid = tl.program_id(0) # 1D grid: get the program id (i.e. block index) 46 | # Compute the offsets for the current block 47 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 48 | # Create a mask to avoid out-of-bound accesses 49 | mask = offsets < n_elements 50 | 51 | # Load the corresponding elements from A and B 52 | a = tl.load(A_ptr + offsets, mask=mask) 53 | b = tl.load(B_ptr + offsets, mask=mask) 54 | 55 | # Perform element-wise addition 56 | c = a + b 57 | 58 | # Store the result into the output pointer C 59 | tl.store(C_ptr + offsets, c, mask=mask) 60 | 61 | # ------------------------------------------------------------------------------ 62 | # Python Wrapper Function for the Triton Kernel 63 | # ------------------------------------------------------------------------------ 64 | 65 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 66 | """ 67 | Adds two vectors A and B using the Triton kernel. 68 | 69 | Args: 70 | A (torch.Tensor): First input vector (on CUDA). 71 | B (torch.Tensor): Second input vector (on CUDA). 72 | BLOCK_SIZE (int): Number of elements per block for the kernel. 73 | 74 | Returns: 75 | torch.Tensor: Output vector containing the element-wise sum. 76 | """ 77 | assert A.numel() == B.numel(), "Input vectors must have the same number of elements." 
78 | n_elements = A.numel() 79 | # Create an empty tensor for the result (same size and device as A) 80 | C = torch.empty_like(A) 81 | 82 | # Define grid: number of blocks needed to cover all elements 83 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 84 | 85 | # Launch the kernel 86 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 87 | return C 88 | 89 | # ------------------------------------------------------------------------------ 90 | # Main: Compare Triton Kernel with PyTorch Implementation 91 | # ------------------------------------------------------------------------------ 92 | 93 | if __name__ == '__main__': 94 | # Create two example vectors on the GPU 95 | n = 1024 * 10 # total number of elements 96 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 97 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 98 | 99 | # Add the vectors using the Triton kernel 100 | C_triton = vector_add_triton(A, B) 101 | 102 | # Add the vectors using PyTorch (for verification) 103 | C_pytorch = A + B 104 | 105 | # Verify that the results are the same 106 | if torch.allclose(C_triton, C_pytorch): 107 | print("Success: The Triton kernel result matches the PyTorch result!") 108 | else: 109 | print("Error: The results do not match.") 110 | 111 | # Print part of the result for inspection 112 | print("Result (first 10 elements):", C_triton[:10]) 113 | ``` 114 | 115 | ## Code Explanation 116 | 117 | ### 1. The Triton kernel (`vector_add_kernel`) 118 | - **Kernel signature:** 119 | the kernel receives pointers to the input arrays (`A_ptr` and `B_ptr`), a pointer for the output array (`C_ptr`), the total number of elements (`n_elements`), and a compile-time constant `BLOCK_SIZE`. 120 | 121 | - **Kernel indexing:** 122 | `pid = tl.program_id(0)` retrieves the unique index for the current block. Using this, we compute the starting offsets for each block. 123 | 124 | - **Boundary checking:** 125 | mask (`mask = offsets < n_elements`) is used to ensure that only valid elements are loaded and stored, which is important when the total number of elements is not a multiple of `BLOCK_SIZE`. 126 | 127 | - **Memory operations:** 128 | the `tl.load` function reads elements from memory, and `tl.store` writes the computed result back. 129 | 130 | ### 2. Python wrapper function (`vector_add_triton`) 131 | - **Input Validation:** 132 | We ensure both input vectors have the same number of elements. 133 | 134 | - **Result tensor:** 135 | an output tensor `C` is allocated with the same shape and device as the input vectors. 136 | 137 | - **Kernel launch configuration:** 138 | the grid is computed using `triton.cdiv(n_elements, meta['BLOCK_SIZE'])` which determines how many blocks are needed. 139 | 140 | - **Kernel launch:** 141 | the Triton kernel is launched with the computed grid and the provided parameters. 142 | 143 | ### 3. PyTorch comparison 144 | - **PyTorch addition:** 145 | the same vector addition is performed using PyTorch's built-in operator (`A + B`). 146 | 147 | - **Verification:** 148 | `torch.allclose` checks that the results from both methods are nearly identical. 149 | 150 | ## Conclusion 151 | 152 | This example demonstrates a minimal Triton kernel for vector addition. Triton allows you to write custom GPU kernels in Python with a syntax similar to CUDA, enabling you to optimize performance-critical operations. 
The comparison with PyTorch’s built-in vector addition shows that custom kernels can be both simple to write and produce correct results. 153 | 154 | Feel free to clone this repository, experiment with different block sizes, and extend this example to more complex operations. 155 | -------------------------------------------------------------------------------- /daily_challange/day3/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 4: ReLU Activation with Triton 2 | 3 | In this challenge, you will implement the ReLU (Rectified Linear Unit) activation function using Triton. ReLU is defined as: 4 | 5 | ReLU(x)=max(0,x) 6 | 7 | For each element in the input vector, the kernel computes the maximum between the element and 0. This example compares the custom Triton implementation to PyTorch’s built-in ReLU, and it also includes a benchmarking function for performance measurement. 8 | 9 | ## Key points 10 | 11 | - **1D grid processing:** the kernel uses a one-dimensional grid of program IDs. Each kernel instance processes a block of elements. 12 | - **Block-based computation:** the vector is processed in blocks with a configurable block size. 13 | - **Element-wise operation:** for each element, the kernel computes `y = max(0, x)`. 14 | 15 | ## Full code example 16 | 17 | ```python 18 | import time 19 | import torch 20 | import triton 21 | import triton.language as tl 22 | 23 | # ------------------------------------------------------------------------------ 24 | # Triton Kernel for ReLU Activation 25 | # ------------------------------------------------------------------------------ 26 | 27 | @triton.jit 28 | def relu_kernel( 29 | x_ptr, # Pointer to the input vector x 30 | y_ptr, # Pointer to the output vector y 31 | N: tl.constexpr, # Total number of elements in the input vector 32 | BLOCK_SIZE: tl.constexpr # Block size: number of elements processed per kernel instance 33 | ): 34 | # Each kernel instance processes a block of elements. 35 | # Get the current program ID along the 1D grid. 36 | pid = tl.program_id(0) 37 | 38 | # Compute the offsets for the block of elements this kernel instance will process. 39 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 40 | 41 | # Create a mask to ensure we do not access out-of-bound memory. 42 | mask = offsets < N 43 | 44 | # Load elements from the input pointer. 45 | x = tl.load(x_ptr + offsets, mask=mask) 46 | 47 | # Compute the ReLU activation: y = max(0, x) 48 | y = tl.maximum(x, 0.0) 49 | 50 | # Store the result back to the output pointer. 51 | tl.store(y_ptr + offsets, y, mask=mask) 52 | 53 | # ------------------------------------------------------------------------------ 54 | # Python Wrapper Function for the Triton ReLU Kernel 55 | # ------------------------------------------------------------------------------ 56 | 57 | def relu_triton(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 58 | """ 59 | Applies the ReLU activation function on the input vector x using a Triton kernel. 60 | 61 | Args: 62 | x (torch.Tensor): Input tensor on CUDA. 63 | BLOCK_SIZE (int): Number of elements processed per kernel instance. 64 | 65 | Returns: 66 | torch.Tensor: Output tensor after applying ReLU activation. 67 | """ 68 | N = x.numel() 69 | # Allocate the output tensor with the same shape and device as the input. 70 | y = torch.empty_like(x) 71 | 72 | # Configure the grid: number of blocks required to cover all N elements. 
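    # Worked example: in the benchmark below, N = 1024 * 1024 and BLOCK_SIZE = 1024,
    # so triton.cdiv(N, BLOCK_SIZE) = 1024 and the launch grid is (1024,).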
73 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 74 | 75 | # Launch the Triton kernel. 76 | relu_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 77 | return y 78 | 79 | # ------------------------------------------------------------------------------ 80 | # Benchmarking Function 81 | # ------------------------------------------------------------------------------ 82 | 83 | def benchmark(func, *args, n_warmup=10, n_iters=100): 84 | """ 85 | Benchmarks a function by performing warm-up iterations followed by timed iterations. 86 | 87 | Args: 88 | func (callable): The function to benchmark. 89 | *args: Arguments to pass to the function. 90 | n_warmup (int): Number of warm-up iterations. 91 | n_iters (int): Number of iterations for timing. 92 | 93 | Returns: 94 | float: Average execution time per iteration in milliseconds. 95 | """ 96 | # Warm-up: execute the function several times to mitigate initial overhead. 97 | for _ in range(n_warmup): 98 | func(*args) 99 | torch.cuda.synchronize() # Wait for all GPU operations to finish. 100 | 101 | # Timing the execution. 102 | start = time.perf_counter() 103 | for _ in range(n_iters): 104 | func(*args) 105 | torch.cuda.synchronize() # Ensure all GPU operations are complete. 106 | end = time.perf_counter() 107 | 108 | avg_time_ms = (end - start) / n_iters * 1000 109 | return avg_time_ms 110 | 111 | # ------------------------------------------------------------------------------ 112 | # Main: Test and Benchmark the Triton ReLU Kernel 113 | # ------------------------------------------------------------------------------ 114 | 115 | if __name__ == '__main__': 116 | # Create an example input vector on the GPU. 117 | N = 1024 * 1024 # For instance, 1 million elements. 118 | x = torch.randn(N, device='cuda', dtype=torch.float32) 119 | 120 | # Apply ReLU using the Triton kernel. 121 | y_triton = relu_triton(x) 122 | 123 | # Apply ReLU using PyTorch for validation. 124 | y_torch = torch.relu(x) 125 | 126 | # Verify that both outputs are the same. 127 | if torch.allclose(y_triton, y_torch): 128 | print("Success: Triton ReLU matches PyTorch ReLU!") 129 | else: 130 | print("Error: The Triton ReLU output does not match PyTorch.") 131 | 132 | # Benchmark the Triton kernel. 133 | triton_time = benchmark(relu_triton, x) 134 | print(f"Average execution time (Triton ReLU): {triton_time:.3f} ms") 135 | 136 | # Benchmark PyTorch’s built-in ReLU. 137 | torch_time = benchmark(torch.relu, x) 138 | print(f"Average execution time (PyTorch ReLU): {torch_time:.3f} ms") 139 | ``` 140 | 141 | ## Code explanation 142 | 143 | ### 1. The Triton kernel (`relu_kernel`) 144 | - **Kernel signature:** 145 | The kernel takes pointers for the input (`x_ptr`) and output (`y_ptr`) vectors, along with the total number of elements (`N`) and a compile-time constant `BLOCK_SIZE`. 146 | 147 | - **Program ID and offsets:** 148 | The kernel retrieves its program ID using `tl.program_id(0)` and computes the element offsets within the vector for the current block. 149 | 150 | - **Masking:** 151 | A mask is created (`mask = offsets < N`) to prevent out-of-bound memory accesses when the vector size is not an exact multiple of the block size. 152 | 153 | - **ReLU computation:** 154 | The kernel loads the input elements, computes the maximum between each element and 0 using `tl.maximum(x, 0.0)`, and then stores the result. 155 | 156 | ### 2. 
Python wrapper function (`relu_triton`) 157 | - **Purpose:** 158 | This function sets up the output tensor and computes the grid configuration needed to launch the kernel. It then calls the Triton kernel with the correct arguments. 159 | 160 | - **Grid configuration:** 161 | The grid is computed with `triton.cdiv(N, meta['BLOCK_SIZE'])` ensuring all elements are processed even if the total number of elements isn’t an exact multiple of the block size. 162 | 163 | ### 3. Benchmarking function (`benchmark`) 164 | - **Warm-up iterations:** 165 | Several warm-up iterations help avoid measuring the initial overhead such as CUDA context initialization. 166 | 167 | - **Timing:** 168 | The function measures the average execution time over a set number of iterations. Synchronization (`torch.cuda.synchronize()`) is used before and after the timing loop to ensure accurate measurement. 169 | 170 | ### 4. Main routine 171 | - **Setup:** 172 | A large random input vector is generated on the GPU. 173 | 174 | - **Validation:** 175 | The output from the Triton kernel is compared with PyTorch’s `torch.relu` to ensure the correctness of the implementation. 176 | 177 | - **Benchmarking:** 178 | Both the Triton and PyTorch ReLU functions are benchmarked, and their average execution times are printed. 179 | 180 | ## Conclusion 181 | 182 | This puzzle demonstrates how to implement a ReLU activation function using Triton. By comparing it with PyTorch’s implementation and measuring performance, you gain practical insight into writing and optimizing custom GPU kernels. This is another step forward in your Daily Triton Challenge as you explore GPU programming from basic to more advanced operations. 183 | -------------------------------------------------------------------------------- /daily_challange/day2/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle3: Vector Addition with Triton and PyTorch (with Benchmarking) 2 | 3 | This repository demonstrates how to add two vectors element‑wise using a custom Triton GPU kernel and compares the performance with a PyTorch implementation. A benchmarking function is included to measure the average execution time for each method. 4 | 5 | ## Overview 6 | 7 | - **Triton kernel:** custom GPU kernel that divides the input vectors into blocks and performs element‑wise addition. 8 | - **PyTorch implementation:** simple vector addition using PyTorch’s built‑in tensor operations. 9 | - **Benchmarking function:** helper function that performs warm‑up runs and measures the average execution time over several iterations. 
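Triton also ships its own timing helper, `triton.testing.do_bench`, which handles warm-up and repetition internally (its defaults and return mode vary between Triton versions). The sketch below shows how it could be used next to the hand-rolled `benchmark` function, assuming the `vector_add_triton` wrapper from the full listing:

```python
import torch
from triton.testing import do_bench

# Assumes vector_add_triton from the listing below, with CUDA input tensors.
A = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32)
B = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32)

triton_ms = do_bench(lambda: vector_add_triton(A, B))  # time in milliseconds
torch_ms = do_bench(lambda: A + B)
print(f"Triton: {triton_ms:.3f} ms | PyTorch: {torch_ms:.3f} ms")
```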
10 | 11 | ## Requirements 12 | 13 | - Python 3.8+ 14 | - [PyTorch](https://pytorch.org/) (with CUDA support) 15 | - [Triton](https://github.com/openai/triton) 16 | Install via pip: 17 | 18 | ```bash 19 | pip install triton 20 | ``` 21 | 22 | ## Full Code Example 23 | 24 | ```python 25 | import time 26 | import torch 27 | import triton 28 | import triton.language as tl 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Triton Kernel for Vector Addition 32 | # ------------------------------------------------------------------------------ 33 | 34 | @triton.jit 35 | def vector_add_kernel( 36 | A_ptr, # Pointer to the first input vector A 37 | B_ptr, # Pointer to the second input vector B 38 | C_ptr, # Pointer to the output vector C 39 | n_elements: tl.constexpr, # Total number of elements in the vectors 40 | BLOCK_SIZE: tl.constexpr # Block size (number of elements processed per kernel instance) 41 | ): 42 | # Get the current program (block) ID 43 | pid = tl.program_id(0) 44 | # Compute the offsets for the current block 45 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 46 | # Create a mask to avoid accessing out-of-bound indices 47 | mask = offsets < n_elements 48 | 49 | # Load the elements from A and B with the computed offsets 50 | a = tl.load(A_ptr + offsets, mask=mask) 51 | b = tl.load(B_ptr + offsets, mask=mask) 52 | 53 | # Perform element-wise addition 54 | c = a + b 55 | 56 | # Store the result in C using the mask to ensure only valid writes 57 | tl.store(C_ptr + offsets, c, mask=mask) 58 | 59 | # ------------------------------------------------------------------------------ 60 | # Python Wrapper for the Triton Kernel 61 | # ------------------------------------------------------------------------------ 62 | 63 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 64 | """ 65 | Adds two vectors using the Triton kernel. 66 | 67 | Args: 68 | A (torch.Tensor): First input vector (on CUDA). 69 | B (torch.Tensor): Second input vector (on CUDA). 70 | BLOCK_SIZE (int): Number of elements per block for the kernel. 71 | 72 | Returns: 73 | torch.Tensor: Output vector with the element-wise sum. 74 | """ 75 | n_elements = A.numel() 76 | # Allocate the output tensor (same shape and device as A) 77 | C = torch.empty_like(A) 78 | 79 | # Define the grid (number of blocks) required to cover all elements 80 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 81 | 82 | # Launch the Triton kernel 83 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 84 | return C 85 | 86 | # ------------------------------------------------------------------------------ 87 | # Benchmarking Function 88 | # ------------------------------------------------------------------------------ 89 | 90 | def benchmark(func, *args, n_warmup=10, n_iters=100): 91 | """ 92 | Benchmarks a function by running warm-up iterations followed by timed iterations. 93 | 94 | Args: 95 | func (callable): The function to benchmark. 96 | *args: Arguments to pass to the function. 97 | n_warmup (int): Number of warm-up iterations (to exclude startup overhead). 98 | n_iters (int): Number of iterations for timing. 99 | 100 | Returns: 101 | float: Average execution time per iteration in milliseconds. 102 | """ 103 | # Warm-up runs to ensure any one-time setup is complete (e.g. 
CUDA context) 104 | for _ in range(n_warmup): 105 | func(*args) 106 | torch.cuda.synchronize() # Ensure warm-up kernels have finished 107 | 108 | # Start timing 109 | start = time.perf_counter() 110 | for _ in range(n_iters): 111 | func(*args) 112 | torch.cuda.synchronize() # Wait for all GPU operations to finish 113 | end = time.perf_counter() 114 | 115 | # Calculate the average execution time (in milliseconds) 116 | avg_time_ms = (end - start) / n_iters * 1000 117 | return avg_time_ms 118 | 119 | # ------------------------------------------------------------------------------ 120 | # Main: Compare and Benchmark Triton Kernel vs. PyTorch Implementation 121 | # ------------------------------------------------------------------------------ 122 | 123 | if __name__ == '__main__': 124 | # Create two example vectors on the GPU (stress test with a large number of elements) 125 | n = 1024 * 1024 * 10 # e.g., 10 million elements 126 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 127 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 128 | 129 | # Validate correctness by comparing results from Triton and PyTorch 130 | C_triton = vector_add_triton(A, B) 131 | C_pytorch = A + B 132 | 133 | if torch.allclose(C_triton, C_pytorch): 134 | print("Success: The Triton result matches the PyTorch result!") 135 | else: 136 | print("Error: The results do not match.") 137 | 138 | # Benchmark the Triton kernel 139 | triton_time = benchmark(vector_add_triton, A, B, n_warmup=10, n_iters=100) 140 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 141 | 142 | # Benchmark the PyTorch implementation 143 | def pytorch_add(A, B): 144 | return A + B 145 | 146 | pytorch_time = benchmark(pytorch_add, A, B, n_warmup=10, n_iters=100) 147 | print(f"Average execution time (PyTorch): {pytorch_time:.3f} ms") 148 | ``` 149 | 150 | ## Code explanation 151 | 152 | ### 1. Triton kernel (`vector_add_kernel`) 153 | - **Kernel signature:** 154 | the kernel receives pointers to vectors A, B, and C, along with the total number of elements and the block size (a compile‑time constant). 155 | - **Indexing and masking:** 156 | each kernel instance computes a block of element offsets and uses a mask to prevent out‑of-bound memory accesses. 157 | - **Memory operations:** 158 | the kernel loads values from A and B, computes their sum, and writes the result to C. 159 | 160 | ### 2. Python wrapper (`vector_add_triton`) 161 | - **Functionality:** 162 | this function prepares the input data, allocates the output tensor, and configures the grid for the Triton kernel launch. 163 | - **Kernel launch:** 164 | the kernel is launched using the computed grid configuration. 165 | 166 | ### 3. Benchmarking function (`benchmark`) 167 | - **Warm-up iterations:** number of warm-up iterations are executed to overcome any one-time overhead (such as CUDA context initialization). 168 | - **Timing:** 169 | The function uses Python’s `time.perf_counter()` to measure elapsed time over multiple iterations. 170 | - **Synchronization:** 171 | `torch.cuda.synchronize()` is called before starting and after completing the timed iterations to ensure that all GPU operations have finished. 172 | 173 | ### 4. Main routine 174 | - **Data Preparation:** 175 | two large vectors (10 million elements each) are created on the GPU. 176 | - **Validation:** 177 | the Triton and PyTorch implementations are compared using `torch.allclose()` to ensure correctness. 
178 | - **Benchmarking:** 179 | both implementations are benchmarked by measuring the average execution time over 100 iterations (after 10 warm-up iterations). The results are printed to the console. 180 | 181 | ## Conclusion 182 | 183 | This example shows how to implement and benchmark a custom Triton GPU kernel for vector addition alongside a standard PyTorch operation. With the included benchmarking function, you can stress test both implementations and compare their performance under various conditions. Feel free to modify the number of elements, block sizes, and iterations to explore performance characteristics further. 184 | -------------------------------------------------------------------------------- /daily_challange/day4/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 5: Autograd-Compatible ReLU with Triton 2 | 3 | In this challenge, you will implement the ReLU activation function in a way that is fully compatible with PyTorch’s autograd. That means you’ll write a custom autograd function that uses a Triton kernel for the forward pass (computing `y = max(0, x)`) and a second Triton kernel for the backward pass (computing the gradient of ReLU, where `grad_input = grad_output` if `x > 0` and `0` otherwise). 4 | 5 | ## Overview 6 | 7 | - **Forward kernel:** computes the ReLU activation on the input tensor. 8 | - **Backward kernel:** computes the gradient with respect to the input. 9 | - **Custom autograd function:** wraps the Triton kernels so that they can be used in PyTorch’s computational graph. 10 | - **Benchmarking and validation:** compare the custom function against PyTorch’s built‑in ReLU to ensure correctness and measure performance. 11 | 12 | ## Full code example 13 | 14 | ```python 15 | import time 16 | import torch 17 | import triton 18 | import triton.language as tl 19 | 20 | # ------------------------------------------------------------------------------ 21 | # Triton Kernel for ReLU Forward Pass 22 | # ------------------------------------------------------------------------------ 23 | 24 | @triton.jit 25 | def relu_forward_kernel( 26 | x_ptr, # Pointer to input tensor x 27 | y_ptr, # Pointer to output tensor y 28 | N: tl.constexpr, # Total number of elements in x 29 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 30 | ): 31 | # Get the current program (block) ID. 32 | pid = tl.program_id(0) 33 | # Compute offsets for this block. 34 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 35 | # Create a mask for out-of-bound indices. 36 | mask = offsets < N 37 | # Load input values. 38 | x = tl.load(x_ptr + offsets, mask=mask) 39 | # Compute ReLU: y = max(0, x) 40 | y = tl.maximum(x, 0.0) 41 | # Store the result. 
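    # (The store is masked as well: lanes with offsets >= N are skipped, so the kernel never writes out of bounds when N is not a multiple of BLOCK_SIZE.)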
42 | tl.store(y_ptr + offsets, y, mask=mask) 43 | 44 | # ------------------------------------------------------------------------------ 45 | # Triton Kernel for ReLU Backward Pass 46 | # ------------------------------------------------------------------------------ 47 | 48 | @triton.jit 49 | def relu_backward_kernel( 50 | x_ptr, # Pointer to saved input tensor x (from forward pass) 51 | grad_output_ptr, # Pointer to gradient of the output 52 | grad_input_ptr, # Pointer to store computed gradient with respect to x 53 | N: tl.constexpr, # Total number of elements in x 54 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 55 | ): 56 | pid = tl.program_id(0) 57 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 58 | mask = offsets < N 59 | # Load input values and gradient of output. 60 | x = tl.load(x_ptr + offsets, mask=mask) 61 | grad_out = tl.load(grad_output_ptr + offsets, mask=mask) 62 | # Compute gradient of ReLU: 63 | # For each element, if x > 0, gradient is grad_out; otherwise, it is 0. 64 | grad_in = tl.where(x > 0, grad_out, 0.0) 65 | tl.store(grad_input_ptr + offsets, grad_in, mask=mask) 66 | 67 | # ------------------------------------------------------------------------------ 68 | # Custom Autograd Function Using Triton Kernels 69 | # ------------------------------------------------------------------------------ 70 | 71 | class TritonReLUFunction(torch.autograd.Function): 72 | @staticmethod 73 | def forward(ctx, x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 74 | """ 75 | Forward pass of the ReLU activation using the Triton kernel. 76 | Saves the input tensor for use in the backward pass. 77 | """ 78 | N = x.numel() 79 | y = torch.empty_like(x) 80 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 81 | # Launch the forward kernel. 82 | relu_forward_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 83 | # Save input tensor for the backward pass. 84 | ctx.save_for_backward(x) 85 | ctx.BLOCK_SIZE = BLOCK_SIZE 86 | return y 87 | 88 | @staticmethod 89 | def backward(ctx, grad_output: torch.Tensor) -> tuple: 90 | """ 91 | Backward pass computes the gradient of the ReLU activation. 92 | """ 93 | x, = ctx.saved_tensors 94 | N = x.numel() 95 | grad_input = torch.empty_like(x) 96 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 97 | BLOCK_SIZE = ctx.BLOCK_SIZE 98 | # Launch the backward kernel. 99 | relu_backward_kernel[grid](x, grad_output, grad_input, N, BLOCK_SIZE=BLOCK_SIZE) 100 | # Return the gradient for x and None for BLOCK_SIZE (not a tensor). 101 | return grad_input, None 102 | 103 | # Convenience function to call our custom autograd ReLU. 104 | def triton_relu(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 105 | return TritonReLUFunction.apply(x, BLOCK_SIZE) 106 | 107 | # ------------------------------------------------------------------------------ 108 | # Benchmarking Function 109 | # ------------------------------------------------------------------------------ 110 | 111 | def benchmark(func, *args, n_warmup=10, n_iters=100): 112 | """ 113 | Benchmarks a function by running warm-up iterations followed by timed iterations. 114 | 115 | Args: 116 | func (callable): The function to benchmark. 117 | *args: Arguments to pass to the function. 118 | n_warmup (int): Number of warm-up iterations. 119 | n_iters (int): Number of iterations for timing. 120 | 121 | Returns: 122 | float: Average execution time per iteration in milliseconds. 123 | """ 124 | # Warm-up iterations. 
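    # The first call to a @triton.jit kernel also triggers its JIT compilation (and, on a cold start, CUDA context creation), so skipping warm-up would mostly measure that one-time cost instead of steady-state kernel time.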
125 | for _ in range(n_warmup): 126 | func(*args) 127 | torch.cuda.synchronize() 128 | 129 | start = time.perf_counter() 130 | for _ in range(n_iters): 131 | func(*args) 132 | torch.cuda.synchronize() 133 | end = time.perf_counter() 134 | return (end - start) / n_iters * 1000 135 | 136 | # ------------------------------------------------------------------------------ 137 | # Main: Test and Benchmark the Autograd-Compatible ReLU 138 | # ------------------------------------------------------------------------------ 139 | 140 | if __name__ == '__main__': 141 | # Create a random input tensor on the GPU with gradient tracking. 142 | N = 1024 * 1024 # 1 million elements 143 | x = torch.randn(N, device='cuda', dtype=torch.float32, requires_grad=True) 144 | BLOCK_SIZE = 1024 145 | 146 | # Forward pass using our custom Triton ReLU. 147 | y_triton = triton_relu(x, BLOCK_SIZE) 148 | # Define a dummy loss (sum of outputs) and perform backward pass. 149 | loss_triton = y_triton.sum() 150 | loss_triton.backward() 151 | 152 | # For validation, compare against PyTorch's built-in ReLU. 153 | x_torch = x.detach().clone().requires_grad_() 154 | y_torch = torch.relu(x_torch) 155 | loss_torch = y_torch.sum() 156 | loss_torch.backward() 157 | 158 | # Check if the gradients match. 159 | if torch.allclose(x.grad, x_torch.grad, atol=1e-4): 160 | print("Success: Triton autograd ReLU backward matches PyTorch!") 161 | else: 162 | print("Error: The gradients do not match.") 163 | 164 | # Benchmark the forward pass. 165 | triton_time = benchmark(lambda: triton_relu(x, BLOCK_SIZE)) 166 | torch_time = benchmark(lambda: torch.relu(x)) 167 | print(f"Average execution time (Forward Pass):") 168 | print(f" Triton ReLU = {triton_time:.3f} ms") 169 | print(f" PyTorch ReLU = {torch_time:.3f} ms") 170 | ``` 171 | 172 | ## Code explanation 173 | 174 | ### 1. Forward and backward triton kernels 175 | - **Forward kernel (`relu_forward_kernel`):** 176 | - Each kernel instance processes a block of elements. 177 | - For each element, it computes the ReLU activation: \( y = \max(0, x) \). 178 | - **Backward kernel (`relu_backward_kernel`):** 179 | - Loads the saved input and the gradient of the output. 180 | - Computes the gradient with respect to \( x \): if \( x > 0 \), the gradient remains the same as `grad_output`; otherwise, it is set to 0. 181 | 182 | ### 2. Custom autograd function (`TritonReLUFunction`) 183 | - **Forward method:** 184 | - Calls the Triton forward kernel. 185 | - Saves the input tensor for use in the backward pass. 186 | - **Backward method:** 187 | - Retrieves the saved input. 188 | - Calls the Triton backward kernel to compute the gradient. 189 | - Returns the computed gradient for \( x \). 190 | 191 | ### 3. Benchmarking 192 | - A helper function `benchmark` is provided to measure the average execution time of a function over multiple iterations. 193 | - The forward pass of both the custom Triton ReLU and PyTorch’s built‑in ReLU is benchmarked. 194 | 195 | ### 4. Main routine 196 | - A large random tensor is created with gradient tracking. 197 | - Both forward and backward passes are executed, and the gradients are compared for correctness. 198 | - Performance is measured and printed for comparison. 199 | 200 | ## Conclusion 201 | 202 | This puzzle demonstrates how to integrate Triton kernels with PyTorch’s autograd by implementing both forward and backward methods. 
By comparing the custom autograd function with PyTorch’s built‑in ReLU, you gain insight into the mechanics of GPU kernel programming and automatic differentiation. This is an essential step toward building more complex, high‑performance GPU operations with Triton. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
![Triton](assets/triton.png)
2 | 3 | # Triton OpenAI 4 | A curated list of resources for learning and exploring Triton, OpenAI's programming language for writing efficient GPU code. 5 | 6 | ## Official Documentation 7 | - [Official Triton Documentation](https://triton-lang.org/main/index.html) 8 | 9 | ## My daily challenge (Triton day by day) 10 | This project is a step-by-step learning journey where we implement various types of Triton kernels—from the simplest examples to more advanced applications—while exploring GPU programming with Triton. 11 | The goal of this repository is to help you (and others) get comfortable with Triton by: 12 | - **Starting simple:** begin with basic kernels such as vector addition, and understand the building blocks of writing GPU code with Triton. 13 | - **Incremental learning:** each day introduces a new challenge, progressively covering more complex topics, techniques, and optimizations. 14 | - **Hands-on experience:** code, test, and benchmark your kernels against standard implementations (e.g., PyTorch) to see performance improvements and better understand GPU behavior. 15 | 16 | **Daily challenges:** every day, a new challenge is posted in this repository. Each challenge focuses on a specific aspect of Triton, such as: 17 | - Basic operations (e.g., vector addition) 18 | - Memory management and optimizations 19 | - Advanced indexing and dynamic shapes 20 | - Multi-dimensional kernels 21 | - Reduction operations and more 22 | - **Detailed explanations:** each kernel comes with an in-depth explanation of the code, helping you understand the concepts behind the implementation. 23 | - **Benchmarking and stress tests:** learn how to measure performance by comparing custom Triton kernels with standard PyTorch implementations. Get hands-on experience with benchmarking on real-world GPU workloads. 24 | 25 | | Day | Kernel | Description | 26 | |---------------------|----------------------|----------------------------| 27 | | #1 | [Constant add](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day0) | This challenge is the first puzzle in our Daily Triton Challenge series. The goal is to write a Triton kernel that adds a constant value to each element of a vector. | 28 | | #2 | [Add two vectors](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day1) | A simple example that adds two vectors using a custom GPU kernel written in Triton and compares the result to a standard PyTorch implementation. | 29 | | #3 | [Add two vectors with speed benchmarking](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day2) | This is almost the same as #2, but we measure kernel execution speed and compare it to the PyTorch implementation.| 30 | | #4 | [ReLU Activation with Triton](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day3) | In this challenge, you will implement the ReLU (Rectified Linear Unit) activation function using Triton. ReLU is defined as: ReLU(x)=max(0,x)| 31 | | #5 | [ReLU Activation forward and backward](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day4) | In this challenge, you will implement the ReLU activation function in a way that is fully compatible with PyTorch’s autograd. That means you’ll write a custom autograd function that uses a Triton kernel for the forward pass (computing y = max(0, x)) and a second Triton kernel for the backward pass (computing the gradient of ReLU, where grad_input = grad_output if x > 0 and 0 otherwise). 
| 32 | 33 | ## Articles 34 | Gain deeper insights into Triton through these detailed articles: 35 | - Understanding the Triton Tutorials [Part 1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) and [Part 2](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) 36 | - [Softmax in OpenAI Triton](http://blog.nagi.fun/triton-intro-softmax) -> more detailed Fused Softmax Triton example explanation (step-by-step) 37 | - [Accelerating AI with Triton: A Deep Dive into Writing High-Performance GPU Code](https://medium.com/@nijesh-kanjinghat/accelerating-ai-with-triton-a-deep-dive-into-writing-high-performance-gpu-code-a1e4d66556cc) 38 | - [Accelerating Triton Dequantization Kernels for GPTQ](https://pytorch.org/blog/accelerating-triton/) 39 | - [Triton Tutorial #2](https://medium.com/@sherlockliao01/triton-tutorial-2-5de66cd2170d) 40 | - [Triton: OpenAI’s Innovative Programming Language for Custom Deep-Learning Primitives](https://blog.devgenius.io/triton-openais-innovative-programming-language-for-custom-deep-learning-primitives-485723b0b49) 41 | - [Triton Kernel Compilation Stages](https://pytorch.org/blog/triton-kernel-compilation-stages/) 42 | - Deep Dive into Triton Internals [Part 1](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals/), [Part 2](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals-2/) and [Part 3](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals-3/) 43 | - [Exploring Triton GPU programming for neural networks in Java](https://openjdk.org/projects/babylon/articles/triton) 44 | - [Using User-Defined Triton Kernels with torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html) 45 | - [Mamba: The Hard Way](https://srush.github.io/annotated-mamba/hard.html) 46 | - FP8: [Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton](https://pytorch.org/blog/accelerating-gemms-triton/) 47 | - FP8: [Deep Dive on CUTLASS Ping-Pong GEMM Kernel](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/) 48 | - FP8: [Deep Dive on the Hopper TMA Unit for FP8 GEMMs](https://pytorch.org/blog/hopper-tma-unit/) 49 | - [Technical Review on PyTorch2.0 and Triton](https://www.jokeren.tech/slides/Triton_bsc.pdf) 50 | - [Towards Agile Development of Efficient Deep Learning Operators](https://www.jokeren.tech/slides/triton_intel.pdf) 51 | - [Developing Triton Kernels on AMD GPUs](https://rocm.blogs.amd.com/artificial-intelligence/triton/README.html) 52 | - [CUDA-Free Inference for LLMs](https://pytorch.org/blog/cuda-free-inference-for-llms/) 53 | - [Enabling advanced GPU features in PyTorch - Warp Specialization](https://pytorch.org/blog/warp-specialization/) - Fully automated Triton warp specialization in Triton. 54 | - [Teaching AI to Write GPU Code: A Deep Dive into Reinforcement Fine-Tuning](https://predibase.com/blog/teaching-ai-to-write-gpu-code-a-deep-dive-into-reinforcement-fine-tuning) 55 | 56 | ## Blackwell and Triton 57 | - [Accelerating the Future: Triton on Blackwell Architecture](https://www.youtube.com/watch?v=RW2-HtWaOS0) 58 | - [OpenAI Triton on NVIDIA Blackwell Boosts AI Performance and Programmability](https://developer.nvidia.com/blog/openai-triton-on-nvidia-blackwell-boosts-ai-performance-and-programmability/) - Triton compiler now supports the NVIDIA Blackwell architecture. 
59 | - [Running PyTorch and Triton on the RTX 5080](https://webstorms.github.io/2025/02/06/5080-install.html) 60 | 61 | ## Research Papers 62 | Explore the academic foundation of Triton: 63 | - [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](https://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf) 64 | 65 | ## Videos 66 | Learn by watching these informative videos: 67 | - [Lecture 14: Practitioners Guide to Triton](https://www.youtube.com/watch?v=DdTsX6DQk24) and [notebook](https://github.com/gpu-mode/lectures/blob/main/lecture_014/A_Practitioners_Guide_to_Triton.ipynb) 68 | - [Lecture 29: Triton Internals](https://www.youtube.com/watch?v=njgow_zaJMw) 69 | - [Intro to Triton: Coding Softmax in PyTorch](https://www.youtube.com/watch?v=gyKBN1rnefI) 70 | - [Triton Vector Addition Kernel, part 1: Making the Shift to Parallel Programming](https://www.youtube.com/watch?v=MEZ7XhzTLEg&t) 71 | - [Tiled Matrix Multiplication in Triton - part 1](https://www.youtube.com/watch?v=OnZEBBJvWLU) 72 | - [Flash Attention derived and coded from first principles with Triton (Python)](https://www.youtube.com/watch?v=zy8ChVd_oTM) 73 | - [Triton GPU Kernels 101](https://www.youtube.com/playlist?list=PLPefVKO3tDxOJLAmCA75uShbe1z_RNqkQ) 74 | 75 | ## Triton community meetup 76 | Watch Triton community meetups to stay up to date with recent Triton topics. 77 | - [2024-11-09](https://youtu.be/N0eiYLWyNpc?si=n9T-X-0UaK3j1fXQ) 78 | 79 | ## Triton-Puzzles 80 | Challenge yourself with these engaging puzzles: 81 | - [To Solve](https://github.com/srush/Triton-Puzzles) 82 | - [Solved](https://github.com/alexzhang13/Triton-Puzzles-Solutions/blob/main/Triton_Puzzles_Solutions_alexzhang13.ipynb) 83 | 84 | ## Tools 85 | Enhance your Triton development workflow with these tools: 86 | - [Triton Deja-vu](https://github.com/IBM/triton-dejavu) A framework that reduces the autotune overhead of triton-lang to zero for well-known deployments. This small framework is based on the Triton autotuner and contributes two features to the Triton community: 1. storing and safely restoring autotuner states using JSON files, 2. ConfigSpaces to explore a defined space exhaustively. Additionally, it allows heuristics to be used in combination with the autotuner. 87 | - [Triton Profiler](https://github.com/triton-lang/triton/tree/c5a14cc00598014b303eebac831f19e8a66e9e1d/third_party/proton) and a video explaining how to use it: [Dev Tools: Proton/Interpreter](https://www.youtube.com/watch?v=Av1za_0o2Qs) 88 | - [Triton-Viz: A Visualization Toolkit for Programming with Triton](https://github.com/Deep-Learning-Profiling-Tools/triton-viz) 89 | - [Make Triton easier - Triton-util provides simple higher-level abstractions for frequent but repetitive tasks. 
This allows you to write code that is closer to how you actually think.](https://github.com/UmerHA/triton_util/tree/main) 90 | - [TritonBench is a collection of PyTorch operators used to evaluate the performance of Triton, and its integration with PyTorch.](https://github.com/pytorch-labs/tritonbench) 91 | - [TritonBench features two distinct channels: TritonBench-G and TritonBench-T, each with its own evaluation framework.](https://github.com/thunlp/TritonBench) 92 | 93 | ## Conferences 94 | Catch up on the latest advancements from Triton Conferences: 95 | - [2024 Conference Playlist](https://www.youtube.com/watch?v=nglpa_6cYYI&list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz) 96 | - [2023 Conference Playlist](https://www.youtube.com/watch?v=ZGU0Yw7mORE&list=PLc_vA1r0qoiRZfUC3o4_yjj0FtWvodKAz) 97 | 98 | ## Sample Kernels 99 | Explore practical implementations with these sample kernels: 100 | - [attorch is a subset of PyTorch's nn module, written purely in Python using OpenAI's Triton](https://github.com/BobMcDear/attorch) 101 | - [FlagGems is a high-performance general operator library implemented in OpenAI Triton. It aims to provide a suite of kernel functions to accelerate LLM training and inference.](https://github.com/FlagOpen/FlagGems) 102 | - [Kernl lets you run PyTorch transformer models several times faster on GPU with a single line of code, and is designed to be easily hackable.](https://github.com/ELS-RD/kernl) 103 | - [Liger-Kernel](https://github.com/linkedin/Liger-Kernel) 104 | - [Triton Kernels for Efficient Low-Bit Matrix Multiplication](https://github.com/mobiusml/gemlite) 105 | - [Unsloth Kernels](https://github.com/unslothai/unsloth/tree/main/unsloth/kernels) 106 | - [This is an attempt at implementing a Triton kernel for GPTQ inference. This code is based on the GPTQ-for-LLaMa codebase, which is itself based on the GPTQ codebase.](https://github.com/fpgaminer/GPTQ-triton) 107 | - [triton-index - Catalog of openly available Triton kernels](https://github.com/gpu-mode/triton-index) 108 | - [Triton-based implementation of Sparse Mixture-of-Experts (SMoE) on GPUs](https://github.com/shawntan/scattermoe) 109 | - [Variety of Triton and CUDA kernels for training and inference](https://github.com/pytorch-labs/applied-ai) 110 | - [EquiTriton is a project that seeks to implement high-performance kernels for commonly used building blocks in equivariant neural networks, enabling compute-efficient training and inference](https://github.com/IntelLabs/EquiTriton) 111 | - [Expanded collection of Neural Network activation functions and other function kernels in Triton by OpenAI.](https://github.com/dtunai/triton-activations) 112 | - [Fused kernels](https://github.com/kapilsh/cuda-mode-lecture) 113 | - [Triton activations](https://github.com/dtunai/triton-activations/tree/main) (feed-forward activations only) 114 | - [LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance](https://github.com/ModelTC/lightllm/tree/main/lightllm/common/basemodel/triton_kernel) 115 | - [Bitsandbytes - a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 & 4-bit quantization functions](https://github.com/bitsandbytes-foundation/bitsandbytes/tree/main/bitsandbytes/triton) 116 | - [MInference Triton Kernels - FlashAttention](https://github.com/microsoft/MInference) 117 | - [GridQuant](https://github.com/niconunezz/GridQuant) - This 
repository tries to implement the ideas presented in the blog post "Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton". Designed specifically for NVIDIA H100 GPUs, it leverages advanced features like float8 computation, Triton's high-performance GPU programming capabilities, and the Tensor Memory Accelerator (TMA). 118 | - [Efficient Triton implementations for Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention.](https://github.com/fla-org/native-sparse-attention) 119 | 120 | 121 | ## Triton integrations 122 | - [jax-triton](https://github.com/jax-ml/jax-triton) 123 | 124 | ## Triton backends 125 | - [Intel® XPU Backend for Triton](https://github.com/intel/intel-xpu-backend-for-triton) 126 | 127 | ## Triton communities 128 | - [CUDA-MODE](https://discord.gg/gpumode) 129 | --- 130 | 131 | ## Triton Kernel Index 132 | 133 | | Kernel | Description | Resource | 134 | |----------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 135 | | **VectorAdd** | A simple kernel that performs element-wise addition of two vectors. Useful for understanding the basics of GPU programming in Triton. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py) | 136 | | **Matmul** | An optimized kernel for matrix multiplication, achieving high performance by leveraging memory hierarchy and parallelism. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py) [Grouped GEMM](https://triton-lang.org/main/getting-started/tutorials/08-grouped-gemm.html#sphx-glr-getting-started-tutorials-08-grouped-gemm-py) | 137 | | **Softmax** | A kernel for efficient computation of the softmax function, commonly used in machine learning models like transformers. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](http://blog.nagi.fun/triton-intro-softmax) [3](https://triton-lang.org/main/getting-started/tutorials/02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py) | 138 | | **Dropout** | A kernel for implementing low-memory dropout, a regularization technique to prevent overfitting in neural networks. | [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py) | 139 | | **Layer Normalization** | A kernel for layer normalization, which normalizes activations within a layer to improve training stability in deep learning models. 
| [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py) [3](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/layer_norm.py) | 140 | | **Fused Attention** | A kernel that efficiently implements attention mechanisms by combining multiple operations, key to transformers and similar architectures. | [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html#sphx-glr-getting-started-tutorials-06-fused-attention-py) | 141 | | **Conv1d** | A kernel for 1D convolution, often used in processing sequential data like time series or audio signals. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/conv_kernels.py) | 142 | | **Conv2d** | A kernel for 2D convolution, a fundamental operation in computer vision tasks such as image classification or object detection. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/conv_kernels.py) | 143 | | **MultiheadAttention** | A kernel for multi-head attention, a crucial component in transformer-based models for capturing complex relationships in data. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/multi_head_attention_kernels.py) | 144 | | **Hardsigmoid** | A kernel for the Hardsigmoid activation function, an efficient approximation of the sigmoid function used in certain neural network layers. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/act_kernels.py) | 145 | | **GeLU** | GeLU | [1](https://rocm.blogs.amd.com/artificial-intelligence/triton/README.html) | 146 | | **GeGLU** | GeGLU | [1](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/geglu.py) | 147 | | **RMSNorm** | RMSNorm | [1](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/rms_norm.py) | 148 | 149 | ## Triton updates, news, new features 150 | - [Automatic Warp Specialization Optimization](https://github.com/triton-lang/triton/pull/5622) 151 | 152 | 153 | 154 | 155 | ### Contribution 156 | Feel free to contribute more resources or suggest updates by opening a pull request or issue in this repository. 157 | 158 | --- 159 | ### License 160 | This resource list is open-sourced under the MIT license. 161 | --------------------------------------------------------------------------------