├── assets ├── redme.txt └── triton.png ├── matmul ├── images │ ├── images.txt │ ├── m_001.jpg │ └── m_001-mask.jpg └── readme.md ├── daily_challange ├── day0 │ ├── add_constant.py │ └── readme.md ├── day1 │ ├── vector_add.py │ └── readme.md ├── day3 │ ├── relu.py │ └── readme.md ├── day2 │ ├── vector_add_benchmark.py │ └── readme.md └── day4 │ ├── relu.py │ └── readme.md └── README.md /assets/redme.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /matmul/images/images.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/triton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/assets/triton.png -------------------------------------------------------------------------------- /matmul/images/m_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/matmul/images/m_001.jpg -------------------------------------------------------------------------------- /matmul/images/m_001-mask.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rkinas/triton-resources/HEAD/matmul/images/m_001-mask.jpg -------------------------------------------------------------------------------- /daily_challange/day0/add_constant.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for Constant Addition 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def constant_add_kernel( 12 | x_ptr, # Pointer to the input vector x 13 | constant, # The constant value to add 14 | y_ptr, # Pointer to the output vector y 15 | N0: tl.constexpr, # Total number of elements in vector x (and y) 16 | BLOCK_SIZE: tl.constexpr # Block size, set equal to N0 17 | ): 18 | # Each kernel instance processes a block of elements. 19 | # With BLOCK_SIZE equal to N0, only one instance is launched. 20 | pid = tl.program_id(0) 21 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 22 | mask = offsets < N0 # Ensure we don't access out-of-bound indices 23 | 24 | # Load x values, add the constant, and store the result in y 25 | x = tl.load(x_ptr + offsets, mask=mask) 26 | y = x + constant 27 | tl.store(y_ptr + offsets, y, mask=mask) 28 | 29 | # ------------------------------------------------------------------------------ 30 | # Python Wrapper Function for the Triton Kernel 31 | # ------------------------------------------------------------------------------ 32 | 33 | def constant_add_triton(x: torch.Tensor, constant: float) -> torch.Tensor: 34 | """ 35 | Adds a constant to each element of the input vector x using a Triton kernel. 36 | 37 | The block size is set equal to the vector length (N0), meaning that only one 38 | kernel instance is launched. 39 | 40 | Args: 41 | x (torch.Tensor): Input vector on CUDA. 42 | constant (float): The constant to add to each element. 43 | 44 | Returns: 45 | torch.Tensor: Output vector with the constant added. 
46 | """ 47 | N0 = x.numel() 48 | BLOCK_SIZE = N0 # Block size equals the vector length 49 | y = torch.empty_like(x) 50 | 51 | # With BLOCK_SIZE = N0, our grid consists of a single block. 52 | grid = lambda meta: (1,) 53 | 54 | # Launch the Triton kernel 55 | constant_add_kernel[grid](x, constant, y, N0, BLOCK_SIZE=BLOCK_SIZE) 56 | return y 57 | 58 | # ------------------------------------------------------------------------------ 59 | # Main: Test Constant Add Kernel 60 | # ------------------------------------------------------------------------------ 61 | 62 | if __name__ == '__main__': 63 | # Create an example vector on the GPU. 64 | N0 = 1024 # Length of the vector 65 | x = torch.arange(0, N0, device='cuda', dtype=torch.float32) 66 | constant = 3.0 # The constant value to add 67 | 68 | # Compute the result using the Triton kernel. 69 | y_triton = constant_add_triton(x, constant) 70 | 71 | # Compute the result using PyTorch for verification. 72 | y_torch = x + constant 73 | 74 | # Verify correctness. 75 | if torch.allclose(y_triton, y_torch): 76 | print("Success: Triton kernel result matches PyTorch result!") 77 | else: 78 | print("Error: The results do not match.") 79 | -------------------------------------------------------------------------------- /daily_challange/day1/vector_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Triton Kernel for Vector Addition 7 | # ------------------------------------------------------------------------------ 8 | 9 | @triton.jit 10 | def vector_add_kernel( 11 | A_ptr, # Pointer to first input vector A 12 | B_ptr, # Pointer to second input vector B 13 | C_ptr, # Pointer to output vector C 14 | n_elements: tl.constexpr, # Number of elements in the vectors 15 | BLOCK_SIZE: tl.constexpr # Block size (number of elements per program instance) 16 | ): 17 | # Each program instance (kernel instance) computes a block of elements. 18 | pid = tl.program_id(0) # 1D grid: get the program id (i.e. block index) 19 | # Compute the offsets for the current block 20 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 21 | # Create a mask to avoid out-of-bound accesses 22 | mask = offsets < n_elements 23 | 24 | # Load the corresponding elements from A and B 25 | a = tl.load(A_ptr + offsets, mask=mask) 26 | b = tl.load(B_ptr + offsets, mask=mask) 27 | 28 | # Perform element-wise addition 29 | c = a + b 30 | 31 | # Store the result into the output pointer C 32 | tl.store(C_ptr + offsets, c, mask=mask) 33 | 34 | # ------------------------------------------------------------------------------ 35 | # Python Wrapper Function for the Triton Kernel 36 | # ------------------------------------------------------------------------------ 37 | 38 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 39 | """ 40 | Adds two vectors A and B using the Triton kernel. 41 | 42 | Args: 43 | A (torch.Tensor): First input vector (on CUDA). 44 | B (torch.Tensor): Second input vector (on CUDA). 45 | BLOCK_SIZE (int): Number of elements per block for the kernel. 46 | 47 | Returns: 48 | torch.Tensor: Output vector containing the element-wise sum. 49 | """ 50 | assert A.numel() == B.numel(), "Input vectors must have the same number of elements." 
51 | n_elements = A.numel() 52 | # Create an empty tensor for the result (same size and device as A) 53 | C = torch.empty_like(A) 54 | 55 | # Define grid: number of blocks needed to cover all elements 56 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the kernel 59 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 60 | return C 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Main: Compare Triton Kernel with PyTorch Implementation 64 | # ------------------------------------------------------------------------------ 65 | 66 | if __name__ == '__main__': 67 | # Create two example vectors on the GPU 68 | n = 1024 * 10 # total number of elements 69 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 70 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 71 | 72 | # Add the vectors using the Triton kernel 73 | C_triton = vector_add_triton(A, B) 74 | 75 | # Add the vectors using PyTorch (for verification) 76 | C_pytorch = A + B 77 | 78 | # Verify that the results are the same 79 | if torch.allclose(C_triton, C_pytorch): 80 | print("Success: The Triton kernel result matches the PyTorch result!") 81 | else: 82 | print("Error: The results do not match.") 83 | 84 | # Print part of the result for inspection 85 | print("Result (first 10 elements):", C_triton[:10]) 86 | -------------------------------------------------------------------------------- /matmul/readme.md: -------------------------------------------------------------------------------- 1 | # Matrix Multiplication with Triton 2 | 3 | This repository demonstrates matrix multiplication using the Triton framework. We provide two examples: 4 | 5 | 1. **Naive Matrix Multiplication (8x8)**: A straightforward implementation of matrix multiplication without masking. 6 | 2. **Masked Matrix Multiplication (8x6)**: An enhanced implementation showcasing the use of masking to handle uneven matrix sizes. 7 | 8 | ## Visualization 9 | 10 | ### Naive Matrix Multiplication (8x8) 11 | [View Visualization](https://claude.site/artifacts/1f66d58b-5c1e-4a88-8bb4-ddb47ed9bda1) 12 | 13 | ![Naive Matrix Multiplication](./images/m_001.jpg) 14 | 15 | ### Masked Matrix Multiplication (8x6) 16 | [View Visualization](https://claude.site/artifacts/cc485433-bc54-4af4-830f-50ac4f3eefca) 17 | 18 | ![Masked Matrix Multiplication](./images/m_001-mask.jpg) 19 | 20 | ## Implementation 21 | 22 | The examples are implemented using the Triton framework. 
Below is a clean and concise implementation of the naive matrix multiplication kernel and its corresponding Python integration: 23 | 24 | ```python 25 | import torch 26 | import triton 27 | import triton.language as tl 28 | from functools import partial 29 | 30 | DEVICE = 'cuda' 31 | 32 | @triton.jit 33 | def naive_matmul_kernel( 34 | a_ptr, b_ptr, c_ptr, 35 | m, n, k, 36 | stride_am, stride_ak, 37 | stride_bk, stride_bn, 38 | stride_cm, stride_cn, 39 | bm: tl.constexpr, bn: tl.constexpr, bk: tl.constexpr 40 | ): 41 | # Program IDs 42 | pid_m, pid_n = tl.program_id(0), tl.program_id(1) 43 | 44 | # Block-level starting indices 45 | rm = pid_m * bm + tl.arange(0, bm) 46 | rn = pid_n * bn + tl.arange(0, bn) 47 | rk = tl.arange(0, bk) 48 | 49 | # Boundary masks 50 | rm_mask = rm < m 51 | rn_mask = rn < n 52 | 53 | # Offsets 54 | offs_a = a_ptr + rm[:, None] * stride_am + rk[None, :] * stride_ak 55 | offs_b = b_ptr + rk[:, None] * stride_bk + rn[None, :] * stride_bn 56 | 57 | # Accumulator 58 | acc = tl.zeros((bm, bn), dtype=tl.float32) 59 | 60 | # Loop over the k dimension 61 | for k_idx in range(0, k, bk): 62 | k_mask = k_idx + rk < k 63 | a = tl.load(offs_a, mask=rm_mask[:, None] & k_mask[None, :], other=0.0) 64 | b = tl.load(offs_b, mask=k_mask[:, None] & rn_mask[None, :], other=0.0) 65 | acc += tl.dot(a, b) 66 | 67 | # Increment offsets 68 | offs_a += bk * stride_ak 69 | offs_b += bk * stride_bk 70 | 71 | # Write back results 72 | c = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn 73 | tl.store(c, acc, mask=rm_mask[:, None] & rn_mask[None, :]) 74 | 75 | # Python interface 76 | def matmul(a, b, kernel, block_size=32): 77 | m, k = a.shape 78 | _, n = b.shape 79 | 80 | c = torch.empty((m, n), device=a.device, dtype=a.dtype) 81 | 82 | grid = lambda meta: (triton.cdiv(m, meta['bm']), triton.cdiv(n, meta['bn'])) 83 | 84 | kernel[grid]( 85 | a, b, c, 86 | m, n, k, 87 | a.stride(0), a.stride(1), 88 | b.stride(0), b.stride(1), 89 | c.stride(0), c.stride(1), 90 | bm=block_size, bn=block_size, bk=block_size 91 | ) 92 | 93 | return c 94 | 95 | naive_matmul = partial(matmul, kernel=naive_matmul_kernel) 96 | 97 | # Example usage 98 | a = torch.ones((8, 8), dtype=torch.float32, device=DEVICE) 99 | b = torch.ones((8, 8), dtype=torch.float32, device=DEVICE) 100 | 101 | result = naive_matmul(a, b, block_size=8) 102 | 103 | expected = torch.matmul(a, b) 104 | assert torch.allclose(result, expected, rtol=1e-3, atol=1e-3) 105 | print("Test passed!") 106 | ``` 107 | 108 | ## Key Features 109 | 110 | - **Naive Implementation**: Demonstrates the basics of kernel programming with Triton. 111 | - **Masked Implementation**: Illustrates handling of uneven matrices using boundary masks. 
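For the masked (8x6) case pictured above, the same kernel can be reused unchanged: the `rm`/`rn` masks guard the row and column edges, while `k_mask` covers the final partial tile along the reduction dimension. Below is a minimal usage sketch under the same assumptions as the 8x8 example (the shapes are chosen purely for illustration):

```python
# Uneven shapes: (8 x 6) @ (6 x 8) -> (8 x 8). Reuses DEVICE and naive_matmul
# from the listing above; with block_size=8, k=6 is handled entirely by k_mask.
a = torch.randn((8, 6), dtype=torch.float32, device=DEVICE)
b = torch.randn((6, 8), dtype=torch.float32, device=DEVICE)

result = naive_matmul(a, b, block_size=8)
expected = torch.matmul(a, b)
assert torch.allclose(result, expected, rtol=1e-3, atol=1e-3)
print("Masked test passed!")
```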
112 | 113 | -------------------------------------------------------------------------------- /daily_challange/day3/relu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for ReLU Activation 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def relu_kernel( 12 | x_ptr, # Pointer to the input vector x 13 | y_ptr, # Pointer to the output vector y 14 | N: tl.constexpr, # Total number of elements in the input vector 15 | BLOCK_SIZE: tl.constexpr # Block size: number of elements processed per kernel instance 16 | ): 17 | # Each kernel instance processes a block of elements. 18 | # Get the current program ID along the 1D grid. 19 | pid = tl.program_id(0) 20 | 21 | # Compute the offsets for the block of elements this kernel instance will process. 22 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 23 | 24 | # Create a mask to ensure we do not access out-of-bound memory. 25 | mask = offsets < N 26 | 27 | # Load elements from the input pointer. 28 | x = tl.load(x_ptr + offsets, mask=mask) 29 | 30 | # Compute the ReLU activation: y = max(0, x) 31 | y = tl.maximum(x, 0.0) 32 | 33 | # Store the result back to the output pointer. 34 | tl.store(y_ptr + offsets, y, mask=mask) 35 | 36 | # ------------------------------------------------------------------------------ 37 | # Python Wrapper Function for the Triton ReLU Kernel 38 | # ------------------------------------------------------------------------------ 39 | 40 | def relu_triton(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 41 | """ 42 | Applies the ReLU activation function on the input vector x using a Triton kernel. 43 | 44 | Args: 45 | x (torch.Tensor): Input tensor on CUDA. 46 | BLOCK_SIZE (int): Number of elements processed per kernel instance. 47 | 48 | Returns: 49 | torch.Tensor: Output tensor after applying ReLU activation. 50 | """ 51 | N = x.numel() 52 | # Allocate the output tensor with the same shape and device as the input. 53 | y = torch.empty_like(x) 54 | 55 | # Configure the grid: number of blocks required to cover all N elements. 56 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the Triton kernel. 59 | relu_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 60 | return y 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Benchmarking Function 64 | # ------------------------------------------------------------------------------ 65 | 66 | def benchmark(func, *args, n_warmup=10, n_iters=100): 67 | """ 68 | Benchmarks a function by performing warm-up iterations followed by timed iterations. 69 | 70 | Args: 71 | func (callable): The function to benchmark. 72 | *args: Arguments to pass to the function. 73 | n_warmup (int): Number of warm-up iterations. 74 | n_iters (int): Number of iterations for timing. 75 | 76 | Returns: 77 | float: Average execution time per iteration in milliseconds. 78 | """ 79 | # Warm-up: execute the function several times to mitigate initial overhead. 80 | for _ in range(n_warmup): 81 | func(*args) 82 | torch.cuda.synchronize() # Wait for all GPU operations to finish. 83 | 84 | # Timing the execution. 
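    # Synchronizing once after the loop (rather than per call) amortizes the
    # synchronization cost; the value returned below is therefore an average
    # wall-clock time per call, including kernel-launch overhead.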
85 | start = time.perf_counter() 86 | for _ in range(n_iters): 87 | func(*args) 88 | torch.cuda.synchronize() # Ensure all GPU operations are complete. 89 | end = time.perf_counter() 90 | 91 | avg_time_ms = (end - start) / n_iters * 1000 92 | return avg_time_ms 93 | 94 | # ------------------------------------------------------------------------------ 95 | # Main: Test and Benchmark the Triton ReLU Kernel 96 | # ------------------------------------------------------------------------------ 97 | 98 | if __name__ == '__main__': 99 | # Create an example input vector on the GPU. 100 | N = 1024 * 1024 # For instance, 1 million elements. 101 | x = torch.randn(N, device='cuda', dtype=torch.float32) 102 | 103 | # Apply ReLU using the Triton kernel. 104 | y_triton = relu_triton(x) 105 | 106 | # Apply ReLU using PyTorch for validation. 107 | y_torch = torch.relu(x) 108 | 109 | # Verify that both outputs are the same. 110 | if torch.allclose(y_triton, y_torch): 111 | print("Success: Triton ReLU matches PyTorch ReLU!") 112 | else: 113 | print("Error: The Triton ReLU output does not match PyTorch.") 114 | 115 | # Benchmark the Triton kernel. 116 | triton_time = benchmark(relu_triton, x) 117 | print(f"Average execution time (Triton ReLU): {triton_time:.3f} ms") 118 | 119 | # Benchmark PyTorch’s built-in ReLU. 120 | torch_time = benchmark(torch.relu, x) 121 | print(f"Average execution time (PyTorch ReLU): {torch_time:.3f} ms") 122 | -------------------------------------------------------------------------------- /daily_challange/day2/vector_add_benchmark.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for Vector Addition 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def vector_add_kernel( 12 | A_ptr, # Pointer to the first input vector A 13 | B_ptr, # Pointer to the second input vector B 14 | C_ptr, # Pointer to the output vector C 15 | n_elements: tl.constexpr, # Total number of elements in the vectors 16 | BLOCK_SIZE: tl.constexpr # Block size (number of elements processed per kernel instance) 17 | ): 18 | # Get the current program (block) ID 19 | pid = tl.program_id(0) 20 | # Compute the offsets for the current block 21 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 22 | # Create a mask to avoid accessing out-of-bound indices 23 | mask = offsets < n_elements 24 | 25 | # Load the elements from A and B with the computed offsets 26 | a = tl.load(A_ptr + offsets, mask=mask) 27 | b = tl.load(B_ptr + offsets, mask=mask) 28 | 29 | # Perform element-wise addition 30 | c = a + b 31 | 32 | # Store the result in C using the mask to ensure only valid writes 33 | tl.store(C_ptr + offsets, c, mask=mask) 34 | 35 | # ------------------------------------------------------------------------------ 36 | # Python Wrapper for the Triton Kernel 37 | # ------------------------------------------------------------------------------ 38 | 39 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 40 | """ 41 | Adds two vectors using the Triton kernel. 42 | 43 | Args: 44 | A (torch.Tensor): First input vector (on CUDA). 45 | B (torch.Tensor): Second input vector (on CUDA). 46 | BLOCK_SIZE (int): Number of elements per block for the kernel. 
47 | 48 | Returns: 49 | torch.Tensor: Output vector with the element-wise sum. 50 | """ 51 | n_elements = A.numel() 52 | # Allocate the output tensor (same shape and device as A) 53 | C = torch.empty_like(A) 54 | 55 | # Define the grid (number of blocks) required to cover all elements 56 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 57 | 58 | # Launch the Triton kernel 59 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 60 | return C 61 | 62 | # ------------------------------------------------------------------------------ 63 | # Benchmarking Function 64 | # ------------------------------------------------------------------------------ 65 | 66 | def benchmark(func, *args, n_warmup=10, n_iters=100): 67 | """ 68 | Benchmarks a function by running warm-up iterations followed by timed iterations. 69 | 70 | Args: 71 | func (callable): The function to benchmark. 72 | *args: Arguments to pass to the function. 73 | n_warmup (int): Number of warm-up iterations (to exclude startup overhead). 74 | n_iters (int): Number of iterations for timing. 75 | 76 | Returns: 77 | float: Average execution time per iteration in milliseconds. 78 | """ 79 | # Warm-up runs to ensure any one-time setup is complete (e.g. CUDA context) 80 | for _ in range(n_warmup): 81 | func(*args) 82 | torch.cuda.synchronize() # Ensure warm-up kernels have finished 83 | 84 | # Start timing 85 | start = time.perf_counter() 86 | for _ in range(n_iters): 87 | func(*args) 88 | torch.cuda.synchronize() # Wait for all GPU operations to finish 89 | end = time.perf_counter() 90 | 91 | # Calculate the average execution time (in milliseconds) 92 | avg_time_ms = (end - start) / n_iters * 1000 93 | return avg_time_ms 94 | 95 | # ------------------------------------------------------------------------------ 96 | # Main: Compare and Benchmark Triton Kernel vs. PyTorch Implementation 97 | # ------------------------------------------------------------------------------ 98 | 99 | if __name__ == '__main__': 100 | # Create two example vectors on the GPU (stress test with a large number of elements) 101 | n = 1024 * 1024 * 10 # e.g., 10 million elements 102 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 103 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 104 | 105 | # Validate correctness by comparing results from Triton and PyTorch 106 | C_triton = vector_add_triton(A, B) 107 | C_pytorch = A + B 108 | 109 | if torch.allclose(C_triton, C_pytorch): 110 | print("Success: The Triton result matches the PyTorch result!") 111 | else: 112 | print("Error: The results do not match.") 113 | 114 | # Benchmark the Triton kernel 115 | triton_time = benchmark(vector_add_triton, A, B, n_warmup=10, n_iters=100) 116 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 117 | 118 | # Benchmark the PyTorch implementation 119 | def pytorch_add(A, B): 120 | return A + B 121 | 122 | pytorch_time = benchmark(pytorch_add, A, B, n_warmup=10, n_iters=100) 123 | print(f"Average execution time (PyTorch): {pytorch_time:.3f} ms") 124 | -------------------------------------------------------------------------------- /daily_challange/day0/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 1: constant add 2 | 3 | This challenge is the first puzzle in our Daily Triton Challenge series. The goal is to write a Triton kernel that adds a constant value to each element of a vector. 
The key aspects of this puzzle are: 4 | 5 | - **One program ID axis:** we use a 1D grid, with a single kernel instance. 6 | - **Block size \(B_0\):** the block size is set equal to the length of the vector \(N_0\), so the kernel processes the entire vector in one go. 7 | - **Verification:** the result is compared against a simple PyTorch implementation. 8 | 9 | ## Full code example 10 | 11 | ```python 12 | import time 13 | import torch 14 | import triton 15 | import triton.language as tl 16 | 17 | # ------------------------------------------------------------------------------ 18 | # Triton Kernel for Constant Addition 19 | # ------------------------------------------------------------------------------ 20 | 21 | @triton.jit 22 | def constant_add_kernel( 23 | x_ptr, # Pointer to the input vector x 24 | constant, # The constant value to add 25 | y_ptr, # Pointer to the output vector y 26 | N0: tl.constexpr, # Total number of elements in vector x (and y) 27 | BLOCK_SIZE: tl.constexpr # Block size, set equal to N0 28 | ): 29 | # Each kernel instance processes a block of elements. 30 | # With BLOCK_SIZE equal to N0, only one instance is launched. 31 | pid = tl.program_id(0) 32 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 33 | mask = offsets < N0 # Ensure we don't access out-of-bound indices 34 | 35 | # Load x values, add the constant, and store the result in y 36 | x = tl.load(x_ptr + offsets, mask=mask) 37 | y = x + constant 38 | tl.store(y_ptr + offsets, y, mask=mask) 39 | 40 | # ------------------------------------------------------------------------------ 41 | # Python Wrapper Function for the Triton Kernel 42 | # ------------------------------------------------------------------------------ 43 | 44 | def constant_add_triton(x: torch.Tensor, constant: float) -> torch.Tensor: 45 | """ 46 | Adds a constant to each element of the input vector x using a Triton kernel. 47 | 48 | The block size is set equal to the vector length (N0), meaning that only one 49 | kernel instance is launched. 50 | 51 | Args: 52 | x (torch.Tensor): Input vector on CUDA. 53 | constant (float): The constant to add to each element. 54 | 55 | Returns: 56 | torch.Tensor: Output vector with the constant added. 57 | """ 58 | N0 = x.numel() 59 | BLOCK_SIZE = N0 # Block size equals the vector length 60 | y = torch.empty_like(x) 61 | 62 | # With BLOCK_SIZE = N0, our grid consists of a single block. 63 | grid = lambda meta: (1,) 64 | 65 | # Launch the Triton kernel 66 | constant_add_kernel[grid](x, constant, y, N0, BLOCK_SIZE=BLOCK_SIZE) 67 | return y 68 | 69 | # ------------------------------------------------------------------------------ 70 | # Main: Test the constant add kernel 71 | # ------------------------------------------------------------------------------ 72 | 73 | if __name__ == '__main__': 74 | # Create an example vector on the GPU. 75 | N0 = 1024 # Length of the vector 76 | x = torch.arange(0, N0, device='cuda', dtype=torch.float32) 77 | constant = 3.0 # The constant value to add 78 | 79 | # Compute the result using the Triton kernel. 80 | y_triton = constant_add_triton(x, constant) 81 | 82 | # Compute the result using PyTorch for verification. 83 | y_torch = x + constant 84 | 85 | # Verify correctness. 86 | if torch.allclose(y_triton, y_torch): 87 | print("Success: Triton kernel result matches PyTorch result!") 88 | else: 89 | print("Error: The results do not match.") 90 | 91 | # Benchmark the Triton kernel. 
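    # Note: `benchmark` is not defined in this snippet; below is a minimal sketch
    # of the timing helper used in the later puzzles (warm-up, then a timed loop).
    def benchmark(func, *args, n_warmup=10, n_iters=100):
        for _ in range(n_warmup):
            func(*args)
        torch.cuda.synchronize()  # Make sure warm-up work has finished.
        start = time.perf_counter()
        for _ in range(n_iters):
            func(*args)
        torch.cuda.synchronize()  # Wait for all timed kernels to complete.
        end = time.perf_counter()
        return (end - start) / n_iters * 1000  # Average time per call in ms.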
92 | triton_time = benchmark(constant_add_triton, x, constant) 93 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 94 | ``` 95 | 96 | ## Code explanation 97 | 98 | ### 1. The Triton kernel (`constant_add_kernel`) 99 | - **Kernel signature:** 100 | the kernel receives pointers for the input vector `x`, the constant value to add, and the output vector `y`. It also gets the total number of elements `N0` and a compile-time constant `BLOCK_SIZE`. 101 | 102 | - **Program ID and offsets:** 103 | `pid = tl.program_id(0)` obtains the current program ID along the single grid axis. Using this, the kernel calculates the offsets for each element in the block. Since `BLOCK_SIZE` is set equal to `N0`, only one block (one kernel instance) is needed. 104 | 105 | - **Boundary mask:** 106 | a mask (`mask = offsets < N0`) ensures safe memory accesses. 107 | 108 | - **Addition operation:** 109 | the kernel loads the data from `x`, adds the provided constant, and stores the result into `y`. 110 | 111 | ### 2. Python wrapper function (`constant_add_triton`) 112 | - **Purpose:** 113 | this function allocates the output tensor and configures the grid for launching the Triton kernel. 114 | 115 | - **Grid configuration:** 116 | with `BLOCK_SIZE = N0`, the grid is defined as `(1,)` since the entire vector is processed by a single kernel instance. 117 | 118 | ### 3. Main routine 119 | - **Setup:** 120 | a vector `x` of length 1024 is created on the GPU, and a constant value of 3.0 is chosen. 121 | 122 | - **Validation:** 123 | the Triton kernel’s output is compared to PyTorch's built-in addition to ensure correctness. 124 | 125 | ## Conclusion 126 | 127 | Puzzle 1: Constant Add is the first step in our Daily Triton Challenge. This simple yet effective exercise helps you grasp the basic structure of writing a Triton kernel, setting up the grid, and ensuring correct memory operations. As you progress, you'll build on these fundamentals to explore more advanced topics in GPU kernel programming with Triton. 128 | -------------------------------------------------------------------------------- /daily_challange/day4/relu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Triton Kernel for ReLU Forward Pass 8 | # ------------------------------------------------------------------------------ 9 | 10 | @triton.jit 11 | def relu_forward_kernel( 12 | x_ptr, # Pointer to input tensor x 13 | y_ptr, # Pointer to output tensor y 14 | N: tl.constexpr, # Total number of elements in x 15 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 16 | ): 17 | # Get the current program (block) ID. 18 | pid = tl.program_id(0) 19 | # Compute offsets for this block. 20 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 21 | # Create a mask for out-of-bound indices. 22 | mask = offsets < N 23 | # Load input values. 24 | x = tl.load(x_ptr + offsets, mask=mask) 25 | # Compute ReLU: y = max(0, x) 26 | y = tl.maximum(x, 0.0) 27 | # Store the result. 
28 | tl.store(y_ptr + offsets, y, mask=mask) 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Triton Kernel for ReLU Backward Pass 32 | # ------------------------------------------------------------------------------ 33 | 34 | @triton.jit 35 | def relu_backward_kernel( 36 | x_ptr, # Pointer to saved input tensor x (from forward pass) 37 | grad_output_ptr, # Pointer to gradient of the output 38 | grad_input_ptr, # Pointer to store computed gradient with respect to x 39 | N: tl.constexpr, # Total number of elements in x 40 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 41 | ): 42 | pid = tl.program_id(0) 43 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 44 | mask = offsets < N 45 | # Load input values and gradient of output. 46 | x = tl.load(x_ptr + offsets, mask=mask) 47 | grad_out = tl.load(grad_output_ptr + offsets, mask=mask) 48 | # Compute gradient of ReLU: 49 | # For each element, if x > 0, gradient is grad_out; otherwise, it is 0. 50 | grad_in = tl.where(x > 0, grad_out, 0.0) 51 | tl.store(grad_input_ptr + offsets, grad_in, mask=mask) 52 | 53 | # ------------------------------------------------------------------------------ 54 | # Custom Autograd Function Using Triton Kernels 55 | # ------------------------------------------------------------------------------ 56 | 57 | class TritonReLUFunction(torch.autograd.Function): 58 | @staticmethod 59 | def forward(ctx, x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 60 | """ 61 | Forward pass of the ReLU activation using the Triton kernel. 62 | Saves the input tensor for use in the backward pass. 63 | """ 64 | N = x.numel() 65 | y = torch.empty_like(x) 66 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 67 | # Launch the forward kernel. 68 | relu_forward_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 69 | # Save input tensor for the backward pass. 70 | ctx.save_for_backward(x) 71 | ctx.BLOCK_SIZE = BLOCK_SIZE 72 | return y 73 | 74 | @staticmethod 75 | def backward(ctx, grad_output: torch.Tensor) -> tuple: 76 | """ 77 | Backward pass computes the gradient of the ReLU activation. 78 | """ 79 | x, = ctx.saved_tensors 80 | N = x.numel() 81 | grad_input = torch.empty_like(x) 82 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 83 | BLOCK_SIZE = ctx.BLOCK_SIZE 84 | # Launch the backward kernel. 85 | relu_backward_kernel[grid](x, grad_output, grad_input, N, BLOCK_SIZE=BLOCK_SIZE) 86 | # Return the gradient for x and None for BLOCK_SIZE (not a tensor). 87 | return grad_input, None 88 | 89 | # Convenience function to call our custom autograd ReLU. 90 | def triton_relu(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 91 | return TritonReLUFunction.apply(x, BLOCK_SIZE) 92 | 93 | # ------------------------------------------------------------------------------ 94 | # Benchmarking Function 95 | # ------------------------------------------------------------------------------ 96 | 97 | def benchmark(func, *args, n_warmup=10, n_iters=100): 98 | """ 99 | Benchmarks a function by running warm-up iterations followed by timed iterations. 100 | 101 | Args: 102 | func (callable): The function to benchmark. 103 | *args: Arguments to pass to the function. 104 | n_warmup (int): Number of warm-up iterations. 105 | n_iters (int): Number of iterations for timing. 106 | 107 | Returns: 108 | float: Average execution time per iteration in milliseconds. 109 | """ 110 | # Warm-up iterations. 
111 | for _ in range(n_warmup): 112 | func(*args) 113 | torch.cuda.synchronize() 114 | 115 | start = time.perf_counter() 116 | for _ in range(n_iters): 117 | func(*args) 118 | torch.cuda.synchronize() 119 | end = time.perf_counter() 120 | return (end - start) / n_iters * 1000 121 | 122 | # ------------------------------------------------------------------------------ 123 | # Main: Test and Benchmark the Autograd-Compatible ReLU 124 | # ------------------------------------------------------------------------------ 125 | 126 | if __name__ == '__main__': 127 | # Create a random input tensor on the GPU with gradient tracking. 128 | N = 1024 * 1024 # 1 million elements 129 | x = torch.randn(N, device='cuda', dtype=torch.float32, requires_grad=True) 130 | BLOCK_SIZE = 1024 131 | 132 | # Forward pass using our custom Triton ReLU. 133 | y_triton = triton_relu(x, BLOCK_SIZE) 134 | # Define a dummy loss (sum of outputs) and perform backward pass. 135 | loss_triton = y_triton.sum() 136 | loss_triton.backward() 137 | 138 | # For validation, compare against PyTorch's built-in ReLU. 139 | x_torch = x.detach().clone().requires_grad_() 140 | y_torch = torch.relu(x_torch) 141 | loss_torch = y_torch.sum() 142 | loss_torch.backward() 143 | 144 | # Check if the gradients match. 145 | if torch.allclose(x.grad, x_torch.grad, atol=1e-4): 146 | print("Success: Triton autograd ReLU backward matches PyTorch!") 147 | else: 148 | print("Error: The gradients do not match.") 149 | 150 | # Benchmark the forward pass. 151 | triton_time = benchmark(lambda: triton_relu(x, BLOCK_SIZE)) 152 | torch_time = benchmark(lambda: torch.relu(x)) 153 | print(f"Average execution time (Forward Pass):") 154 | print(f" Triton ReLU = {triton_time:.3f} ms") 155 | print(f" PyTorch ReLU = {torch_time:.3f} ms") 156 | -------------------------------------------------------------------------------- /daily_challange/day1/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle2: Vector Addition with Triton and PyTorch 2 | 3 | This repository contains a simple example of how to add two vectors using a custom GPU kernel written in [Triton](https://github.com/openai/triton) and compares the result to a standard PyTorch implementation. The result of both implementations is the same. 4 | 5 | ## Overview 6 | 7 | - **Triton Kernel:** small GPU kernel that divides the input vectors into blocks. Each kernel instance computes the addition for a block of elements. 8 | - **PyTorch Implementation:** simple element‑wise addition using PyTorch’s built-in tensor operations. 9 | 10 | This example demonstrates how to write a Triton kernel, launch it from Python, and verify that the computed result is identical to that of PyTorch. 
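Concretely, the launch grid is sized with `triton.cdiv`. For the vector length used later in this example (`n = 1024 * 10`) and the default `BLOCK_SIZE = 1024`, the arithmetic works out as follows (a small standalone sketch):

```python
import triton

n, BLOCK_SIZE = 1024 * 10, 1024
num_programs = triton.cdiv(n, BLOCK_SIZE)  # ceil(10240 / 1024) = 10
grid = (num_programs,)                     # ten program instances, one block of 1024 elements each
print(grid)                                # (10,)
```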
11 | 12 | ## Requirements 13 | 14 | - Python 3.8+ 15 | - [PyTorch](https://pytorch.org/) (with CUDA support) 16 | - [Triton](https://github.com/openai/triton) 17 | Install via pip: 18 | 19 | ```bash 20 | pip install triton 21 | ``` 22 | 23 | ## Code 24 | 25 | Below is the full code example: 26 | 27 | ```python 28 | import torch 29 | import triton 30 | import triton.language as tl 31 | 32 | # ------------------------------------------------------------------------------ 33 | # Triton Kernel for Vector Addition 34 | # ------------------------------------------------------------------------------ 35 | 36 | @triton.jit 37 | def vector_add_kernel( 38 | A_ptr, # Pointer to first input vector A 39 | B_ptr, # Pointer to second input vector B 40 | C_ptr, # Pointer to output vector C 41 | n_elements: tl.constexpr, # Number of elements in the vectors 42 | BLOCK_SIZE: tl.constexpr # Block size (number of elements per program instance) 43 | ): 44 | # Each program instance (kernel instance) computes a block of elements. 45 | pid = tl.program_id(0) # 1D grid: get the program id (i.e. block index) 46 | # Compute the offsets for the current block 47 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 48 | # Create a mask to avoid out-of-bound accesses 49 | mask = offsets < n_elements 50 | 51 | # Load the corresponding elements from A and B 52 | a = tl.load(A_ptr + offsets, mask=mask) 53 | b = tl.load(B_ptr + offsets, mask=mask) 54 | 55 | # Perform element-wise addition 56 | c = a + b 57 | 58 | # Store the result into the output pointer C 59 | tl.store(C_ptr + offsets, c, mask=mask) 60 | 61 | # ------------------------------------------------------------------------------ 62 | # Python Wrapper Function for the Triton Kernel 63 | # ------------------------------------------------------------------------------ 64 | 65 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 66 | """ 67 | Adds two vectors A and B using the Triton kernel. 68 | 69 | Args: 70 | A (torch.Tensor): First input vector (on CUDA). 71 | B (torch.Tensor): Second input vector (on CUDA). 72 | BLOCK_SIZE (int): Number of elements per block for the kernel. 73 | 74 | Returns: 75 | torch.Tensor: Output vector containing the element-wise sum. 76 | """ 77 | assert A.numel() == B.numel(), "Input vectors must have the same number of elements." 
78 | n_elements = A.numel() 79 | # Create an empty tensor for the result (same size and device as A) 80 | C = torch.empty_like(A) 81 | 82 | # Define grid: number of blocks needed to cover all elements 83 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 84 | 85 | # Launch the kernel 86 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 87 | return C 88 | 89 | # ------------------------------------------------------------------------------ 90 | # Main: Compare Triton Kernel with PyTorch Implementation 91 | # ------------------------------------------------------------------------------ 92 | 93 | if __name__ == '__main__': 94 | # Create two example vectors on the GPU 95 | n = 1024 * 10 # total number of elements 96 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 97 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 98 | 99 | # Add the vectors using the Triton kernel 100 | C_triton = vector_add_triton(A, B) 101 | 102 | # Add the vectors using PyTorch (for verification) 103 | C_pytorch = A + B 104 | 105 | # Verify that the results are the same 106 | if torch.allclose(C_triton, C_pytorch): 107 | print("Success: The Triton kernel result matches the PyTorch result!") 108 | else: 109 | print("Error: The results do not match.") 110 | 111 | # Print part of the result for inspection 112 | print("Result (first 10 elements):", C_triton[:10]) 113 | ``` 114 | 115 | ## Code Explanation 116 | 117 | ### 1. The Triton kernel (`vector_add_kernel`) 118 | - **Kernel signature:** 119 | the kernel receives pointers to the input arrays (`A_ptr` and `B_ptr`), a pointer for the output array (`C_ptr`), the total number of elements (`n_elements`), and a compile-time constant `BLOCK_SIZE`. 120 | 121 | - **Kernel indexing:** 122 | `pid = tl.program_id(0)` retrieves the unique index for the current block. Using this, we compute the starting offsets for each block. 123 | 124 | - **Boundary checking:** 125 | mask (`mask = offsets < n_elements`) is used to ensure that only valid elements are loaded and stored, which is important when the total number of elements is not a multiple of `BLOCK_SIZE`. 126 | 127 | - **Memory operations:** 128 | the `tl.load` function reads elements from memory, and `tl.store` writes the computed result back. 129 | 130 | ### 2. Python wrapper function (`vector_add_triton`) 131 | - **Input Validation:** 132 | We ensure both input vectors have the same number of elements. 133 | 134 | - **Result tensor:** 135 | an output tensor `C` is allocated with the same shape and device as the input vectors. 136 | 137 | - **Kernel launch configuration:** 138 | the grid is computed using `triton.cdiv(n_elements, meta['BLOCK_SIZE'])` which determines how many blocks are needed. 139 | 140 | - **Kernel launch:** 141 | the Triton kernel is launched with the computed grid and the provided parameters. 142 | 143 | ### 3. PyTorch comparison 144 | - **PyTorch addition:** 145 | the same vector addition is performed using PyTorch's built-in operator (`A + B`). 146 | 147 | - **Verification:** 148 | `torch.allclose` checks that the results from both methods are nearly identical. 149 | 150 | ## Conclusion 151 | 152 | This example demonstrates a minimal Triton kernel for vector addition. Triton allows you to write custom GPU kernels in Python with a syntax similar to CUDA, enabling you to optimize performance-critical operations. 
The comparison with PyTorch’s built-in vector addition shows that custom kernels can be both simple to write and produce correct results. 153 | 154 | Feel free to clone this repository, experiment with different block sizes, and extend this example to more complex operations. 155 | -------------------------------------------------------------------------------- /daily_challange/day3/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 4: ReLU Activation with Triton 2 | 3 | In this challenge, you will implement the ReLU (Rectified Linear Unit) activation function using Triton. ReLU is defined as: 4 | 5 | ReLU(x)=max(0,x) 6 | 7 | For each element in the input vector, the kernel computes the maximum between the element and 0. This example compares the custom Triton implementation to PyTorch’s built-in ReLU, and it also includes a benchmarking function for performance measurement. 8 | 9 | ## Key points 10 | 11 | - **1D grid processing:** the kernel uses a one-dimensional grid of program IDs. Each kernel instance processes a block of elements. 12 | - **Block-based computation:** the vector is processed in blocks with a configurable block size. 13 | - **Element-wise operation:** for each element, the kernel computes `y = max(0, x)`. 14 | 15 | ## Full code example 16 | 17 | ```python 18 | import time 19 | import torch 20 | import triton 21 | import triton.language as tl 22 | 23 | # ------------------------------------------------------------------------------ 24 | # Triton Kernel for ReLU Activation 25 | # ------------------------------------------------------------------------------ 26 | 27 | @triton.jit 28 | def relu_kernel( 29 | x_ptr, # Pointer to the input vector x 30 | y_ptr, # Pointer to the output vector y 31 | N: tl.constexpr, # Total number of elements in the input vector 32 | BLOCK_SIZE: tl.constexpr # Block size: number of elements processed per kernel instance 33 | ): 34 | # Each kernel instance processes a block of elements. 35 | # Get the current program ID along the 1D grid. 36 | pid = tl.program_id(0) 37 | 38 | # Compute the offsets for the block of elements this kernel instance will process. 39 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 40 | 41 | # Create a mask to ensure we do not access out-of-bound memory. 42 | mask = offsets < N 43 | 44 | # Load elements from the input pointer. 45 | x = tl.load(x_ptr + offsets, mask=mask) 46 | 47 | # Compute the ReLU activation: y = max(0, x) 48 | y = tl.maximum(x, 0.0) 49 | 50 | # Store the result back to the output pointer. 51 | tl.store(y_ptr + offsets, y, mask=mask) 52 | 53 | # ------------------------------------------------------------------------------ 54 | # Python Wrapper Function for the Triton ReLU Kernel 55 | # ------------------------------------------------------------------------------ 56 | 57 | def relu_triton(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 58 | """ 59 | Applies the ReLU activation function on the input vector x using a Triton kernel. 60 | 61 | Args: 62 | x (torch.Tensor): Input tensor on CUDA. 63 | BLOCK_SIZE (int): Number of elements processed per kernel instance. 64 | 65 | Returns: 66 | torch.Tensor: Output tensor after applying ReLU activation. 67 | """ 68 | N = x.numel() 69 | # Allocate the output tensor with the same shape and device as the input. 70 | y = torch.empty_like(x) 71 | 72 | # Configure the grid: number of blocks required to cover all N elements. 
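    # Worked example: in the benchmark below, N = 1024 * 1024 and BLOCK_SIZE = 1024,
    # so triton.cdiv(N, BLOCK_SIZE) = 1024 and the launch grid is (1024,).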
73 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 74 | 75 | # Launch the Triton kernel. 76 | relu_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 77 | return y 78 | 79 | # ------------------------------------------------------------------------------ 80 | # Benchmarking Function 81 | # ------------------------------------------------------------------------------ 82 | 83 | def benchmark(func, *args, n_warmup=10, n_iters=100): 84 | """ 85 | Benchmarks a function by performing warm-up iterations followed by timed iterations. 86 | 87 | Args: 88 | func (callable): The function to benchmark. 89 | *args: Arguments to pass to the function. 90 | n_warmup (int): Number of warm-up iterations. 91 | n_iters (int): Number of iterations for timing. 92 | 93 | Returns: 94 | float: Average execution time per iteration in milliseconds. 95 | """ 96 | # Warm-up: execute the function several times to mitigate initial overhead. 97 | for _ in range(n_warmup): 98 | func(*args) 99 | torch.cuda.synchronize() # Wait for all GPU operations to finish. 100 | 101 | # Timing the execution. 102 | start = time.perf_counter() 103 | for _ in range(n_iters): 104 | func(*args) 105 | torch.cuda.synchronize() # Ensure all GPU operations are complete. 106 | end = time.perf_counter() 107 | 108 | avg_time_ms = (end - start) / n_iters * 1000 109 | return avg_time_ms 110 | 111 | # ------------------------------------------------------------------------------ 112 | # Main: Test and Benchmark the Triton ReLU Kernel 113 | # ------------------------------------------------------------------------------ 114 | 115 | if __name__ == '__main__': 116 | # Create an example input vector on the GPU. 117 | N = 1024 * 1024 # For instance, 1 million elements. 118 | x = torch.randn(N, device='cuda', dtype=torch.float32) 119 | 120 | # Apply ReLU using the Triton kernel. 121 | y_triton = relu_triton(x) 122 | 123 | # Apply ReLU using PyTorch for validation. 124 | y_torch = torch.relu(x) 125 | 126 | # Verify that both outputs are the same. 127 | if torch.allclose(y_triton, y_torch): 128 | print("Success: Triton ReLU matches PyTorch ReLU!") 129 | else: 130 | print("Error: The Triton ReLU output does not match PyTorch.") 131 | 132 | # Benchmark the Triton kernel. 133 | triton_time = benchmark(relu_triton, x) 134 | print(f"Average execution time (Triton ReLU): {triton_time:.3f} ms") 135 | 136 | # Benchmark PyTorch’s built-in ReLU. 137 | torch_time = benchmark(torch.relu, x) 138 | print(f"Average execution time (PyTorch ReLU): {torch_time:.3f} ms") 139 | ``` 140 | 141 | ## Code explanation 142 | 143 | ### 1. The Triton kernel (`relu_kernel`) 144 | - **Kernel signature:** 145 | The kernel takes pointers for the input (`x_ptr`) and output (`y_ptr`) vectors, along with the total number of elements (`N`) and a compile-time constant `BLOCK_SIZE`. 146 | 147 | - **Program ID and offsets:** 148 | The kernel retrieves its program ID using `tl.program_id(0)` and computes the element offsets within the vector for the current block. 149 | 150 | - **Masking:** 151 | A mask is created (`mask = offsets < N`) to prevent out-of-bound memory accesses when the vector size is not an exact multiple of the block size. 152 | 153 | - **ReLU computation:** 154 | The kernel loads the input elements, computes the maximum between each element and 0 using `tl.maximum(x, 0.0)`, and then stores the result. 155 | 156 | ### 2. 
Python wrapper function (`relu_triton`) 157 | - **Purpose:** 158 | This function sets up the output tensor and computes the grid configuration needed to launch the kernel. It then calls the Triton kernel with the correct arguments. 159 | 160 | - **Grid configuration:** 161 | The grid is computed with `triton.cdiv(N, meta['BLOCK_SIZE'])` ensuring all elements are processed even if the total number of elements isn’t an exact multiple of the block size. 162 | 163 | ### 3. Benchmarking function (`benchmark`) 164 | - **Warm-up iterations:** 165 | Several warm-up iterations help avoid measuring the initial overhead such as CUDA context initialization. 166 | 167 | - **Timing:** 168 | The function measures the average execution time over a set number of iterations. Synchronization (`torch.cuda.synchronize()`) is used before and after the timing loop to ensure accurate measurement. 169 | 170 | ### 4. Main routine 171 | - **Setup:** 172 | A large random input vector is generated on the GPU. 173 | 174 | - **Validation:** 175 | The output from the Triton kernel is compared with PyTorch’s `torch.relu` to ensure the correctness of the implementation. 176 | 177 | - **Benchmarking:** 178 | Both the Triton and PyTorch ReLU functions are benchmarked, and their average execution times are printed. 179 | 180 | ## Conclusion 181 | 182 | This puzzle demonstrates how to implement a ReLU activation function using Triton. By comparing it with PyTorch’s implementation and measuring performance, you gain practical insight into writing and optimizing custom GPU kernels. This is another step forward in your Daily Triton Challenge as you explore GPU programming from basic to more advanced operations. 183 | -------------------------------------------------------------------------------- /daily_challange/day2/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle3: Vector Addition with Triton and PyTorch (with Benchmarking) 2 | 3 | This repository demonstrates how to add two vectors element‑wise using a custom Triton GPU kernel and compares the performance with a PyTorch implementation. A benchmarking function is included to measure the average execution time for each method. 4 | 5 | ## Overview 6 | 7 | - **Triton kernel:** custom GPU kernel that divides the input vectors into blocks and performs element‑wise addition. 8 | - **PyTorch implementation:** simple vector addition using PyTorch’s built‑in tensor operations. 9 | - **Benchmarking function:** helper function that performs warm‑up runs and measures the average execution time over several iterations. 
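Triton also ships its own timing helper, `triton.testing.do_bench`, which handles warm-up and repetition internally (its defaults and return mode vary between Triton versions). The sketch below shows how it could be used next to the hand-rolled `benchmark` function, assuming the `vector_add_triton` wrapper from the full listing:

```python
import torch
from triton.testing import do_bench

# Assumes vector_add_triton from the listing below, with CUDA input tensors.
A = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32)
B = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32)

triton_ms = do_bench(lambda: vector_add_triton(A, B))  # time in milliseconds
torch_ms = do_bench(lambda: A + B)
print(f"Triton: {triton_ms:.3f} ms | PyTorch: {torch_ms:.3f} ms")
```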
10 | 11 | ## Requirements 12 | 13 | - Python 3.8+ 14 | - [PyTorch](https://pytorch.org/) (with CUDA support) 15 | - [Triton](https://github.com/openai/triton) 16 | Install via pip: 17 | 18 | ```bash 19 | pip install triton 20 | ``` 21 | 22 | ## Full Code Example 23 | 24 | ```python 25 | import time 26 | import torch 27 | import triton 28 | import triton.language as tl 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Triton Kernel for Vector Addition 32 | # ------------------------------------------------------------------------------ 33 | 34 | @triton.jit 35 | def vector_add_kernel( 36 | A_ptr, # Pointer to the first input vector A 37 | B_ptr, # Pointer to the second input vector B 38 | C_ptr, # Pointer to the output vector C 39 | n_elements: tl.constexpr, # Total number of elements in the vectors 40 | BLOCK_SIZE: tl.constexpr # Block size (number of elements processed per kernel instance) 41 | ): 42 | # Get the current program (block) ID 43 | pid = tl.program_id(0) 44 | # Compute the offsets for the current block 45 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 46 | # Create a mask to avoid accessing out-of-bound indices 47 | mask = offsets < n_elements 48 | 49 | # Load the elements from A and B with the computed offsets 50 | a = tl.load(A_ptr + offsets, mask=mask) 51 | b = tl.load(B_ptr + offsets, mask=mask) 52 | 53 | # Perform element-wise addition 54 | c = a + b 55 | 56 | # Store the result in C using the mask to ensure only valid writes 57 | tl.store(C_ptr + offsets, c, mask=mask) 58 | 59 | # ------------------------------------------------------------------------------ 60 | # Python Wrapper for the Triton Kernel 61 | # ------------------------------------------------------------------------------ 62 | 63 | def vector_add_triton(A: torch.Tensor, B: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 64 | """ 65 | Adds two vectors using the Triton kernel. 66 | 67 | Args: 68 | A (torch.Tensor): First input vector (on CUDA). 69 | B (torch.Tensor): Second input vector (on CUDA). 70 | BLOCK_SIZE (int): Number of elements per block for the kernel. 71 | 72 | Returns: 73 | torch.Tensor: Output vector with the element-wise sum. 74 | """ 75 | n_elements = A.numel() 76 | # Allocate the output tensor (same shape and device as A) 77 | C = torch.empty_like(A) 78 | 79 | # Define the grid (number of blocks) required to cover all elements 80 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 81 | 82 | # Launch the Triton kernel 83 | vector_add_kernel[grid](A, B, C, n_elements, BLOCK_SIZE=BLOCK_SIZE) 84 | return C 85 | 86 | # ------------------------------------------------------------------------------ 87 | # Benchmarking Function 88 | # ------------------------------------------------------------------------------ 89 | 90 | def benchmark(func, *args, n_warmup=10, n_iters=100): 91 | """ 92 | Benchmarks a function by running warm-up iterations followed by timed iterations. 93 | 94 | Args: 95 | func (callable): The function to benchmark. 96 | *args: Arguments to pass to the function. 97 | n_warmup (int): Number of warm-up iterations (to exclude startup overhead). 98 | n_iters (int): Number of iterations for timing. 99 | 100 | Returns: 101 | float: Average execution time per iteration in milliseconds. 102 | """ 103 | # Warm-up runs to ensure any one-time setup is complete (e.g. 
CUDA context) 104 | for _ in range(n_warmup): 105 | func(*args) 106 | torch.cuda.synchronize() # Ensure warm-up kernels have finished 107 | 108 | # Start timing 109 | start = time.perf_counter() 110 | for _ in range(n_iters): 111 | func(*args) 112 | torch.cuda.synchronize() # Wait for all GPU operations to finish 113 | end = time.perf_counter() 114 | 115 | # Calculate the average execution time (in milliseconds) 116 | avg_time_ms = (end - start) / n_iters * 1000 117 | return avg_time_ms 118 | 119 | # ------------------------------------------------------------------------------ 120 | # Main: Compare and Benchmark Triton Kernel vs. PyTorch Implementation 121 | # ------------------------------------------------------------------------------ 122 | 123 | if __name__ == '__main__': 124 | # Create two example vectors on the GPU (stress test with a large number of elements) 125 | n = 1024 * 1024 * 10 # e.g., 10 million elements 126 | A = torch.arange(0, n, device='cuda', dtype=torch.float32) 127 | B = torch.arange(n, 2 * n, device='cuda', dtype=torch.float32) 128 | 129 | # Validate correctness by comparing results from Triton and PyTorch 130 | C_triton = vector_add_triton(A, B) 131 | C_pytorch = A + B 132 | 133 | if torch.allclose(C_triton, C_pytorch): 134 | print("Success: The Triton result matches the PyTorch result!") 135 | else: 136 | print("Error: The results do not match.") 137 | 138 | # Benchmark the Triton kernel 139 | triton_time = benchmark(vector_add_triton, A, B, n_warmup=10, n_iters=100) 140 | print(f"Average execution time (Triton): {triton_time:.3f} ms") 141 | 142 | # Benchmark the PyTorch implementation 143 | def pytorch_add(A, B): 144 | return A + B 145 | 146 | pytorch_time = benchmark(pytorch_add, A, B, n_warmup=10, n_iters=100) 147 | print(f"Average execution time (PyTorch): {pytorch_time:.3f} ms") 148 | ``` 149 | 150 | ## Code explanation 151 | 152 | ### 1. Triton kernel (`vector_add_kernel`) 153 | - **Kernel signature:** 154 | the kernel receives pointers to vectors A, B, and C, along with the total number of elements and the block size (a compile‑time constant). 155 | - **Indexing and masking:** 156 | each kernel instance computes a block of element offsets and uses a mask to prevent out‑of-bound memory accesses. 157 | - **Memory operations:** 158 | the kernel loads values from A and B, computes their sum, and writes the result to C. 159 | 160 | ### 2. Python wrapper (`vector_add_triton`) 161 | - **Functionality:** 162 | this function prepares the input data, allocates the output tensor, and configures the grid for the Triton kernel launch. 163 | - **Kernel launch:** 164 | the kernel is launched using the computed grid configuration. 165 | 166 | ### 3. Benchmarking function (`benchmark`) 167 | - **Warm-up iterations:** number of warm-up iterations are executed to overcome any one-time overhead (such as CUDA context initialization). 168 | - **Timing:** 169 | The function uses Python’s `time.perf_counter()` to measure elapsed time over multiple iterations. 170 | - **Synchronization:** 171 | `torch.cuda.synchronize()` is called before starting and after completing the timed iterations to ensure that all GPU operations have finished. 172 | 173 | ### 4. Main routine 174 | - **Data Preparation:** 175 | two large vectors (10 million elements each) are created on the GPU. 176 | - **Validation:** 177 | the Triton and PyTorch implementations are compared using `torch.allclose()` to ensure correctness. 
178 | - **Benchmarking:** 179 | both implementations are benchmarked by measuring the average execution time over 100 iterations (after 10 warm-up iterations). The results are printed to the console. 180 | 181 | ## Conclusion 182 | 183 | This example shows how to implement and benchmark a custom Triton GPU kernel for vector addition alongside a standard PyTorch operation. With the included benchmarking function, you can stress test both implementations and compare their performance under various conditions. Feel free to modify the number of elements, block sizes, and iterations to explore performance characteristics further. 184 | -------------------------------------------------------------------------------- /daily_challange/day4/readme.md: -------------------------------------------------------------------------------- 1 | # Puzzle 5: Autograd-Compatible ReLU with Triton 2 | 3 | In this challenge, you will implement the ReLU activation function in a way that is fully compatible with PyTorch’s autograd. That means you’ll write a custom autograd function that uses a Triton kernel for the forward pass (computing `y = max(0, x)`) and a second Triton kernel for the backward pass (computing the gradient of ReLU, where `grad_input = grad_output` if `x > 0` and `0` otherwise). 4 | 5 | ## Overview 6 | 7 | - **Forward kernel:** computes the ReLU activation on the input tensor. 8 | - **Backward kernel:** computes the gradient with respect to the input. 9 | - **Custom autograd function:** wraps the Triton kernels so that they can be used in PyTorch’s computational graph. 10 | - **Benchmarking and validation:** compare the custom function against PyTorch’s built‑in ReLU to ensure correctness and measure performance. 11 | 12 | ## Full code example 13 | 14 | ```python 15 | import time 16 | import torch 17 | import triton 18 | import triton.language as tl 19 | 20 | # ------------------------------------------------------------------------------ 21 | # Triton Kernel for ReLU Forward Pass 22 | # ------------------------------------------------------------------------------ 23 | 24 | @triton.jit 25 | def relu_forward_kernel( 26 | x_ptr, # Pointer to input tensor x 27 | y_ptr, # Pointer to output tensor y 28 | N: tl.constexpr, # Total number of elements in x 29 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 30 | ): 31 | # Get the current program (block) ID. 32 | pid = tl.program_id(0) 33 | # Compute offsets for this block. 34 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 35 | # Create a mask for out-of-bound indices. 36 | mask = offsets < N 37 | # Load input values. 38 | x = tl.load(x_ptr + offsets, mask=mask) 39 | # Compute ReLU: y = max(0, x) 40 | y = tl.maximum(x, 0.0) 41 | # Store the result. 
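    # (The store is masked as well: lanes with offsets >= N are skipped, so the kernel never writes out of bounds when N is not a multiple of BLOCK_SIZE.)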
42 | tl.store(y_ptr + offsets, y, mask=mask) 43 | 44 | # ------------------------------------------------------------------------------ 45 | # Triton Kernel for ReLU Backward Pass 46 | # ------------------------------------------------------------------------------ 47 | 48 | @triton.jit 49 | def relu_backward_kernel( 50 | x_ptr, # Pointer to saved input tensor x (from forward pass) 51 | grad_output_ptr, # Pointer to gradient of the output 52 | grad_input_ptr, # Pointer to store computed gradient with respect to x 53 | N: tl.constexpr, # Total number of elements in x 54 | BLOCK_SIZE: tl.constexpr # Number of elements processed per kernel instance 55 | ): 56 | pid = tl.program_id(0) 57 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 58 | mask = offsets < N 59 | # Load input values and gradient of output. 60 | x = tl.load(x_ptr + offsets, mask=mask) 61 | grad_out = tl.load(grad_output_ptr + offsets, mask=mask) 62 | # Compute gradient of ReLU: 63 | # For each element, if x > 0, gradient is grad_out; otherwise, it is 0. 64 | grad_in = tl.where(x > 0, grad_out, 0.0) 65 | tl.store(grad_input_ptr + offsets, grad_in, mask=mask) 66 | 67 | # ------------------------------------------------------------------------------ 68 | # Custom Autograd Function Using Triton Kernels 69 | # ------------------------------------------------------------------------------ 70 | 71 | class TritonReLUFunction(torch.autograd.Function): 72 | @staticmethod 73 | def forward(ctx, x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 74 | """ 75 | Forward pass of the ReLU activation using the Triton kernel. 76 | Saves the input tensor for use in the backward pass. 77 | """ 78 | N = x.numel() 79 | y = torch.empty_like(x) 80 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 81 | # Launch the forward kernel. 82 | relu_forward_kernel[grid](x, y, N, BLOCK_SIZE=BLOCK_SIZE) 83 | # Save input tensor for the backward pass. 84 | ctx.save_for_backward(x) 85 | ctx.BLOCK_SIZE = BLOCK_SIZE 86 | return y 87 | 88 | @staticmethod 89 | def backward(ctx, grad_output: torch.Tensor) -> tuple: 90 | """ 91 | Backward pass computes the gradient of the ReLU activation. 92 | """ 93 | x, = ctx.saved_tensors 94 | N = x.numel() 95 | grad_input = torch.empty_like(x) 96 | grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),) 97 | BLOCK_SIZE = ctx.BLOCK_SIZE 98 | # Launch the backward kernel. 99 | relu_backward_kernel[grid](x, grad_output, grad_input, N, BLOCK_SIZE=BLOCK_SIZE) 100 | # Return the gradient for x and None for BLOCK_SIZE (not a tensor). 101 | return grad_input, None 102 | 103 | # Convenience function to call our custom autograd ReLU. 104 | def triton_relu(x: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor: 105 | return TritonReLUFunction.apply(x, BLOCK_SIZE) 106 | 107 | # ------------------------------------------------------------------------------ 108 | # Benchmarking Function 109 | # ------------------------------------------------------------------------------ 110 | 111 | def benchmark(func, *args, n_warmup=10, n_iters=100): 112 | """ 113 | Benchmarks a function by running warm-up iterations followed by timed iterations. 114 | 115 | Args: 116 | func (callable): The function to benchmark. 117 | *args: Arguments to pass to the function. 118 | n_warmup (int): Number of warm-up iterations. 119 | n_iters (int): Number of iterations for timing. 120 | 121 | Returns: 122 | float: Average execution time per iteration in milliseconds. 123 | """ 124 | # Warm-up iterations. 
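    # The first call to a @triton.jit kernel also triggers its JIT compilation (and, on a cold start, CUDA context creation), so skipping warm-up would mostly measure that one-time cost instead of steady-state kernel time.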
125 | for _ in range(n_warmup): 126 | func(*args) 127 | torch.cuda.synchronize() 128 | 129 | start = time.perf_counter() 130 | for _ in range(n_iters): 131 | func(*args) 132 | torch.cuda.synchronize() 133 | end = time.perf_counter() 134 | return (end - start) / n_iters * 1000 135 | 136 | # ------------------------------------------------------------------------------ 137 | # Main: Test and Benchmark the Autograd-Compatible ReLU 138 | # ------------------------------------------------------------------------------ 139 | 140 | if __name__ == '__main__': 141 | # Create a random input tensor on the GPU with gradient tracking. 142 | N = 1024 * 1024 # 1 million elements 143 | x = torch.randn(N, device='cuda', dtype=torch.float32, requires_grad=True) 144 | BLOCK_SIZE = 1024 145 | 146 | # Forward pass using our custom Triton ReLU. 147 | y_triton = triton_relu(x, BLOCK_SIZE) 148 | # Define a dummy loss (sum of outputs) and perform backward pass. 149 | loss_triton = y_triton.sum() 150 | loss_triton.backward() 151 | 152 | # For validation, compare against PyTorch's built-in ReLU. 153 | x_torch = x.detach().clone().requires_grad_() 154 | y_torch = torch.relu(x_torch) 155 | loss_torch = y_torch.sum() 156 | loss_torch.backward() 157 | 158 | # Check if the gradients match. 159 | if torch.allclose(x.grad, x_torch.grad, atol=1e-4): 160 | print("Success: Triton autograd ReLU backward matches PyTorch!") 161 | else: 162 | print("Error: The gradients do not match.") 163 | 164 | # Benchmark the forward pass. 165 | triton_time = benchmark(lambda: triton_relu(x, BLOCK_SIZE)) 166 | torch_time = benchmark(lambda: torch.relu(x)) 167 | print(f"Average execution time (Forward Pass):") 168 | print(f" Triton ReLU = {triton_time:.3f} ms") 169 | print(f" PyTorch ReLU = {torch_time:.3f} ms") 170 | ``` 171 | 172 | ## Code explanation 173 | 174 | ### 1. Forward and backward triton kernels 175 | - **Forward kernel (`relu_forward_kernel`):** 176 | - Each kernel instance processes a block of elements. 177 | - For each element, it computes the ReLU activation: \( y = \max(0, x) \). 178 | - **Backward kernel (`relu_backward_kernel`):** 179 | - Loads the saved input and the gradient of the output. 180 | - Computes the gradient with respect to \( x \): if \( x > 0 \), the gradient remains the same as `grad_output`; otherwise, it is set to 0. 181 | 182 | ### 2. Custom autograd function (`TritonReLUFunction`) 183 | - **Forward method:** 184 | - Calls the Triton forward kernel. 185 | - Saves the input tensor for use in the backward pass. 186 | - **Backward method:** 187 | - Retrieves the saved input. 188 | - Calls the Triton backward kernel to compute the gradient. 189 | - Returns the computed gradient for \( x \). 190 | 191 | ### 3. Benchmarking 192 | - A helper function `benchmark` is provided to measure the average execution time of a function over multiple iterations. 193 | - The forward pass of both the custom Triton ReLU and PyTorch’s built‑in ReLU is benchmarked. 194 | 195 | ### 4. Main routine 196 | - A large random tensor is created with gradient tracking. 197 | - Both forward and backward passes are executed, and the gradients are compared for correctness. 198 | - Performance is measured and printed for comparison. 199 | 200 | ## Conclusion 201 | 202 | This puzzle demonstrates how to integrate Triton kernels with PyTorch’s autograd by implementing both forward and backward methods. 
By comparing the custom autograd function with PyTorch’s built‑in ReLU, you gain insight into the mechanics of GPU kernel programming and automatic differentiation. This is an essential step toward building more complex, high‑performance GPU operations with Triton. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
![Triton](assets/triton.png)
2 | 3 | # Triton OpenAI 4 | A curated list of resources for learning and exploring Triton, OpenAI's programming language for writing efficient GPU code. 5 | 6 | ## Official Documentation 7 | - [Official Triton Documentation](https://triton-lang.org/main/index.html) 8 | 9 | ## My daily challenge (Triton day by day) 10 | This project is a step-by-step learning journey where we implement various types of Triton kernels—from the simplest examples to more advanced applications—while exploring GPU programming with Triton. 11 | The goal of this repository is to help you (and others) get comfortable with Triton by: 12 | - **Starting simple:** begin with basic kernels such as vector addition, and understand the building blocks of writing GPU code with Triton. 13 | - **Incremental learning:** each day introduces a new challenge, progressively covering more complex topics, techniques, and optimizations. 14 | - **Hands-on experience:** code, test, and benchmark your kernels against standard implementations (e.g., PyTorch) to see performance improvements and better understand GPU behavior. 15 | 16 | **Daily challenges:** every day, a new challenge is posted in this repository. Each challenge focuses on a specific aspect of Triton, such as: 17 | - Basic operations (e.g., vector addition) 18 | - Memory management and optimizations 19 | - Advanced indexing and dynamic shapes 20 | - Multi-dimensional kernels 21 | - Reduction operations and more 22 | - **Detailed explanations:** each kernel comes with an in-depth explanation of the code, helping you understand the concepts behind the implementation. 23 | - **Benchmarking and stress tests:** learn how to measure performance by comparing custom Triton kernels with standard PyTorch implementations. Get hands-on experience with benchmarking on real-world GPU workloads. 24 | 25 | | Day | Kernel | Description | 26 | |---------------------|----------------------|----------------------------| 27 | | #1 | [Constant add](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day0) | This challenge is the first puzzle in our Daily Triton Challenge series. The goal is to write a Triton kernel that adds a constant value to each element of a vector. | 28 | | #2 | [Add two vectors](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day1) | A simple example that adds two vectors using a custom GPU kernel written in Triton and compares the result to a standard PyTorch implementation. | 29 | | #3 | [Add two vectors with speed benchmarking](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day2) | This is almost the same as #2, but we measure kernel execution speed and compare it to the PyTorch implementation.| 30 | | #4 | [ReLU Activation with Triton](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day3) | In this challenge, you will implement the ReLU (Rectified Linear Unit) activation function using Triton. ReLU is defined as: ReLU(x)=max(0,x)| 31 | | #5 | [ReLU Activation forward and backward](https://github.com/rkinas/triton-resources/tree/main/daily_challange/day4) | In this challenge, you will implement the ReLU activation function in a way that is fully compatible with PyTorch’s autograd. That means you’ll write a custom autograd function that uses a Triton kernel for the forward pass (computing y = max(0, x)) and a second Triton kernel for the backward pass (computing the gradient of ReLU, where grad_input = grad_output if x > 0 and 0 otherwise). 
| 32 | 33 | ## Articles 34 | Gain deeper insights into Triton through these detailed articles: 35 | - Understanding the Triton Tutorials [Part 1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) and [Part 2](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) 36 | - [Softmax in OpenAI Triton](http://blog.nagi.fun/triton-intro-softmax) -> more detailed Fused Softmax Triton example explanation (step-by-step) 37 | - [Accelerating AI with Triton: A Deep Dive into Writing High-Performance GPU Code](https://medium.com/@nijesh-kanjinghat/accelerating-ai-with-triton-a-deep-dive-into-writing-high-performance-gpu-code-a1e4d66556cc) 38 | - [Accelerating Triton Dequantization Kernels for GPTQ](https://pytorch.org/blog/accelerating-triton/) 39 | - [Triton Tutorial #2](https://medium.com/@sherlockliao01/triton-tutorial-2-5de66cd2170d) 40 | - [Triton: OpenAI’s Innovative Programming Language for Custom Deep-Learning Primitives](https://blog.devgenius.io/triton-openais-innovative-programming-language-for-custom-deep-learning-primitives-485723b0b49) 41 | - [Triton Kernel Compilation Stages](https://pytorch.org/blog/triton-kernel-compilation-stages/) 42 | - Deep Dive into Triton Internals [Part 1](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals/), [Part 2](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals-2/) and [Part 3](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals-3/) 43 | - [Exploring Triton GPU programming for neural networks in Java](https://openjdk.org/projects/babylon/articles/triton) 44 | - [Using User-Defined Triton Kernels with torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html) 45 | - [Mamba: The Hard Way](https://srush.github.io/annotated-mamba/hard.html) 46 | - FP8: [Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton](https://pytorch.org/blog/accelerating-gemms-triton/) 47 | - FP8: [Deep Dive on CUTLASS Ping-Pong GEMM Kernel](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/) 48 | - FP8: [Deep Dive on the Hopper TMA Unit for FP8 GEMMs](https://pytorch.org/blog/hopper-tma-unit/) 49 | - [Technical Review on PyTorch2.0 and Triton](https://www.jokeren.tech/slides/Triton_bsc.pdf) 50 | - [Towards Agile Development of Efficient Deep Learning Operators](https://www.jokeren.tech/slides/triton_intel.pdf) 51 | - [Developing Triton Kernels on AMD GPUs](https://rocm.blogs.amd.com/artificial-intelligence/triton/README.html) 52 | - [CUDA-Free Inference for LLMs](https://pytorch.org/blog/cuda-free-inference-for-llms/) 53 | - [Enabling advanced GPU features in PyTorch - Warp Specialization](https://pytorch.org/blog/warp-specialization/) - Fully automated Triton warp specialization in Triton. 54 | - [Teaching AI to Write GPU Code: A Deep Dive into Reinforcement Fine-Tuning](https://predibase.com/blog/teaching-ai-to-write-gpu-code-a-deep-dive-into-reinforcement-fine-tuning) 55 | 56 | ## Blackwell and Triton 57 | - [Accelerating the Future: Triton on Blackwell Architecture](https://www.youtube.com/watch?v=RW2-HtWaOS0) 58 | - [OpenAI Triton on NVIDIA Blackwell Boosts AI Performance and Programmability](https://developer.nvidia.com/blog/openai-triton-on-nvidia-blackwell-boosts-ai-performance-and-programmability/) - Triton compiler now supports the NVIDIA Blackwell architecture. 
59 | - [Running PyTorch and Triton on the RTX 5080](https://webstorms.github.io/2025/02/06/5080-install.html) 60 | 61 | ## Research Papers 62 | Explore the academic foundation of Triton: 63 | - [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](https://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf) 64 | 65 | ## Videos 66 | Learn by watching these informative videos: 67 | - [Lecture 14: Practitioners Guide to Triton](https://www.youtube.com/watch?v=DdTsX6DQk24) and [notebook](https://github.com/gpu-mode/lectures/blob/main/lecture_014/A_Practitioners_Guide_to_Triton.ipynb) 68 | - [Lecture 29: Triton Internals](https://www.youtube.com/watch?v=njgow_zaJMw) 69 | - [Intro to Triton: Coding Softmax in PyTorch](https://www.youtube.com/watch?v=gyKBN1rnefI) 70 | - [Triton Vector Addition Kernel, part 1: Making the Shift to Parallel Programming](https://www.youtube.com/watch?v=MEZ7XhzTLEg&t) 71 | - [Tiled Matrix Multiplication in Triton - part 1](https://www.youtube.com/watch?v=OnZEBBJvWLU) 72 | - [Flash Attention derived and coded from first principles with Triton (Python)](https://www.youtube.com/watch?v=zy8ChVd_oTM) 73 | - [Triton GPU Kernels 101](https://www.youtube.com/playlist?list=PLPefVKO3tDxOJLAmCA75uShbe1z_RNqkQ) 74 | 75 | ## Triton community meetup 76 | Watch Triton community meetups to stay up to date with recent Triton topics. 77 | - [2024-11-09](https://youtu.be/N0eiYLWyNpc?si=n9T-X-0UaK3j1fXQ) 78 | 79 | ## Triton-Puzzles 80 | Challenge yourself with these engaging puzzles: 81 | - [To Solve](https://github.com/srush/Triton-Puzzles) 82 | - [Solved](https://github.com/alexzhang13/Triton-Puzzles-Solutions/blob/main/Triton_Puzzles_Solutions_alexzhang13.ipynb) 83 | 84 | ## Tools 85 | Enhance your Triton development workflow with these tools: 86 | - [Triton Deja-vu](https://github.com/IBM/triton-dejavu) A framework that reduces the autotune overhead of triton-lang to zero for well-known deployments. This small framework is based on the Triton autotuner and contributes two features to the Triton community: 1. storing and safely restoring autotuner states using JSON files, 2. ConfigSpaces to explore a defined space exhaustively. Additionally, it allows heuristics to be used in combination with the autotuner. 87 | - [Triton Profiler](https://github.com/triton-lang/triton/tree/c5a14cc00598014b303eebac831f19e8a66e9e1d/third_party/proton) and a video explaining how to use it: [Dev Tools: Proton/Interpreter](https://www.youtube.com/watch?v=Av1za_0o2Qs) 88 | - [Triton-Viz: A Visualization Toolkit for Programming with Triton](https://github.com/Deep-Learning-Profiling-Tools/triton-viz) 89 | - [Make Triton easier - Triton-util provides simple higher-level abstractions for frequent but repetitive tasks. 
This allows you to write code that is closer to how you actually think.](https://github.com/UmerHA/triton_util/tree/main) 90 | - [TritonBench is a collection of PyTorch operators used to evaluate the performance of Triton, and its integration with PyTorch.](https://github.com/pytorch-labs/tritonbench) 91 | - [TritonBench features two distinct channels: TritonBench-G and TritonBench-T, each with its own evaluation framework.](https://github.com/thunlp/TritonBench) 92 | 93 | ## Conferences 94 | Catch up on the latest advancements from Triton Conferences: 95 | - [2024 Conference Playlist](https://www.youtube.com/watch?v=nglpa_6cYYI&list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz) 96 | - [2023 Conference Playlist](https://www.youtube.com/watch?v=ZGU0Yw7mORE&list=PLc_vA1r0qoiRZfUC3o4_yjj0FtWvodKAz) 97 | 98 | ## Sample Kernels 99 | Explore practical implementations with these sample kernels: 100 | - [attorch is a subset of PyTorch's nn module, written purely in Python using OpenAI's Triton](https://github.com/BobMcDear/attorch) 101 | - [FlagGems is a high-performance general operator library implemented in OpenAI Triton. It aims to provide a suite of kernel functions to accelerate LLM training and inference.](https://github.com/FlagOpen/FlagGems) 102 | - [Kernl lets you run PyTorch transformer models several times faster on GPU with a single line of code, and is designed to be easily hackable.](https://github.com/ELS-RD/kernl) 103 | - [Liger-Kernel](https://github.com/linkedin/Liger-Kernel) 104 | - [Triton Kernels for Efficient Low-Bit Matrix Multiplication](https://github.com/mobiusml/gemlite) 105 | - [Unsloth Kernels](https://github.com/unslothai/unsloth/tree/main/unsloth/kernels) 106 | - [This is an attempt at implementing a Triton kernel for GPTQ inference. This code is based on the GPTQ-for-LLaMa codebase, which is itself based on the GPTQ codebase.](https://github.com/fpgaminer/GPTQ-triton) 107 | - [triton-index - Catalog of openly available Triton kernels](https://github.com/gpu-mode/triton-index) 108 | - [Triton-based implementation of Sparse Mixture-of-Experts (SMoE) on GPUs](https://github.com/shawntan/scattermoe) 109 | - [Variety of Triton and CUDA kernels for training and inference](https://github.com/pytorch-labs/applied-ai) 110 | - [EquiTriton is a project that seeks to implement high-performance kernels for commonly used building blocks in equivariant neural networks, enabling compute-efficient training and inference](https://github.com/IntelLabs/EquiTriton) 111 | - [Expanded collection of Neural Network activation functions and other function kernels in Triton by OpenAI.](https://github.com/dtunai/triton-activations) 112 | - [Fused kernels](https://github.com/kapilsh/cuda-mode-lecture) 113 | - [Triton activations](https://github.com/dtunai/triton-activations/tree/main) (feed-forward activations only) 114 | - [LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance](https://github.com/ModelTC/lightllm/tree/main/lightllm/common/basemodel/triton_kernel) 115 | - [Bitsandbytes - a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 & 4-bit quantization functions](https://github.com/bitsandbytes-foundation/bitsandbytes/tree/main/bitsandbytes/triton) 116 | - [MInference Triton Kernels - FlashAttention](https://github.com/microsoft/MInference) 117 | - [GridQuant](https://github.com/niconunezz/GridQuant) - This 
repository tries to implement the ideas presented in the blog post "Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton". Designed specifically for NVIDIA H100 GPUs, it leverages advanced features like float8 computation, Triton's high-performance GPU programming capabilities, and the Tensor Memory Accelerator (TMA). 118 | - [Efficient Triton implementations for Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention.](https://github.com/fla-org/native-sparse-attention) 119 | 120 | 121 | ## Triton integrations 122 | - [jax-triton](https://github.com/jax-ml/jax-triton) 123 | 124 | ## Triton backends 125 | - [Intel® XPU Backend for Triton](https://github.com/intel/intel-xpu-backend-for-triton) 126 | 127 | ## Triton communities 128 | - [CUDA-MODE](https://discord.gg/gpumode) 129 | --- 130 | 131 | ## Triton Kernel Index 132 | 133 | | Kernel | Description | Resource | 134 | |----------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 135 | | **VectorAdd** | A simple kernel that performs element-wise addition of two vectors. Useful for understanding the basics of GPU programming in Triton. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py) | 136 | | **Matmul** | An optimized kernel for matrix multiplication, achieving high performance by leveraging memory hierarchy and parallelism. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py) [Grouped GEMM](https://triton-lang.org/main/getting-started/tutorials/08-grouped-gemm.html#sphx-glr-getting-started-tutorials-08-grouped-gemm-py) | 137 | | **Softmax** | A kernel for efficient computation of the softmax function, commonly used in machine learning models like transformers. | [1](https://isamu-website.medium.com/understanding-the-triton-tutorials-part-1-6191b59ba4c) [2](http://blog.nagi.fun/triton-intro-softmax) [3](https://triton-lang.org/main/getting-started/tutorials/02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py) | 138 | | **Dropout** | A kernel for implementing low-memory dropout, a regularization technique to prevent overfitting in neural networks. | [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py) | 139 | | **Layer Normalization** | A kernel for layer normalization, which normalizes activations within a layer to improve training stability in deep learning models. 
| [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py) [3](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/layer_norm.py) | 140 | | **Fused Attention** | A kernel that efficiently implements attention mechanisms by combining multiple operations, key to transformers and similar architectures. | [1](https://isamu-website.medium.com/understanding-triton-tutorials-part-2-f6839ce50ae7) [2](https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html#sphx-glr-getting-started-tutorials-06-fused-attention-py) | 141 | | **Conv1d** | A kernel for 1D convolution, often used in processing sequential data like time series or audio signals. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/conv_kernels.py) | 142 | | **Conv2d** | A kernel for 2D convolution, a fundamental operation in computer vision tasks such as image classification or object detection. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/conv_kernels.py) | 143 | | **MultiheadAttention** | A kernel for multi-head attention, a crucial component in transformer-based models for capturing complex relationships in data. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/multi_head_attention_kernels.py) | 144 | | **Hardsigmoid** | A kernel for the Hardsigmoid activation function, an efficient approximation of the sigmoid function used in certain neural network layers. | [1](https://github.com/BobMcDear/attorch/blob/main/attorch/act_kernels.py) | 145 | | **GeLU** | GeLU | [1](https://rocm.blogs.amd.com/artificial-intelligence/triton/README.html) | 146 | | **GeGLU** | GeGLU | [1](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/geglu.py) | 147 | | **RMSNorm** | RMSNorm | [1](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/rms_norm.py) | 148 | 149 | ## Triton updates, news, new features 150 | - [Automatic Warp Specialization Optimization](https://github.com/triton-lang/triton/pull/5622) 151 | 152 | 153 | 154 | 155 | ### Contribution 156 | Feel free to contribute more resources or suggest updates by opening a pull request or issue in this repository. 157 | 158 | --- 159 | ### License 160 | This resource list is open-sourced under the MIT license. 161 | --------------------------------------------------------------------------------