├── .gitignore ├── Makefile ├── ReadMe.md ├── day01 ├── addition.cu └── printAdd.cu ├── day02 ├── function.cu └── function.py ├── day03 ├── addMatrix.cu ├── addMatrix.py └── anotherMatrix.cu ├── day04 └── layerNorm.cu ├── day05 └── vectorSumTricks.cu ├── day06 ├── AdditionKernel │ ├── additionKernel.cpython-312-x86_64-linux-gnu.so │ ├── additionKernel.cu │ ├── additionKernel.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── additionKernelBinding.cpp │ ├── additionkernel.cpython-312-x86_64-linux-gnu.so │ ├── additionkernel.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── build │ │ ├── lib.linux-x86_64-cpython-312 │ │ │ ├── additionKernel.cpython-312-x86_64-linux-gnu.so │ │ │ └── additionkernel.cpython-312-x86_64-linux-gnu.so │ │ └── temp.linux-x86_64-cpython-312 │ │ │ ├── additionKernel.o │ │ │ └── additionKernelBinding.o │ ├── pythontest.py │ └── setup.py ├── ImportingToPython │ ├── build │ │ ├── lib.linux-x86_64-cpython-312 │ │ │ └── example_kernels.cpython-312-x86_64-linux-gnu.so │ │ └── temp.linux-x86_64-cpython-312 │ │ │ ├── rollcall.o │ │ │ └── rollcallbinding.o │ ├── example_kernels.cpython-312-x86_64-linux-gnu.so │ ├── example_kernels.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── pythontest.py │ ├── rollcall.cu │ ├── rollcallbinding.cpp │ └── setup.py ├── SMBlocks.cu ├── SoftMax.cu ├── TransposeMatrix.cu └── note ├── day07 ├── conv1d.cu ├── globalMemoryCoalescing.cu ├── matmul.cu ├── naive.cu └── pythontest.py ├── day08 ├── idk.cu ├── pmpbook │ ├── chapter3ex.cu │ ├── chapter3matvecmul.cu │ ├── color2gray.cu │ ├── deviceinfo.cu │ ├── imageblur.cu │ └── vecaddition.cu └── selfAttention │ └── selfAttention.cu ├── day09 ├── bind.cpp ├── flashAttention.cu ├── flashAttentionFromTut.cu └── test.py ├── day10 ├── FlashAttention.cpp ├── FlashAttention.cu ├── linking │ ├── simpleKernel.cpp │ ├── simpleKernel.cu │ └── test.py ├── ppmbook │ └── matrixmul.cu ├── setup.py └── test.py ├── day100 └── delta.cu ├── day11 ├── FlashTestPytorch │ ├── FlashAttention.cu │ ├── binding.cpp │ └── test.py ├── LeakyReLU.cu ├── ReLU.cu ├── SoftMax.cu ├── TanH.cu ├── binding.cpp ├── test.py └── testbackward.py ├── day12 ├── NN │ └── kernels.cu ├── softMax.cu └── tileMatrix.cu ├── day13 ├── RMS.cu ├── RMSBetter.cu ├── binding.cpp └── test.py ├── day14 ├── FA2 │ ├── flash.cu │ ├── helper.cu │ ├── helper.cuh │ ├── kernels.cu │ └── kernels.cuh ├── FlashAttention2 │ └── kernel.cu ├── cat.jpg └── conv.cu ├── day15 ├── Attention.cu ├── SMM.cu └── dotproduct.cu ├── day16 ├── attentionbwkd.cu └── test.py ├── day17 ├── cublas1.cu ├── cublas2.cu └── cublas3.cu ├── day18 ├── atomic1.cu ├── atomic2.cu └── wrap.cu ├── day19 └── cublasMM.cu ├── day20 ├── rope.cu └── test_rope.py ├── day21 └── conv.cu ├── day22 ├── persistent2.cu └── persistentKernel.cu ├── day23 ├── kernel.ptx └── main.cu ├── day24 └── GeGLU.cu ├── day25 └── nbody.cu ├── day26 ├── gradientdescent.cu └── gradientdescent.out ├── day27 ├── kmeans.cu └── kmeans.out ├── day28 ├── sample.cu └── test_sample.py ├── day29 └── pi.cu ├── day30 └── kernelHisto.cu ├── day31 └── kernel.cu ├── day32 ├── Makefile └── matmul_kernels │ ├── kernel_1 │ └── kernel_1.cpp │ ├── kernel_2 │ └── kernel_2.cpp │ ├── kernel_3 │ └── kernel_3.cpp │ └── kernel_rocblas │ └── kernel_rocblas.cpp ├── day33 └── load_in_pytorch │ ├── kernel.cpp │ ├── kernel.so │ └── test.py ├── day34 └── tensor_lib │ ├── test1.cpp │ └── test1.out ├── 
day35 └── layernorm.cpp ├── day36 └── random.cpp ├── day37 └── MultiStreams │ ├── MHA.cpp │ ├── MHA.out │ ├── notes.md │ ├── results.copy_stats.csv │ ├── results.db │ ├── results.hip_stats.csv │ ├── results.hsa_stats.csv │ ├── results.json │ ├── results.stats.csv │ └── results.sysinfo.txt ├── day38 └── myreduction.cpp ├── day39 └── advancedcudamm.cu ├── day40 └── flaship.cpp ├── day41 └── MLA.cu ├── day42 ├── mat_mul.py └── mat_mul_2.py ├── day43 └── rope.py ├── day44 ├── average_duration_per_block_size.png ├── benchmark_results.csv ├── duration_vs_total_elements.png └── tritonkernel.py ├── day45 └── cross_entropy │ └── cross_entropy.py ├── day46 └── flash_attention.py ├── day47 ├── hip_cooperative_groups.h └── kernel.cpp ├── day48 └── kernel.py ├── day49 └── kernel.py ├── day50 └── tritonnn.py ├── day51 └── main.py ├── day52 └── functionsused.py ├── day53 └── layer_norm.py ├── day54 └── softmax.py ├── day55 └── ddpm.py ├── day56 └── main.py ├── day57 └── main.py ├── day58 └── layer_norm.cpp ├── day59 └── test.py ├── day60 └── fused.py ├── day61 └── backprop.py ├── day62 └── main.py ├── day63 └── lstm.py ├── day64 └── main.py ├── day65 └── quant.cpp ├── day66 └── kernel.cpp ├── day67 └── lora.py ├── day68 └── adam.py ├── day69 └── main.py ├── day70 └── gla.py ├── day71 └── main.py ├── day72 └── main.py ├── day73 └── code.py ├── day74 └── kernel.py ├── day75 └── kernel.py ├── day76 └── kernel.py ├── day77 └── main.py ├── day78 └── rmsnorm.py ├── day79 └── main.py ├── day80 └── kernel.py ├── day81 └── main.py ├── day82 └── rope.py ├── day83 └── lin.py ├── day84 └── kernel.py ├── day85 └── TensorMatMul.cu ├── day86 └── hard_sigmoid.cu ├── day87 └── SymMatMul.cu ├── day88 └── MSE.cu ├── day89 └── LTMM.cu ├── day90 └── FrobeniusNorm.cu ├── day91 └── Hinge_Loss.cu ├── day92 └── 1D_Convolution.cu ├── day93 └── RMS_Normalization.cu ├── day94 └── ELU.cu ├── day95 └── 2D_Max_Pooling.cu ├── day96 └── Product_Over_Dimension.cu ├── day97 └── elu_optim.cu ├── day98 └── kernel.cpp ├── day99 └── kernel.cpp ├── notes └── offsetcudatriton.md └── nvidiadocs └── addition.cu /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_DIR := $(CURDIR) 2 | 3 | COLOR_RESET := \033[0m 4 | COLOR_GREEN := \033[32m 5 | COLOR_YELLOW := \033[33m 6 | COLOR_BLUE := \033[34m 7 | COLOR_RED := \033[31m 8 | 9 | CUDA_ARCH := sm_89 # Specify CUDA architecture (e.g., sm_89 for RTX 4070) 10 | 11 | all: build 12 | 13 | build: $(PROJECT_DIR)/$(dir)/$(program).out 14 | 15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cu 16 | @echo "$(COLOR_YELLOW)Building program $(program) in directory $(dir)...$(COLOR_RESET)" 17 | @nvcc -arch=$(CUDA_ARCH) -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -o $@ $< -lcuda 18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)" 19 | 20 | run: $(PROJECT_DIR)/$(dir)/$(program).out 21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)" 22 | @./$(dir)/$(program).out 23 | 24 | clean: 25 | @echo "$(COLOR_RED)Cleaning up .out files in directory $(dir)...$(COLOR_RESET)" 26 | @rm -f $(PROJECT_DIR)/$(dir)/*.out 27 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)" 28 | 29 | cleanall: 30 | @echo "$(COLOR_RED)Cleaning up all .out files in all directories...$(COLOR_RESET)" 
31 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \; 32 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)" 33 | 34 | help: 35 | @echo "$(COLOR_BLUE)Usage instructions for Makefile:$(COLOR_RESET)" 36 | @echo "" 37 | @echo "$(COLOR_YELLOW)make dir= program=$(COLOR_RESET) # Build the program .cu in directory " 38 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory " 39 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory " 40 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 41 | @echo "" 42 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)" 43 | @echo "$(COLOR_GREEN)make dir=day1 program=addition$(COLOR_RESET) # Build addition.cu in day1" 44 | @echo "$(COLOR_GREEN)make run dir=day1 program=addition$(COLOR_RESET) # Run addition.out in day1" 45 | @echo "$(COLOR_GREEN)make clean dir=day1$(COLOR_RESET) # Clean up .out files in day1" 46 | @echo "$(COLOR_GREEN)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 47 | -------------------------------------------------------------------------------- /day01/addition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void vectorAdd(const float* A , const float *B, float *C, int N){ 5 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 6 | // so blockIdx.x -> is the ID of thread 7 | // block dim = the size of the window we work on it 8 | // threaidx = 9 | if (idx>>(d_A,d_B,d_C,N); 41 | 42 | cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost); 43 | for(int i =N-10;i>>(N); 20 | 21 | // Wait for the device to finish 22 | cudaDeviceSynchronize(); 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /day02/function.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __device__ float square(int x){ 5 | return x*x; 6 | // __device__ marked function can only be called from anoter device function 7 | // or a kernel method 8 | } 9 | 10 | __global__ void voidKernel(int *input,int *output,int N) { 11 | int i = blockIdx.x * blockDim.x + threadIdx.x; 12 | if (i < N){ 13 | output[i] = square(input[i]); 14 | } 15 | } 16 | 17 | 18 | int main(){ 19 | int N = 10; // size of input and output arrays 20 | int size = N*sizeof(int); // total memory to allocate for the ararys 21 | int *h_input = new int[N]; // alocate memory on the CPU 22 | int *h_output = new int[N]; // alocate memory on the CPU 23 | 24 | for(int i = 0;i>>(d_input, d_output, N); cudaMemcpy(h_output,d_output,size,cudaMemcpyDeviceToHost); 38 | 39 | std::cout << "Squared array: "; 40 | for (int i = 0; i < N; i++) { 41 | std::cout << h_output[i] << " "; 42 | } 43 | std::cout << std::endl; 44 | 45 | delete[] h_input; 46 | delete[] h_output; 47 | cudaFree(d_input); 48 | cudaFree(d_output); 49 | 50 | return 0; 51 | } -------------------------------------------------------------------------------- /day02/function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def __kernelfunction__(input_pointer, output_pointer, N, 7 | BLOCKSIZE: tl.constexpr): 8 | pid = tl.program_id(0) # Get the program (block) ID 9 | 10 | offset = pid * BLOCKSIZE + tl.arange(0, BLOCKSIZE) 11 | mask = offset < N 12 | 13 | input_data = 
tl.load(input_pointer + offset, mask=mask) 14 | output_data = tl.sqrt(input_data) 15 | tl.store(output_pointer + offset, output_data, mask=mask) 16 | 17 | def main(): 18 | N = 10 19 | 20 | input_data = torch.arange(0, N, dtype=torch.float32) 21 | print("Input data:", input_data) 22 | 23 | output_data = torch.empty_like(input_data) 24 | 25 | input_ptr = input_data.to("cuda") 26 | output_ptr = output_data.to("cuda") 27 | 28 | BLOCKSIZE = 256 29 | 30 | GRID = (triton.cdiv(N, BLOCKSIZE),) 31 | 32 | __kernelfunction__[GRID](input_ptr, output_ptr, N, BLOCKSIZE=BLOCKSIZE) 33 | 34 | output_data = output_ptr.cpu() 35 | print("Output data:", output_data) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /day03/addMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void printMatrix(const float *Matrix, const int size = 16) { 7 | int rootSize = sqrt(size); 8 | for (int i = 0; i < rootSize; i++) { 9 | for (int j = 0; j < rootSize; j++) { 10 | std::cout << Matrix[i * rootSize + j] << " "; 11 | } 12 | std::cout << "\n"; 13 | } 14 | } 15 | 16 | __global__ void matrixAddCUDA(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, 17 | const int sizeX, const int sizeY) { 18 | int col = blockIdx.x * blockDim.x + threadIdx.x; 19 | int row = blockIdx.y * blockDim.y + threadIdx.y; 20 | 21 | if (row < sizeY && col < sizeX) { 22 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col]; 23 | } 24 | } 25 | 26 | void matrixAddCPU(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, int sizeX, int sizeY) { 27 | for (int row = 0; row < sizeY; row++) { 28 | for (int col = 0; col < sizeX; col++) { 29 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col]; 30 | } 31 | } 32 | } 33 | 34 | void compareExecutionTime(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, 35 | const int sizeX, const int sizeY) { 36 | const int matrixSize = sizeX * sizeY; 37 | const int matrixBytes = sizeof(float) * matrixSize; 38 | 39 | float *gpu_A, *gpu_B, *gpu_C; 40 | cudaMalloc((void **)&gpu_A, matrixBytes); 41 | cudaMalloc((void **)&gpu_B, matrixBytes); 42 | cudaMalloc((void **)&gpu_C, matrixBytes); 43 | 44 | cudaMemcpy(gpu_A, Matrix_A, matrixBytes, cudaMemcpyHostToDevice); 45 | cudaMemcpy(gpu_B, Matrix_B, matrixBytes, cudaMemcpyHostToDevice); 46 | 47 | int BLOCK_SIZE = 32; 48 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 49 | dim3 gridDim((sizeX + BLOCK_SIZE - 1) / BLOCK_SIZE, (sizeY + BLOCK_SIZE - 1) / BLOCK_SIZE); 50 | 51 | auto startCPU = std::chrono::high_resolution_clock::now(); 52 | matrixAddCPU(Matrix_A, Matrix_B, Matrix_C, sizeX, sizeY); 53 | auto endCPU = std::chrono::high_resolution_clock::now(); 54 | 55 | auto startCUDA = std::chrono::high_resolution_clock::now(); 56 | matrixAddCUDA<<>>(gpu_A, gpu_B, gpu_C, sizeX, sizeY); 57 | cudaDeviceSynchronize(); 58 | auto endCUDA = std::chrono::high_resolution_clock::now(); 59 | 60 | cudaMemcpy(Matrix_C, gpu_C, matrixBytes, cudaMemcpyDeviceToHost); 61 | 62 | std::chrono::duration cpuDuration = endCPU - startCPU; 63 | std::chrono::duration cudaDuration = endCUDA - startCUDA; 64 | std::cout << "CPU Execution Time: " << cpuDuration.count() << " seconds\n"; 65 | std::cout << "CUDA Execution Time: " << cudaDuration.count() << " seconds\n"; 66 | 67 | cudaFree(gpu_A); 68 | cudaFree(gpu_B); 69 | cudaFree(gpu_C); 70 | } 71 | 72 
| int main() { 73 | const int sizeX = 1024*16; 74 | const int sizeY = 1024*16; 75 | const int matrixSize = sizeX * sizeY; 76 | 77 | float *cpu_A = new float[matrixSize]; 78 | float *cpu_B = new float[matrixSize]; 79 | float *cpu_C = new float[matrixSize]; 80 | 81 | for (int i = 0; i < matrixSize; i++) { 82 | cpu_A[i] = 10.0f; 83 | cpu_B[i] = static_cast(i); 84 | } 85 | 86 | compareExecutionTime(cpu_A, cpu_B, cpu_C, sizeX, sizeY); 87 | 88 | delete[] cpu_A; 89 | delete[] cpu_B; 90 | delete[] cpu_C; 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /day03/addMatrix.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import torch 3 | import triton.language as tl 4 | 5 | 6 | @triton.jit 7 | def addMatrix(Matrix_A,Matrix_B,Matrix_C,sizeX,sizeY,BLOCK_SIZE:tl.constexpr): 8 | 9 | pid_x = tl.program_id(0) # we have the rows 10 | pid_y = tl.program_id(1) # we have the collumns 11 | 12 | row_start = pid_x*BLOCK_SIZE 13 | col_start = pid_y*BLOCK_SIZE 14 | 15 | row_indices = row_start + tl.arange(0,BLOCK_SIZE) 16 | col_indices = col_start + tl.arange(0,BLOCK_SIZE) 17 | 18 | row_indices = row_indices[:,None] 19 | col_indices = col_indices[None,:] 20 | 21 | row_mask = row_indices < sizeY 22 | col_mask = col_indices < sizeX 23 | valid_mask = row_mask & col_mask 24 | 25 | flat_indicies = row_indices * sizeX + col_indices 26 | 27 | A = tl.load(Matrix_A + flat_indicies,mask =valid_mask,other=0.0) 28 | B = tl.load(Matrix_B + flat_indicies,mask = valid_mask,other = 0.0) 29 | 30 | C = A+B; 31 | 32 | tl.store(Matrix_C+flat_indicies,C,mask=valid_mask) 33 | 34 | 35 | def test_addMatrix(): 36 | sizeX = 8 37 | sizeY = 8 38 | BLOCK_SIZE = 2 39 | 40 | Matrix_A = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32) 41 | Matrix_B = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32) 42 | Matrix_C = torch.zeros_like(Matrix_A, device='cuda', dtype=torch.float32) 43 | 44 | Matrix_A_flat = Matrix_A.flatten() 45 | Matrix_B_flat = Matrix_B.flatten() 46 | Matrix_C_flat = Matrix_C.flatten() 47 | 48 | grid = (triton.cdiv(sizeX, BLOCK_SIZE), triton.cdiv(sizeY, BLOCK_SIZE)) 49 | addMatrix[grid](Matrix_A_flat, Matrix_B_flat, Matrix_C_flat, sizeX, sizeY, BLOCK_SIZE) 50 | 51 | Matrix_C = Matrix_C_flat.reshape(sizeY, sizeX) 52 | 53 | expected = Matrix_A + Matrix_B 54 | print("Matrix A:\n", Matrix_A) 55 | print("Matrix B:\n", Matrix_B) 56 | print("Matrix C (Triton):\n", Matrix_C) 57 | print("Expected (PyTorch):\n", expected) 58 | assert torch.allclose(Matrix_C, expected), "Triton result does not match PyTorch result!" 
59 | 60 | test_addMatrix() 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /day03/anotherMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __device__ float randomFunction(float x, float y) 5 | { 6 | return x + y * 2; 7 | } 8 | 9 | __global__ void matrixFunction(const float *A, const float *B, float *C, const int size) 10 | { 11 | int i = blockIdx.x * blockDim.x + threadIdx.x; 12 | int j = blockIdx.y * blockDim.y + threadIdx.y; 13 | 14 | if (i < size && j < size) 15 | { 16 | C[i + size * j] = randomFunction(A[i + size * j], B[i + size * j]); 17 | } 18 | } 19 | 20 | int main() 21 | { 22 | int N = 8; 23 | int BLOCK_SIZE = 2; 24 | dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE); 25 | dim3 gridDim(N + BLOCK_SIZE - 1 / BLOCK_SIZE, N + BLOCK_SIZE - 1 / BLOCK_SIZE); 26 | int size = sizeof(float) * N * N; 27 | 28 | float *A,*B,*C; 29 | float *dA,*dB,*dC; 30 | A = new float[N*N]; 31 | B = new float[N*N]; 32 | C = new float[N*N]; 33 | 34 | cudaMalloc((void**)&dA,size); 35 | cudaMalloc((void**)&dB,size); 36 | cudaMalloc((void**)&dC,size); 37 | 38 | for (int i = 0; i < N; ++i) { 39 | for (int j = 0; j < N; ++j) { 40 | A[i + N * j] = 1.0f; 41 | B[i + N * j] = 2.0f; 42 | } 43 | } 44 | 45 | cudaMemcpy(dA,A,size,cudaMemcpyHostToDevice); 46 | cudaMemcpy(dB,B,size,cudaMemcpyHostToDevice); 47 | 48 | // now we have everything set up 49 | matrixFunction<<>>(dA,dB,dC,N); 50 | cudaDeviceSynchronize(); 51 | 52 | cudaMemcpy(C,dC,size,cudaMemcpyDeviceToHost); 53 | 54 | for (int i = 0; i < N*N; i++) { 55 | std::cout << C[i] << " "; 56 | if ((i + 1) % N == 0) std::cout << std::endl; 57 | } 58 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | __global__ void addKernel(T* input, int arraySize) { 6 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 7 | if (idx < arraySize) { 8 | input[idx] += 10; 9 | } 10 | } 11 | 12 | void addition(torch::Tensor& input, int arraySize) { 13 | int threads_per_block = 256; 14 | int blocks = (arraySize + threads_per_block - 1) / threads_per_block; 15 | 16 | AT_DISPATCH_FLOATING_TYPES(input.type(), "arrayAddition", [&]() { 17 | addKernel<<>>(input.data_ptr(), arraySize); 18 | }); 19 | cudaDeviceSynchronize(); 20 | 21 | auto err = cudaGetLastError(); 22 | if (err != cudaSuccess) { 23 | TORCH_CHECK(false, "CUDA error: ", cudaGetErrorString(err)); 24 | } 25 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: additionKernel 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | additionKernel.cu 2 | 
additionKernelBinding.cpp 3 | setup.py 4 | additionKernel.egg-info/PKG-INFO 5 | additionKernel.egg-info/SOURCES.txt 6 | additionKernel.egg-info/dependency_links.txt 7 | additionKernel.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | additionKernel 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernelBinding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void addition(torch::Tensor& input, int arraySize); 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("addition", &addition, "Adds 10 to each element of the tensor"); 7 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: additionkernel 3 | Version: 0.0.0 4 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | additionKernel.cu 2 | additionKernelBinding.cpp 3 | setup.py 4 | additionkernel.egg-info/PKG-INFO 5 | additionkernel.egg-info/SOURCES.txt 6 | additionkernel.egg-info/dependency_links.txt 7 | additionkernel.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | additionkernel 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o -------------------------------------------------------------------------------- /day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o -------------------------------------------------------------------------------- /day06/AdditionKernel/pythontest.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import additionkernel 3 | 4 | input_tensor = torch.randn(100).cuda() 5 | additionkernel.addition(input_tensor, input_tensor.size(0)) 6 | print("Result after addition:", input_tensor) -------------------------------------------------------------------------------- /day06/AdditionKernel/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='additionkernel', 6 | ext_modules=[ 7 | CUDAExtension( 8 | name='additionkernel', 9 | sources=[ 10 | 'additionKernelBinding.cpp', 11 | 'additionKernel.cu', 12 | ] 13 | ) 14 | ], 15 | cmdclass={ 16 | 'build_ext': BuildExtension 17 | } 18 | ) -------------------------------------------------------------------------------- /day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o -------------------------------------------------------------------------------- /day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: example_kernels 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | rollcall.cu 2 | rollcallbinding.cpp 3 | setup.py 4 | example_kernels.egg-info/PKG-INFO 5 | example_kernels.egg-info/SOURCES.txt 6 | example_kernels.egg-info/dependency_links.txt 7 | example_kernels.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | example_kernels 2 | -------------------------------------------------------------------------------- /day06/ImportingToPython/pythontest.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import example_kernels 3 | example_kernels.rollcall() -------------------------------------------------------------------------------- /day06/ImportingToPython/rollcall.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void roll_call_kernel() { 5 | const int threadIndex = threadIdx.x; 6 | printf("Thread %d here!\n", threadIndex); 7 | printf("Te iubesc atat de mult: %d \n",threadIndex*1000); 8 | } 9 | 10 | void roll_call_launcher() { 11 | roll_call_kernel<<<1, 5>>>(); 12 | cudaDeviceSynchronize(); 13 | } 14 | 15 | int main() { 16 | roll_call_launcher(); 17 | return 0; 18 | } -------------------------------------------------------------------------------- /day06/ImportingToPython/rollcallbinding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void roll_call_launcher(); 5 | 6 | void roll_call_binding(){ 7 | roll_call_launcher(); 8 | } 9 | 10 | PYBIND11_MODULE(example_kernels, m) { 11 | m.def( 12 | "rollcall", // Name of the Python function to create 13 | &roll_call_binding, // Corresponding C++ function to call 14 | "Launches the roll_call kernel" // Docstring 15 | ); 16 | } 17 | -------------------------------------------------------------------------------- /day06/ImportingToPython/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | __version__ = "0.0.1" 5 | 6 | ext_modules = [ 7 | CUDAExtension('example_kernels', 8 | [ 9 | 'rollcallbinding.cpp', 10 | 'rollcall.cu', 11 | ]) 12 | ] 13 | 14 | setup( 15 | name="example_kernels", 16 | version=__version__, 17 | ext_modules=ext_modules, 18 | cmdclass={"build_ext": BuildExtension} 19 | ) 20 | 
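[Usage sketch, not a file from the repository: the day06 AdditionKernel and ImportingToPython directories above are standard torch.utils.cpp_extension CUDAExtension packages, so they are normally compiled in place (e.g. "python setup.py build_ext --inplace" or "python setup.py install", per setup.py) before pythontest.py is run. The snippet below is a minimal, assumed calling pattern mirroring pythontest.py; the module and function names come from PYBIND11_MODULE / setup.py, everything else is an assumption.]

    import torch              # import torch first so its shared libraries are loaded
    import example_kernels    # module name declared in rollcallbinding.cpp / setup.py

    example_kernels.rollcall()   # launches roll_call_kernel<<<1, 5>>> and synchronizes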
-------------------------------------------------------------------------------- /day06/SMBlocks.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void sm_roll_call() { 5 | const int threadIndex = threadIdx.x; 6 | 7 | uint streamingMultiprocessorId; 8 | asm("mov.u32 %0, %smid;" : "=r"(streamingMultiprocessorId) ); 9 | 10 | printf("Thread %d running on SM %d!\n", threadIndex, streamingMultiprocessorId); 11 | } 12 | 13 | int main() { 14 | sm_roll_call<<<4, 2>>>(); 15 | cudaDeviceSynchronize(); 16 | return 0; 17 | } -------------------------------------------------------------------------------- /day06/SoftMax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void SoftMaxNaive(float *input,float *output,int size){ 5 | int numThreads = blockDim.x; 6 | 7 | 8 | //each thread to compute softmax for this: 9 | int numElementsPerThread = size/numThreads; 10 | 11 | int threadIndex = threadIdx.x; 12 | 13 | int startIndex = threadIndex * numElementsPerThread; 14 | int endIndex = min(size,startIndex* numElementsPerThread); 15 | 16 | 17 | float MaxValue = 0.0; 18 | for (int i = 0; i < size; i++) { 19 | if (input[i] > MaxValue) { 20 | MaxValue = input[i]; 21 | } 22 | } 23 | 24 | float sumExp = 0.0; 25 | for(int i =0;i MaxValue) { 47 | MaxValue = input[i]; 48 | } 49 | } 50 | SharedMaxValue[threadIndex] = MaxValue; 51 | __syncthreads(); 52 | for (int i = 0; i < numThreads; i++) { 53 | if (SharedMaxValue[i] > MaxValue) { 54 | MaxValue = SharedMaxValue[i]; 55 | } 56 | } 57 | 58 | 59 | /// Now we need to calcualte the SumExp 60 | __shared__ float sharedSumExp[numThreads]; 61 | float sumExp = 0.0; 62 | for(int i =startIndex;i 2 | #include 3 | 4 | __global__ void transposeKernel(int *A, int *B) 5 | { 6 | const int idx = threadIdx.x + threadIdx.y * blockDim.x; 7 | // threadIDx.x -> id of the row 8 | // threadIdx.y -> id of the collumn 9 | // BlockDim.x -> the size of the Dimension of the row 10 | // So we will get the idx to be on the element in the flattned matrix 11 | 12 | // 1 2 3 1 2 5 13 | // 2 3 4 -> 2 3 2 14 | // 5 2 1 3 4 1 15 | const int outidx = threadIdx.y + threadIdx.x * blockDim.y; 16 | B[outidx] = A[idx]; 17 | } 18 | 19 | int main() 20 | { 21 | int rows = 3; 22 | int cols = 3; 23 | int sizeMatrix = rows * cols; 24 | int *Matrix = (int *)malloc(sizeof(int) * cols * rows); 25 | for (int i = 0; i < sizeMatrix; i++) 26 | { 27 | Matrix[i] = i; 28 | } 29 | for (int i = 0; i < sizeMatrix; i++) 30 | { 31 | std::cout << Matrix[i] << " "; 32 | if (i % cols == cols - 1) 33 | std::cout << std::endl; 34 | } 35 | 36 | int *MatrixD, *MatrixOut; 37 | cudaMalloc((void **)&MatrixD, sizeMatrix * sizeof(int)); 38 | cudaMalloc((void **)&MatrixOut, sizeMatrix * sizeof(int)); 39 | cudaMemcpy(MatrixD, Matrix, sizeMatrix * sizeof(int), cudaMemcpyHostToDevice); 40 | 41 | dim3 numThreadsPerBlock(rows, cols); 42 | 43 | cudaFuncSetAttribute( 44 | transposeKernel, 45 | cudaFuncAttributePreferredSharedMemoryCarveout, 46 | 20 // Use 20% of combined L1/Shared Memory for Shared Memory 47 | ); 48 | transposeKernel<<<1, numThreadsPerBlock>>>(MatrixD, MatrixOut); 49 | 50 | cudaMemcpy(Matrix, MatrixOut, sizeMatrix * sizeof(float), cudaMemcpyDeviceToHost); 51 | std::cout << "\nTransposed\n"; 52 | for (int i = 0; i < sizeMatrix; i++) 53 | { 54 | std::cout << Matrix[i] << " "; 55 | if (i % rows == rows - 1) 56 | std::cout << std::endl; 57 | } 58 | 59 | cudaFree(MatrixD); 60 | 
cudaFree(MatrixOut); 61 | free(Matrix); 62 | 63 | return 0; 64 | } -------------------------------------------------------------------------------- /day06/note: -------------------------------------------------------------------------------- 1 | I will work more on this day to surprise my biggest supporter in this Journey :D 2 | I will start wit this tutorial : https://tinkerd.net/blog/machine-learning/cuda-basics/ 3 | And later this day will continue with working on the softmax forward + backward -------------------------------------------------------------------------------- /day07/conv1d.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void conv1D(float *X, float *K, float *Y, int input_size, int kernel_size) 5 | { 6 | 7 | extern __shared__ float shared[]; 8 | 9 | int i = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | int radius = kernel_size / 2; 12 | 13 | int sharedIdx = threadIdx.x + radius; // the main element from the conv 14 | // index will start from the radius so that we have left 2 more behind use 15 | /// SO we load in the share memory all the elements our filter will work on the block 16 | if (threadIdx.x < blockDim.x - radius) 17 | { 18 | int left = i - radius; 19 | int right = i + blockDim.x; 20 | 21 | shared[threadIdx.x] = (left >= 0) ? X[left] : 0.0f; 22 | shared[sharedIdx + blockDim.x] = (right < input_size) ? X[right] : 0.0f; 23 | } 24 | 25 | __syncthreads(); 26 | 27 | float sum = 0.0; 28 | for (int j = -radius; j <= radius; j++) 29 | { 30 | sum += shared[sharedIdx + j] * K[radius + j]; 31 | // we iterate from -2 to 2 . so we have -2 -1 0 1 2. Which is normal 32 | // So we have this: 33 | } 34 | 35 | if (i < input_size) 36 | { 37 | Y[i] = sum; 38 | } 39 | } 40 | 41 | int main() 42 | { 43 | int N = 1024; // size of the vector 44 | int BlockSize = 256; // size of the block we use 45 | int GridSize = (N + BlockSize - 1) / BlockSize; // size of the grid we use. 
Also ceil function 46 | 47 | int KernelSize = 5; 48 | float Kernel[KernelSize] = {1.0f, 2.0f, 1.0f, 1.0f, -2.0f}; 49 | int radius = KernelSize / 2; 50 | int SharedMemory = (BlockSize + 2 * radius) * sizeof(float); 51 | 52 | float *Xcpu, *Ycpu; 53 | float *Xgpu, *Ygpu, *Kgpu; 54 | 55 | Xcpu = (float *)malloc(N * sizeof(float)); 56 | Ycpu = (float *)malloc(N * sizeof(float)); 57 | // we already have declared our kernel; 58 | 59 | for (int i = 0; i < N; i++) 60 | { 61 | Xcpu[i] = 1; 62 | } 63 | 64 | // now lets launch this data in the air baby 65 | cudaMalloc((void **)&Xgpu, N * sizeof(float)); 66 | cudaMalloc((void **)&Ygpu, N * sizeof(float)); 67 | cudaMalloc((void **)&Kgpu, KernelSize * sizeof(float)); 68 | cudaMemcpy(Xgpu, Xcpu, N * sizeof(float), cudaMemcpyHostToDevice); 69 | cudaMemcpy(Kgpu, Kernel, KernelSize * sizeof(float), cudaMemcpyHostToDevice); 70 | 71 | conv1D<<>>(Xgpu, Kgpu, Ygpu, N, KernelSize); 72 | 73 | cudaMemcpy(Ycpu, Ygpu, N * sizeof(float), cudaMemcpyDeviceToHost); 74 | 75 | std::cout << "First 10 elements " << std::endl; 76 | for (size_t i = 0; i < 10; i++) 77 | { 78 | std::cout << Xcpu[i] << " "; 79 | } 80 | 81 | std::cout << "\nFirst 10 elements after the convolution op" << std::endl; 82 | for (size_t i = 0; i < 10; i++) 83 | { 84 | std::cout << Ycpu[i] << " "; 85 | } 86 | 87 | free(Xcpu); 88 | free(Ycpu); 89 | cudaFree(Xgpu); 90 | cudaFree(Ygpu); 91 | cudaFree(Kgpu); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /day07/globalMemoryCoalescing.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) 5 | int M = 10; 6 | int N = 10; 7 | 8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1); 9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1 10 | 11 | __global__ void sgemm_naive(int M, int N, int K, float alpha, 12 | const float *A, const float *B, float beta, float *C) 13 | { 14 | const int x = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE); 15 | const int y = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE); 16 | 17 | if (x < M && y 2 | #include 3 | #define BLOCK_SIZE 32 4 | 5 | __global__ void matmulKernel(float *A, float *B, float *C, int dim) 6 | { 7 | int i, j; // i and j indexes 8 | float temp = 0; // temp value 9 | 10 | int row = blockIdx.y * blockDim.y + threadIdx.y; 11 | int col = blockIdx.x * blockDim.x + threadIdx.x; 12 | 13 | __shared__ float ASharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory for shared 14 | __shared__ float BSharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory fro shared 15 | 16 | for (int tileNUM = 0; tileNUM < gridDim.x; tileNUM++) 17 | { 18 | j = tileNUM * BLOCK_SIZE + threadIdx.x; 19 | i = tileNUM * BLOCK_SIZE + threadIdx.y; 20 | 21 | ASharedT[threadIdx.y][threadIdx.x] = A[i * dim + j]; 22 | BSharedT[threadIdx.y][threadIdx.x] = B[i * dim + j]; 23 | 24 | __syncthreads(); 25 | 26 | for (int k = 0; k < BLOCK_SIZE; k++) 27 | { 28 | temp += ASharedT[threadIdx.y][k] * BSharedT[k][threadIdx.x]; 29 | } 30 | 31 | __syncthreads(); 32 | } 33 | C[row * dim + col] = temp; 34 | } 35 | 36 | int main() 37 | { 38 | int N = 1024; 39 | float *Acpu, *Bcpu, *Ccpu; 40 | float *Agpu, *Bgpu, *Cgpu; 41 | 42 | Acpu = (float *)malloc(N * N * sizeof(float)); 43 | Bcpu = (float *)malloc(N * N * sizeof(float)); 44 | Ccpu = (float *)malloc(N * N * sizeof(float)); 45 | 46 | for (int i = 0; i < N * N; i++) 47 | { 48 | Acpu[i] = sin(i); 49 | Bcpu[i] = cos(i); 50 | } 51 | 52 | size_t 
vectorSize = N * N * sizeof(float); 53 | 54 | cudaMalloc((void **)&Agpu, vectorSize); 55 | cudaMalloc((void **)&Bgpu, vectorSize); 56 | cudaMalloc((void **)&Cgpu, vectorSize); 57 | cudaMemcpy(Agpu, Acpu, vectorSize, cudaMemcpyHostToDevice); 58 | cudaMemcpy(Bgpu, Bcpu, vectorSize, cudaMemcpyHostToDevice); 59 | 60 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 61 | dim3 gridDim(N / BLOCK_SIZE, N / BLOCK_SIZE); 62 | 63 | cudaEvent_t start, stop; 64 | cudaEventCreate(&start); 65 | cudaEventCreate(&stop); 66 | cudaEventRecord(start, 0); 67 | 68 | matmulKernel<<>>(Agpu, Bgpu, Cgpu, N); 69 | 70 | cudaEventRecord(stop, 0); 71 | cudaEventSynchronize(stop); 72 | float et; 73 | cudaEventElapsedTime(&et, start, stop); 74 | cudaEventDestroy(start); 75 | cudaEventDestroy(stop); 76 | 77 | cudaMemcpy(Ccpu, Cgpu, vectorSize, cudaMemcpyDeviceToHost); 78 | 79 | printf("GPU time= %f ms\n", et); 80 | 81 | free(Acpu); 82 | free(Bcpu); 83 | free(Ccpu); 84 | cudaFree(Agpu); 85 | cudaFree(Bgpu); 86 | cudaFree(Cgpu); 87 | 88 | return 0; 89 | } -------------------------------------------------------------------------------- /day07/naive.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) 5 | int M = 10; 6 | int N = 10; 7 | 8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1); 9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1 10 | 11 | __global__ void sgemm_naive(int M, int N, int K, float alpha, 12 | const float *A, const float *B, float beta, float *C) 13 | { 14 | const uint x = blockIdx.x * blockDim.x + threadIdx.x; 15 | const uint y = blockIdx.y * blockDim.y + threadIdx.y; 16 | 17 | if (x < M && y 2 | #include 3 | -------------------------------------------------------------------------------- /day08/pmpbook/chapter3matvecmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CUDA_CHECK(err) \ 5 | { \ 6 | cuda_assert((err), __FILE__, __LINE__); \ 7 | } 8 | inline void cuda_assert(cudaError_t code, const char *file, int line) 9 | { 10 | if (code != cudaSuccess) 11 | { 12 | std::cerr << "CUDA Error: " << cudaGetErrorString(code) 13 | << " in " << file << ":" << line << std::endl; 14 | exit(1); 15 | } 16 | } 17 | 18 | __global__ void matrixveckernel(const float *A,const float*b,float*C,const int N){ 19 | // N the size of the NxN A matrix 20 | // N aslo the size of the vector 21 | // we need so that each thread will iterate the row 22 | 23 | int i = blockIdx.x * blockDim.x + threadIdx.x; 24 | // we got 25 | 26 | if(i>>(dA,db,dc,N); 53 | CUDA_CHECK(cudaGetLastError()); 54 | 55 | CUDA_CHECK(cudaMemcpy(c,dc,sizeb,cudaMemcpyDeviceToHost)); 56 | 57 | 58 | CUDA_CHECK(cudaFree(dA)); 59 | CUDA_CHECK(cudaFree(db)); 60 | CUDA_CHECK(cudaFree(dc)); 61 | 62 | } 63 | 64 | int main(){ 65 | int N = 1024; 66 | float *A = new float[N * N]; 67 | float *b = new float[N]; 68 | 69 | for(int i = 0 ;i 2 | #include 3 | 4 | __global__ void color2graykernel(const float* R, const float*G,const float*B,float *O,const int n){ 5 | // assume the matrix is nxn; 6 | 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; // so this will be for collumns 8 | int j = blockIdx.y * blockDim.y + threadIdx.y; // this will be for rows 9 | 10 | 11 | if( i>>(d_r,d_g,d_b,d_o,n); 34 | 35 | float *O = (float*)malloc(size); 36 | cudaMemcpy(O,d_o,size,cudaMemcpyDeviceToHost); 37 | 38 | cudaFree(d_r); 39 | cudaFree(d_g); 40 | cudaFree(d_b); 41 | cudaFree(d_o); 42 | 43 | return O; 44 | } 
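[Reference check, not a file from the repository: a quick host-side way to sanity-check the color2gray kernel above. The 0.21/0.72/0.07 luminosity weights are an assumption — substitute whatever constants the kernel actually uses; "gray" stands for the n*n output buffer copied back from the device.]

    import torch

    n = 64
    R, G, B = (torch.rand(n, n) for _ in range(3))      # host copies of the three channels
    expected = 0.21 * R + 0.72 * G + 0.07 * B            # assumed luminosity weights
    # compare against the kernel result once it has been copied back and reshaped:
    # assert torch.allclose(gray.reshape(n, n), expected, atol=1e-5)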
-------------------------------------------------------------------------------- /day08/pmpbook/deviceinfo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int dev_count; 7 | cudaGetDeviceCount(&dev_count); 8 | std::cout << "Devices are : " << dev_count << std::endl; 9 | 10 | cudaDeviceProp dev_prop; 11 | for (int i = 0; i < dev_count; ++i) 12 | { 13 | cudaGetDeviceProperties(&dev_prop, i); 14 | } 15 | std::cout << "Max Threads per Block : " << dev_prop.maxThreadsPerBlock << std::endl; 16 | std::cout << "Max Threads per MultiProcessor :" << dev_prop.maxThreadsPerMultiProcessor << std::endl; 17 | std::cout << "Max Blocks per MultiProcessor : " << dev_prop.maxBlocksPerMultiProcessor << std::endl; 18 | std::cout << "Clock rate : " << dev_prop.clockRate << std::endl; 19 | std::cout << "Max Grid Size (X,Y,Z) : (" << dev_prop.maxGridSize[0] << "," << dev_prop.maxGridSize[1] << "," << dev_prop.maxGridSize[2] << ")" << std::endl; 20 | std::cout << "Max Threads Dim (X,Y,Z) : (" << dev_prop.maxThreadsDim[0] << "," << dev_prop.maxThreadsDim[1] << "," << dev_prop.maxThreadsDim[2] << ")" << std::endl; 21 | std::cout << "Max Shared Memory per Block : " << dev_prop.sharedMemPerBlock << std::endl; 22 | std::cout << "Max Shared Memory per MultiProcessor : " << dev_prop.sharedMemPerMultiprocessor << std::endl; 23 | std::cout << "Max Registers per Block : " << dev_prop.regsPerBlock << std::endl; 24 | std::cout << "Max Registers per MultiProcessor : " << dev_prop.regsPerMultiprocessor << std::endl; 25 | std::cout << "Warp Size : " << dev_prop.warpSize << std::endl; 26 | std::cout << "Max Threads per Warp : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl; 27 | std::cout << "Max Warps per MultiProcessor : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl; 28 | std::cout << "Max Warps per Block : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize << std::endl; 29 | std::cout << "Max Warps per Grid : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] << std::endl; 30 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl; 31 | std::cout << "Max Blocks per Device : " << dev_prop.maxBlocksPerMultiProcessor * dev_prop.multiProcessorCount << std::endl; 32 | std::cout << "Max Threads per Device : " << dev_prop.maxThreadsPerBlock * dev_prop.multiProcessorCount << std::endl; 33 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl; 34 | 35 | } -------------------------------------------------------------------------------- /day08/pmpbook/imageblur.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void imageblurkernel(const float *A, float *C, const int sizeArray, const int sizeKernel) 5 | { 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | int j = blockIdx.y * blockDim.y + threadIdx.y; 8 | 9 | int radius = sizeKernel / 2; 10 | 11 | // 1 2 3 2 12 | // 4 5 6 2 13 | // 1 2 3 2 14 | // 5 6 7 2 15 | // 16 | // Sow we lets say we are at index = 1 first element 17 | // we need now to do this : 18 | // we only 
use the blur when if it dosnt overflow 19 | if (i < sizeArray && j < sizeArray) 20 | { 21 | float PixelValue = 0.0; 22 | int pixels = 0; 23 | for (int blurRow = -radius; i <= radius; i++) 24 | { 25 | for (int blurCol = -radius; j <= radius; j++) 26 | { 27 | // so now we are in the kernel 28 | int curRow = i + blurRow; 29 | int curCol = j + blurCol; 30 | 31 | if (curRow < 0 || curRow >= sizeArray || curCol < 0 || curCol >= sizeArray) 32 | { 33 | PixelValue += A[curRow * sizeArray + curCol]; 34 | pixels++; 35 | } 36 | } 37 | } 38 | C[sizeArray * j + i] = PixelValue / pixels; 39 | } 40 | } -------------------------------------------------------------------------------- /day08/pmpbook/vecaddition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | //// CHAPTER 2 DONE 5 | __global__ void addkernel(float *a, float *b, float *c, int N) 6 | { 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (i < N) 9 | { 10 | c[i] = a[i] + b[i]; 11 | } 12 | } 13 | 14 | 15 | void vecAdd(float *A, float *B, float*C,int n){ 16 | int size = n*sizeof(float); 17 | float *d_A, *d_B, *d_C; 18 | 19 | cudaMalloc((void**)&d_A, size); 20 | cudaMalloc((void**)&d_B, size); 21 | cudaMalloc((void**)&d_C, size); 22 | 23 | cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice); 24 | cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice); 25 | 26 | dim3 dimGrid(ceil(n/256.0),1,1); 27 | dim3 dimBlock(256,1,1); 28 | addkernel<<>>(d_A, d_B, d_C, n); 29 | // launches a gri of 4 blocks with 256 threads per block 30 | 31 | cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost); 32 | cudaFree(d_A); 33 | cudaFree(d_B); 34 | cudaFree(d_C); 35 | } -------------------------------------------------------------------------------- /day09/bind.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor forward(torch::Tensor Q, torch::Tensor K, torch::Tensor V); 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("forward", torch::wrap_pybind_function(forward), "forward"); 7 | } -------------------------------------------------------------------------------- /day09/test.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | from torch.utils.cpp_extension import load 6 | 7 | print("LOADING FLASH ATTENTION") 8 | minimal_attn = load(name='minimal_attn', sources=['bind.cpp', 'flashAttentionFromTut.cu'], extra_cuda_cflags=['-O2']) 9 | print("LOADED FLASH ATTENTION") 10 | 11 | batch_size = 16 12 | n_head = 12 13 | seq_len = 64 14 | head_embd = 64 15 | 16 | q = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 17 | k = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 18 | v = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 19 | 20 | print('=== profiling manual attention ===') 21 | 22 | # Our minimal flash attention aims to be faster than this by avoiding HBM read/writes of N^2 matrices. 
23 | def manual_attn(q, k, v): 24 | att = (q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.size(-1)))) 25 | att = F.softmax(att, dim=-1) 26 | y = att @ v 27 | return y 28 | 29 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 30 | manual_result = manual_attn(q, k, v) 31 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10)) 32 | 33 | print('=== profiling minimal flash attention === ') 34 | 35 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 36 | minimal_result = minimal_attn.forward(q, k, v) 37 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10)) 38 | 39 | print('attn values sanity check:', torch.allclose(minimal_result, manual_result, rtol=0, atol=1e-02)) -------------------------------------------------------------------------------- /day10/FlashAttention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void FlashAttention(torch::Tensor &Q, 5 | torch::Tensor &K, 6 | torch::Tensor &V, 7 | torch::Tensor &O, 8 | torch::Tensor &m, 9 | torch::Tensor &l, 10 | const int seq_len, 11 | const int head_dim, 12 | int Tc, int Tr, int Bc, int Br); 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("FlashAttention", &FlashAttention, "FlashAttention forward"); 16 | } -------------------------------------------------------------------------------- /day10/linking/simpleKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void cuda_simpleKernel(float *A); 5 | 6 | void simpleKernel(at::Tensor A) { 7 | cuda_simpleKernel(A.data_ptr()); 8 | } 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("simplekernel", &simpleKernel, "A simple kernel (CUDA)"); 12 | } 13 | -------------------------------------------------------------------------------- /day10/linking/simpleKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "ATen/ATen.h" 4 | 5 | template 6 | __global__ void simpleKernel(T* A) { 7 | A[threadIdx.x] += 100; 8 | } 9 | 10 | void cuda_simpleKernel(float *A ) { 11 | dim3 blocks(1); 12 | simpleKernel<<>>(A); 13 | } -------------------------------------------------------------------------------- /day10/linking/test.py: -------------------------------------------------------------------------------- 1 | from torch.utils.cpp_extension import load 2 | 3 | simplekernel = load( 4 | name='simplekernel', 5 | sources=['simpleKernel.cpp', 'simpleKernel.cu'], 6 | verbose=True 7 | ) 8 | 9 | # Test kernel 10 | import torch 11 | A = torch.zeros(32, device='cuda', dtype=torch.float32) 12 | simplekernel.simplekernel(A) 13 | print(A) 14 | -------------------------------------------------------------------------------- /day10/ppmbook/matrixmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void matrixmulkernel(float *M,float *N,float *P,int width) 4 | { 5 | int row = blockIdx.y * blockDim.y + threadIdx.y; 6 | int col = blockIdx.x * blockDim.x + threadIdx.x; 7 | 8 | if(row < width && col 2 | #include 3 | using namespace nvcuda; 4 | 5 | // Block layout: one block per (batch, head) 6 | template 7 | __global__ void delta_net_attention( 8 | const half* __restrict__ K, // [B, S, D] 9 | const half* __restrict__ V, // [B, S, D] 10 | const half* __restrict__ Q, // [B, S, D] 11 | half* __restrict__ O, // [B, S, D] 12 | int batch, int seq_len) 13 | { 14 | 
extern __shared__ half shared_mem[]; // size = D*D 15 | half* S = shared_mem; // state matrix S 16 | int b = blockIdx.x; // batch index 17 | 18 | // Initialize S to zero 19 | for (int idx = threadIdx.x; idx < D*D; idx += blockDim.x) { 20 | S[idx] = __float2half(0.0f); 21 | } 22 | __syncthreads(); 23 | 24 | // Loop over sequence length 25 | for (int t = 0; t < seq_len; ++t) { 26 | // Load k_t and v_t into registers 27 | half k_vec[D], v_vec[D]; 28 | #pragma unroll 29 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 30 | int base = (b*seq_len + t)*D; 31 | k_vec[i] = K[base + i]; 32 | v_vec[i] = V[base + i]; 33 | } 34 | __syncthreads(); 35 | 36 | // S += v_vec * k_vec^T — outer-product update 37 | for (int i = threadIdx.y; i < D; i += blockDim.y) { 38 | #pragma unroll 39 | for (int j = threadIdx.x; j < D; j += blockDim.x) { 40 | int idx = i*D + j; 41 | float s = __half2float(S[idx]); 42 | s += __half2float(v_vec[i]) * __half2float(k_vec[j]); 43 | S[idx] = __float2half(s); 44 | } 45 | } 46 | __syncthreads(); 47 | 48 | // Load q_t and compute o_t = S * q_vec 49 | half q_vec[D]; 50 | #pragma unroll 51 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 52 | int base = (b*seq_len + t)*D; 53 | q_vec[i] = Q[base + i]; 54 | } 55 | __syncthreads(); 56 | 57 | #pragma unroll 58 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 59 | float o = 0.0f; 60 | #pragma unroll 61 | for (int j = 0; j < D; ++j) { 62 | o += __half2float(S[i*D + j]) * __half2float(q_vec[j]); 63 | } 64 | int out_idx = (b*seq_len + t)*D + i; 65 | O[out_idx] = __float2half(o); 66 | } 67 | __syncthreads(); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /day11/FlashTestPytorch/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void CudaFlashAttention(const float *Q, 5 | const float *K, 6 | const float *V, 7 | float *O, 8 | float *m, 9 | float *l, 10 | const int seq_len, 11 | const int head_dim, 12 | const int batch_size, 13 | const int nr_heads); 14 | 15 | torch::Tensor FlashAttention(torch::Tensor Q, 16 | torch::Tensor K, 17 | torch::Tensor V) 18 | { 19 | int batch_size = Q.size(0); 20 | int nr_heads = Q.size(1); 21 | int seq_len = Q.size(2); 22 | int head_dim = Q.size(3); 23 | 24 | torch::Tensor m = torch::full({batch_size, nr_heads, seq_len}, 25 | -std::numeric_limits::infinity(),Q.options()); 26 | torch::Tensor l = torch::zeros({batch_size, nr_heads, seq_len},Q.options()); 27 | 28 | torch::Tensor O = torch::zeros_like(Q); 29 | CudaFlashAttention(Q.data_ptr(), K.data_ptr(), V.data_ptr(), O.data_ptr(), m.data_ptr(), l.data_ptr(), seq_len, head_dim, batch_size, nr_heads); 30 | return O; 31 | } 32 | 33 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 34 | { 35 | m.def("FlashAttention", &FlashAttention, "FlashAttention (CUDA)"); 36 | } -------------------------------------------------------------------------------- /day11/FlashTestPytorch/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | 5 | # Load the custom CUDA extension 6 | sources = ["binding.cpp", "FlashAttention.cu"] 7 | flash_attention = load("flash_attention", sources=sources, verbose=True) 8 | print("Custom CUDA extension loaded.") 9 | 10 | def manual_attention(Q, K, V): 11 | batch_size, num_heads, seq_len, head_dim = Q.shape 12 | 13 | attn_scores = torch.matmul(Q, K.transpose(-2, -1)) # [batch, heads, seq_len, 
seq_len] 14 | scale = 1.0 / (head_dim ** 0.5) 15 | attn_scores = attn_scores * scale 16 | attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1) 17 | output = torch.matmul(attn_weights, V) # [batch, heads, seq_len, head_dim] 18 | return output 19 | def test_flash_attention(): 20 | batch_size = 2 21 | num_heads = 4 22 | seq_len = 128 23 | head_dim = 64 24 | 25 | # Create random input tensors 26 | Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device='cuda') 27 | K = torch.randn_like(Q) 28 | V = torch.randn_like(Q) 29 | 30 | # Warmup runs 31 | for _ in range(3): 32 | _ = flash_attention.FlashAttention(Q, K, V) 33 | _ = manual_attention(Q, K, V) 34 | 35 | # Benchmark custom FlashAttention 36 | custom_times = [] 37 | for _ in range(100): 38 | torch.cuda.synchronize() 39 | start = time.time() 40 | _ = flash_attention.FlashAttention(Q, K, V) 41 | torch.cuda.synchronize() 42 | custom_times.append(time.time() - start) 43 | 44 | # Benchmark manual attention 45 | manual_times = [] 46 | for _ in range(100): 47 | torch.cuda.synchronize() 48 | start = time.time() 49 | _ = manual_attention(Q, K, V) 50 | torch.cuda.synchronize() 51 | manual_times.append(time.time() - start) 52 | 53 | # Get fastest iterations 54 | fastest_custom = min(custom_times) * 1000 # Convert to milliseconds 55 | fastest_manual = min(manual_times) * 1000 56 | 57 | # Print performance results 58 | print("\nPerformance results (fastest iteration):") 59 | print(f"Custom FlashAttention: {fastest_custom:.2f} ms") 60 | print(f"Manual PyTorch attention: {fastest_manual:.2f} ms") 61 | print(f"Speedup factor: {fastest_manual / fastest_custom:.2f}x") 62 | 63 | if __name__ == "__main__": 64 | test_flash_attention() -------------------------------------------------------------------------------- /day11/LeakyReLU.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void leakyreluKernel(float*input,float*output,float slope,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = input[index] < 0 ? input[index]*slope : input[index]; 8 | } 9 | 10 | void CudaLeakyReLU(float *A,float*B,float slope ,int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | leakyreluKernel<<>>(A, B,slope,N); 14 | } -------------------------------------------------------------------------------- /day11/ReLU.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void reluKernel(float*input,float*output,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = input[index] < 0 ? 0 : input[index]; 8 | } 9 | 10 | void CudaReLU(float *A,float*B, int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | reluKernel<<>>(A, B,N); 14 | } 15 | 16 | //========================= 17 | 18 | __global__ void reluKernelBackward(float *input, float *grad_input, float *grad_output, int N){ 19 | int index = blockDim.x * blockIdx.x + threadIdx.x; 20 | if(index < N) 21 | grad_input[index] = input[index] < 0 ? 
0 : grad_output[index]; 22 | } 23 | 24 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N){ 25 | int ThreadsPerBlock = 256; 26 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 27 | reluKernelBackward<<>>(A, Gi, Go, N); 28 | } -------------------------------------------------------------------------------- /day11/SoftMax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | inline int prevPow2(int n) { 4 | if (n == 0) return 0; 5 | int prev = 1; 6 | while (prev <= n/2) { 7 | prev *= 2; 8 | } 9 | return prev; 10 | } 11 | 12 | __global__ void softmaxKernel(float *input, float *output, int Dim) { 13 | int batch_idx = blockIdx.x; // Current batch index 14 | int tid = threadIdx.x; // Thread index within the block 15 | 16 | extern __shared__ float shared_data[]; 17 | float max_val = -INFINITY; 18 | for (int i = tid; i < Dim; i += blockDim.x) { 19 | max_val = fmaxf(max_val, input[batch_idx * Dim + i]); 20 | } 21 | 22 | shared_data[tid] = max_val; 23 | __syncthreads(); 24 | 25 | // Reduction for max_val 26 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 27 | if (tid < stride) { 28 | shared_data[tid] = fmaxf(shared_data[tid], shared_data[tid + stride]); 29 | } 30 | __syncthreads(); 31 | } 32 | max_val = shared_data[0]; 33 | 34 | float sum_exp = 0.0f; 35 | for (int i = tid; i < Dim; i += blockDim.x) { 36 | output[batch_idx * Dim + i] = expf(input[batch_idx * Dim + i] - max_val); 37 | sum_exp += output[batch_idx * Dim + i]; 38 | } 39 | 40 | shared_data[tid] = sum_exp; 41 | __syncthreads(); 42 | 43 | // Reduction for sum_exp 44 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 45 | if (tid < stride) { 46 | shared_data[tid] += shared_data[tid + stride]; 47 | } 48 | __syncthreads(); 49 | } 50 | sum_exp = shared_data[0]; 51 | 52 | for (int i = tid; i < Dim; i += blockDim.x) { 53 | output[batch_idx * Dim + i] /= sum_exp; 54 | } 55 | } 56 | 57 | 58 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) { 59 | int max_threads = min(512, Dim); 60 | int threads = prevPow2(max_threads); 61 | if (threads == 0) threads = 1; // Ensure at least 1 thread 62 | size_t shared_mem_size = threads * sizeof(float); 63 | softmaxKernel<<>>(input, output, Dim); 64 | cudaDeviceSynchronize(); // Ensure kernel completion 65 | } -------------------------------------------------------------------------------- /day11/TanH.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void tanhKernel(float*input,float*output,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = tanhf(input[index]); 8 | } 9 | 10 | void CudaTanH(float *A,float*B, int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | tanhKernel<<>>(A, B,N); 14 | } 15 | -------------------------------------------------------------------------------- /day11/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | 5 | void CudaLeakyReLU(float *A,float*B,float slope ,int N); 6 | torch::Tensor LeakyReLU(torch::Tensor A, float slope){ 7 | torch::Tensor B = torch::empty_like(A); 8 | int N = A.numel(); 9 | CudaLeakyReLU(A.data_ptr(),B.data_ptr(),slope,N); 10 | return B; 11 | } 12 | 13 | void CudaReLU(float *A,float*B, int N); 14 | torch::Tensor ReLU(torch::Tensor A){ 15 | torch::Tensor B = 
torch::empty_like(A); 16 | int N = A.numel(); 17 | CudaReLU(A.data_ptr(),B.data_ptr(),N); 18 | return B; 19 | } 20 | 21 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N); 22 | torch::Tensor ReLUBackward(torch::Tensor A, torch::Tensor Go){ 23 | torch::Tensor Gi = torch::empty_like(A); 24 | int N = A.numel(); 25 | CudaReLUBackward(A.data_ptr(),Gi.data_ptr(),Go.data_ptr(),N); 26 | return Go; 27 | } 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) ; 39 | torch::Tensor Softmax(torch::Tensor input) { 40 | int BatchSize = input.size(0); 41 | int Dim = input.size(1); 42 | torch::Tensor output = torch::empty_like(input); 43 | CudaSoftmax(input.data_ptr(), output.data_ptr(), BatchSize, Dim); 44 | return output; 45 | } 46 | 47 | void CudaTanH(float *A,float*B, int N); 48 | torch::Tensor TanH(torch::Tensor A){ 49 | torch::Tensor B = torch::empty_like(A); 50 | int N = A.numel(); 51 | CudaTanH(A.data_ptr(),B.data_ptr(),N); 52 | return B; 53 | } 54 | 55 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 56 | m.def("LeakyReLU", &LeakyReLU, "LeakyReLU (CUDA)"); 57 | m.def("ReLU", &ReLU, "ReLU (CUDA)"); 58 | m.def("ReLUBackward", &ReLUBackward, "ReLU (CUDA)"); 59 | m.def("Softmax", &Softmax, "Softmax (CUDA)"); 60 | m.def("TanH", &TanH, "TanH (CUDA)"); 61 | } -------------------------------------------------------------------------------- /day11/testbackward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | 4 | sources = ["binding.cpp", "ReLU.cu", "SoftMax.cu", "LeakyReLU.cu", "TanH.cu"] 5 | functions = load("functions", sources=sources, verbose=True) 6 | 7 | class CustomReLU(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return functions.ReLU(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad_output): 15 | input, = ctx.saved_tensors 16 | return functions.ReLUBackward(input, grad_output) 17 | 18 | x = torch.tensor([-1.0, -1.0, -1.0, -2.0], device='cuda', requires_grad=True) 19 | 20 | relu = CustomReLU.apply 21 | 22 | y_custom = relu(x) 23 | y_custom.sum().backward() 24 | grad_custom = x.grad.clone() 25 | 26 | x.grad.zero_() 27 | y_pytorch = torch.nn.functional.relu(x) 28 | y_pytorch.sum().backward() 29 | grad_pytorch = x.grad.clone() 30 | 31 | # Compare the gradients 32 | print("Custom ReLU Gradient:", grad_custom) 33 | print("PyTorch ReLU Gradient:", grad_pytorch) 34 | 35 | if torch.allclose(grad_custom, grad_pytorch, atol=1e-6): 36 | print("Gradients match!") 37 | else: 38 | print("Gradients do not match!") 39 | -------------------------------------------------------------------------------- /day12/tileMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define TILE_WIDTH 32 4 | 5 | __global__ void tileKernel(const float *dM,const float *dN,float *dP,const int Width){ 6 | __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; 7 | __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; 8 | 9 | int bx = blockIdx.x; 10 | int by = blockIdx.y; 11 | int tx = threadIdx.x; 12 | int ty = threadIdx.y; 13 | 14 | int row = by * TILE_WIDTH + ty; 15 | int col = bx * TILE_WIDTH + tx; 16 | 17 | float Pvalue = 0; 18 | for(int i = 0 ; i < TILE_WIDTH/Width ; ++i){ 19 | Mds[ty][tx] = dM[row*Width + i*TILE_WIDTH + tx]; 20 | Nds[ty][tx] = dN[(i*TILE_WIDTH + ty)*Width + col]; 21 | __syncthreads(); 22 | 23 | for(int k = 0 ;k 2 | 3 | 
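// Reference math for the kernels in this file: for a row x of length w,
// rms(x) = sqrt((1/w) * sum_i x_i^2) and out_i = x_i / rms(x).
// The helper below is an added CPU reference sketch (not part of the original
// kernels) for sanity-checking the GPU output; it assumes row-major (h, w)
// input and output.
#include <cmath>
static inline void rms_reference_cpu(const float *in, float *out, int w, int h)
{
    for (int r = 0; r < h; ++r)
    {
        float sum = 0.0f;
        for (int c = 0; c < w; ++c)
        {
            sum += in[r * w + c] * in[r * w + c];
        }
        float rms = std::sqrt(sum / (float)w);
        for (int c = 0; c < w; ++c)
        {
            out[r * w + c] = in[r * w + c] / rms;
        }
    }
}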
// Define the CEILING macro 4 | #define CEILING(x, y) (((x) + (y) - 1) / (y)) 5 | 6 | #define blockdimy 128 7 | 8 | __global__ void RMSKernel1_V1(float *input, float *output, const int w, const int h) 9 | { 10 | int col = blockIdx.x * blockDim.x + threadIdx.x; 11 | int row = blockIdx.y * blockDim.y + threadIdx.y; 12 | 13 | if (row < h && col < w) 14 | { 15 | float sum = 0; 16 | for (int i = 0; i < w; ++i) 17 | { 18 | sum += input[row * w + i] * input[row * w + i]; 19 | } 20 | sum = sqrt((float)1 / w * sum); 21 | 22 | output[row + w * col] = input[row * w + col] / sum; 23 | } 24 | } 25 | 26 | 27 | void RMSV1(float *input, float *output, int w, int h) 28 | { 29 | 30 | dim3 block_size = dim3(32, 32); 31 | dim3 grid_size = dim3(CEILING(w, 32), CEILING(32, h)); 32 | RMSKernel1_V1<<>>(input, output, w, h); 33 | } 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /day13/RMSBetter.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define CEILING(x, y) (((x) + (y) - 1) / (y)) 4 | 5 | #define blockdimy 128 6 | 7 | __device__ float warpReduceSum(float val) 8 | { 9 | for (int offset = 16; offset > 0; offset /= 2) 10 | { 11 | val += __shfl_down_sync(0xffffffff, val, offset, 32); 12 | } 13 | return val; 14 | } 15 | 16 | __global__ void RMSKernel_V2(float *input, float *output, const int w, const int h) 17 | { 18 | int row = blockIdx.x * blockDim.x + threadIdx.x; 19 | int col = blockIdx.y * blockDim.y + threadIdx.y; 20 | 21 | __shared__ float shared_data[32]; 22 | 23 | float sum = 0.0f; 24 | 25 | if (row < h && col < w) 26 | { 27 | float4 val = reinterpret_cast(&input[row * w + col * 4])[0]; 28 | sum += val.x * val.x + val.y * val.y + val.z * val.z + val.w * val.w; 29 | } 30 | __syncthreads(); 31 | 32 | sum = warpReduceSum(sum); 33 | 34 | __syncthreads(); 35 | 36 | if (threadIdx.x % 32 == 0) 37 | { 38 | shared_data[threadIdx.x / 32] = sum; 39 | } 40 | 41 | __syncthreads(); 42 | 43 | if (threadIdx.x == 0) 44 | { 45 | float final_sum = 0.0f; 46 | for (int i = 0; i < blockDim.x / 32; ++i) 47 | { 48 | final_sum += shared_data[i]; 49 | } 50 | output[row] = input[row] / sqrt(final_sum / float(w)); 51 | } 52 | } 53 | 54 | void RMSV2(float *input, float *output, int w, int h) 55 | { 56 | dim3 block_size = dim3(1, 32, 1); 57 | dim3 grid_size = dim3(h, 1, 1); 58 | RMSKernel_V2<<>>(input, output, w, h); 59 | } 60 | -------------------------------------------------------------------------------- /day13/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void RMSV1(float *input, float *output, int w, int h); 5 | 6 | torch::Tensor RMS_V1(torch::Tensor input) 7 | { 8 | auto out = torch::empty_like(input); 9 | int h = input.size(0); 10 | int w = input.size(1); 11 | RMSV1(input.data_ptr(), out.data_ptr(), w, h); 12 | return out; 13 | } 14 | 15 | void RMSV2(float *input, float *output, int w, int h); 16 | torch::Tensor RMS_V2(torch::Tensor input) 17 | { 18 | auto out = torch::empty_like(input); 19 | int h = input.size(0); 20 | int w = input.size(1); 21 | RMSV1(input.data_ptr(), out.data_ptr(), w, h); 22 | return out; 23 | } 24 | 25 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 26 | { 27 | m.def("RMSV1", &RMS_V1, "RMSV1 (CUDA)"); 28 | m.def("RMSV2", &RMS_V2, "RMSV2 (CUDA)"); 29 | } -------------------------------------------------------------------------------- /day13/test.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | from liger_kernel.ops import rms_norm 5 | 6 | def rms_norm(tensor): 7 | return tensor / torch.sqrt(torch.mean(tensor ** 2)) 8 | 9 | sources = ["binding.cpp", "RMS.cu", "RMSBetter.cu"] 10 | RMS = load("RMS", sources=sources, verbose=True) 11 | print("Custom CUDA extension loaded.") 12 | 13 | tensor_sizes = [(1024, 1024), (2048, 2048), (4096, 4096), (8192, 8192)] 14 | 15 | for tensor_size in tensor_sizes: 16 | print("=" * 50) 17 | print("Input Size: ", tensor_size) 18 | print("=" * 50) 19 | input_tensor = torch.randn(tensor_size, device='cuda') 20 | 21 | # PyTorch RMS time and result 22 | pytorch_time = 0 23 | result_pytorch = None 24 | for _ in range(5): 25 | start_time = time.time() 26 | result_pytorch = rms_norm(input_tensor) 27 | pytorch_time += time.time() - start_time 28 | print(f"PyTorch RMS time: {pytorch_time / 6:.6f} seconds") 29 | 30 | # Custom kernel time and result 31 | custom_time = 0 32 | result_custom = None 33 | for _ in range(5): 34 | start_time = time.time() 35 | result_custom = RMS.RMSV2(input_tensor) 36 | custom_time += time.time() - start_time 37 | print(f"Custom kernel time: {custom_time / 6:.6f} seconds") 38 | 39 | # Liger kernel time and result 40 | liger_time = 0 41 | result_liger = None 42 | for _ in range(5): 43 | start_time = time.time() 44 | result_liger = rms_norm(input_tensor) 45 | liger_time += time.time() - start_time 46 | print(f"Liger kernel time: {liger_time / 6:.6f} seconds") 47 | 48 | # Checking if the results are the same 49 | pytorch_custom_diff = torch.max(torch.abs(result_pytorch - result_custom)) 50 | pytorch_liger_diff = torch.max(torch.abs(result_pytorch - result_liger)) 51 | 52 | print(f"Max difference between PyTorch and Custom kernel: {pytorch_custom_diff.item():.6f}") 53 | print(f"Max difference between PyTorch and Liger kernel: {pytorch_liger_diff.item():.6f}") 54 | 55 | # Check if they are numerically close (within tolerance) 56 | are_pytorch_custom_close = torch.allclose(result_pytorch, result_custom, atol=1) # You can adjust the tolerance 57 | are_pytorch_liger_close = torch.allclose(result_pytorch, result_liger, atol=1) # You can adjust the tolerance 58 | 59 | if are_pytorch_custom_close: 60 | print("PyTorch and Custom kernel results are the same!") 61 | else: 62 | print("PyTorch and Custom kernel results are different.") 63 | 64 | if are_pytorch_liger_close: 65 | print("PyTorch and Liger kernel results are the same!") 66 | else: 67 | print("PyTorch and Liger kernel results are different.") 68 | 69 | print("=" * 50 + "\n") 70 | -------------------------------------------------------------------------------- /day14/FA2/helper.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "helper.cuh" 4 | 5 | __device__ float warpReduceMax(float val) { 6 | for (int offset = 16; offset > 0; offset /= 2) { 7 | val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset)); 8 | } 9 | return val; 10 | } -------------------------------------------------------------------------------- /day14/FA2/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_CUH 2 | #define HELPER_CUH 3 | 4 | __device__ float warpReduceMax(float val); 5 | 6 | #endif -------------------------------------------------------------------------------- /day14/FA2/kernels.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef KERNELS_CUH 2 | #define KERNELS_CUH 3 | 4 | __global__ void computeDKernel(const float* dO, const float* O, float* D, int N, int d); 5 | 6 | __global__ void computeSiKernel(const float* Qi, const float* Kj, float* Si, int Br, int Bc, int d, float scale); 7 | 8 | __global__ void findRowMaxSiKernel(float* Si, float* maxSi, int Br, int Bc); 9 | 10 | __global__ void computeSoftmaxKernel(float* Si, float* softmaxSi, int Br, int Bc); 11 | 12 | __global__ void computeAttentionKernel(const float* Q, const float* K, const float* V, float* attention, int N, int d); 13 | 14 | __global__ void computeQKernel(const float* Q, const float* dO, float* dQ, int N, int d); 15 | 16 | __global__ void computeKKernel(const float* K, const float* dO, float* dK, int N, int d); 17 | 18 | __global__ void computeVKernel(const float* V, const float* dO, float* dV, int N, int d); 19 | 20 | __global__ void computeGradientsKernel(const float* dO, float* dQ, float* dK, float* dV, int N, int d); 21 | 22 | #endif -------------------------------------------------------------------------------- /day14/FlashAttention2/kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void baackwardKernel(float *Q, float *K, float *V, float *O, 5 | float *dQ, float *dK, float *dV, float *dO, 6 | float *L, int Bc, int Br, 7 | int batch_size, int N, int nr_heads, int d) 8 | { 9 | int Tr = ceil(N / Br); 10 | int Tc = ceil(N / Bc); 11 | 12 | // Q1 - > size of Br* d size in shared memory 13 | // O1 - > size of Br* d size in shared memory 14 | 15 | // K1 - > size of Bc *d size in shared memory 16 | // V1 - > size of Bc *d size in shared memory 17 | 18 | // L - > size of Br each 19 | 20 | int row = blockIdx.y * blockDim.y + threadIdx.y; 21 | int col = blockIdx.x * blockDim.x + threadIdx.x; 22 | 23 | 24 | } -------------------------------------------------------------------------------- /day14/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day14/cat.jpg -------------------------------------------------------------------------------- /day15/SMM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void spmv_csr_kernel(int num_rows, const float *values, const int *column_indices, const int *row_offsets, const float *x, float *y) { 6 | int row = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (row < num_rows) { 8 | float dot = 0; 9 | for (int i = row_offsets[row]; i < row_offsets[row + 1]; i++) { 10 | dot += values[i] * x[column_indices[i]]; 11 | } 12 | y[row] = dot; 13 | } 14 | } 15 | 16 | void spmv_csr(int num_rows, int nnz, float *h_values, int *h_column_indices, int *h_row_offsets, float *h_x, float *h_y) { 17 | float *d_values; 18 | float*d_x; 19 | float *d_y; 20 | int *d_column_indices; 21 | int *d_row_offsets; 22 | 23 | cudaMalloc(&d_values, nnz * sizeof(float)); 24 | cudaMalloc(&d_column_indices, nnz * sizeof(int)); 25 | cudaMalloc(&d_row_offsets, (num_rows + 1) * sizeof(int)); 26 | cudaMalloc(&d_x, num_rows * sizeof(float)); 27 | cudaMalloc(&d_y, num_rows * sizeof(float)); 28 | 29 | cudaMemcpy(d_values, h_values, nnz * sizeof(float), cudaMemcpyHostToDevice); 30 | cudaMemcpy(d_column_indices, h_column_indices, nnz * sizeof(int), cudaMemcpyHostToDevice); 31 | 
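// CSR layout note: values/column_indices hold the nnz nonzeros in row order,
// and row_offsets has num_rows + 1 prefix-sum entries (hence the +1 sizes
// above and below); e.g. in main() further down, row_offsets {0, 1, 3, 4}
// means row 0 holds 1 nonzero, row 1 holds 2, and row 2 holds 1.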
cudaMemcpy(d_row_offsets, h_row_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice); 32 | cudaMemcpy(d_x, h_x, num_rows * sizeof(float), cudaMemcpyHostToDevice); 33 | 34 | int blockSize = 256; 35 | int gridSize = (num_rows + blockSize - 1) / blockSize; 36 | spmv_csr_kernel<<>>(num_rows, d_values, d_column_indices, d_row_offsets, d_x, d_y); 37 | 38 | cudaMemcpy(h_y, d_y, num_rows * sizeof(float), cudaMemcpyDeviceToHost); 39 | 40 | cudaFree(d_values); 41 | cudaFree(d_column_indices); 42 | cudaFree(d_row_offsets); 43 | cudaFree(d_x); 44 | cudaFree(d_y); 45 | } 46 | 47 | int main() { 48 | int num_rows = 3; 49 | int nnz = 4; 50 | float values[] = {1, 2, 3, 4}; 51 | int column_indices[] = {0, 2, 1, 2}; 52 | int row_offsets[] = {0, 1, 3, 4}; 53 | float x[] = {1, 2, 3}; 54 | float y[3] = {0}; 55 | 56 | spmv_csr(num_rows, nnz, values, column_indices, row_offsets, x, y); 57 | 58 | std::cout << "Rezultat SpMV: "; 59 | for (int i = 0; i < num_rows; i++) { 60 | std::cout << y[i] << " "; 61 | } 62 | std::cout << std::endl; 63 | return 0; 64 | } -------------------------------------------------------------------------------- /day16/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | # Set the seed for reproducibility 5 | torch.manual_seed(42) 6 | 7 | # Define the dimensions 8 | seq_len = 4 9 | dim = 4 10 | 11 | # Initialize the tensors 12 | Q = torch.full((seq_len, dim), 2.0, requires_grad=True) 13 | K = torch.full((seq_len, dim), 2.0, requires_grad=True) 14 | V = torch.full((seq_len, dim), 2.0, requires_grad=True) 15 | # Forward pass 16 | scores = torch.matmul(Q, K.transpose(-2, -1)) / (dim ** 0.5) 17 | P = F.softmax(scores, dim=-1) 18 | O = torch.matmul(P, V) 19 | 20 | # Create a dummy gradient for the output 21 | dO = torch.ones_like(O) 22 | 23 | # Backward pass 24 | O.backward(dO) 25 | 26 | 27 | print("PyTorch O:") 28 | print(O) 29 | # Print the gradients 30 | print("PyTorch dQ:") 31 | print(Q.grad) 32 | print("PyTorch dK:") 33 | print(K.grad) 34 | print("PyTorch dV:") 35 | print(V.grad) -------------------------------------------------------------------------------- /day17/cublas1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | #define n 6 7 | 8 | int main(){ 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x; 15 | x = (float*)malloc(sizeof(float)*n); 16 | for(j = 0 ; j 2 | #include 3 | #include 4 | 5 | #define n 10 6 | 7 | int main() 8 | { 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x, *y; 15 | x = (float *)malloc(sizeof(float) * n); 16 | y = (float *)malloc(sizeof(float) * n); 17 | 18 | for (j = 0; j < n; ++j) 19 | { 20 | x[j] = (float)j; 21 | y[j] = (float)j + 1; 22 | } 23 | 24 | printf("\nx:\n"); 25 | for (j = 0; j < n; ++j) 26 | { 27 | printf("%f ", x[j]); 28 | } 29 | 30 | printf("\ny:\n"); 31 | for (j = 0; j < n; ++j) 32 | { 33 | printf("%f ", y[j]); 34 | } 35 | 36 | float *d_x, *d_y; 37 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float)); 38 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float)); 39 | 40 | stat = cublasCreate(&handle); 41 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1); 42 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1); 43 | float a = 3.0; 44 | 45 | stat = cublasSaxpy(handle, n, &a, d_x, 1, d_y, 1); 46 | stat = cublasGetVector(n, 
sizeof(float), d_y, 1, y, 1); 47 | 48 | printf("\nNew y:\n"); 49 | for (j = 0; j < n; ++j) 50 | { 51 | printf("%f ", y[j]); 52 | } 53 | cudaFree(d_y); 54 | cudaFree(d_x); 55 | cublasDestroy(handle); 56 | free(x); 57 | free(y); 58 | return 0; 59 | } -------------------------------------------------------------------------------- /day17/cublas3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define n 10 6 | 7 | int main() 8 | { 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x, *y; 15 | x = (float *)malloc(sizeof(float) * n); 16 | y = (float *)malloc(sizeof(float) * n); 17 | for (j = 0; j < n; ++j) 18 | { 19 | x[j] = (float)1; 20 | y[j] = (float)1; 21 | } 22 | 23 | printf("\nx:\n"); 24 | for (j = 0; j < n; ++j) 25 | { 26 | printf("%f ", x[j]); 27 | } 28 | 29 | printf("\ny:\n"); 30 | for (j = 0; j < n; ++j) 31 | { 32 | printf("%f ", y[j]); 33 | } 34 | 35 | float *d_x, *d_y; 36 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float)); 37 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float)); 38 | 39 | stat = cublasCreate(&handle); 40 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1); 41 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1); 42 | float a = 3.0; 43 | 44 | float result; 45 | stat = cublasSdot(handle, n, d_x, 1, d_y, 1, &result); 46 | printf("\ndot product x . y : \n "); 47 | printf (" %7.0f \n " , result ); 48 | 49 | 50 | cudaFree(d_y); 51 | cudaFree(d_x); 52 | cublasDestroy(handle); 53 | free(x); 54 | free(y); 55 | return 0; 56 | } -------------------------------------------------------------------------------- /day18/atomic1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #define N 32 3 | #include 4 | __device__ int lane_id() { 5 | return threadIdx.x & 31; 6 | } 7 | 8 | __device__ int atomicAggInc(int *ptr) { 9 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr); 10 | int leader = __ffs(mask) - 1; 11 | int res; 12 | if (lane_id() == leader) 13 | res = atomicAdd(ptr, __popc(mask)); 14 | res = __shfl_sync(mask, res, leader); 15 | return res + __popc(mask & ((1 << lane_id()) - 1)); 16 | } 17 | 18 | __global__ void test_atomicAggInc(int *d_ptr, int *d_results) { 19 | int old_val = atomicAggInc(d_ptr); 20 | d_results[threadIdx.x] = old_val; 21 | } 22 | 23 | int main() { 24 | int *d_ptr, *d_results; 25 | int h_ptr = 0; 26 | int h_results[N]; 27 | 28 | cudaMalloc(&d_ptr, sizeof(int)); 29 | cudaMalloc(&d_results, N * sizeof(int)); 30 | 31 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice); 32 | 33 | test_atomicAggInc<<<1, N>>>(d_ptr, d_results); 34 | 35 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); 36 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost); 37 | 38 | printf("Final value of ptr: %d\n", h_ptr); 39 | printf("Old values returned by each thread:\n"); 40 | for (int i = 0; i < N; i++) { 41 | printf("Thread %2d -> %d\n", i, h_results[i]); 42 | } 43 | 44 | cudaFree(d_ptr); 45 | cudaFree(d_results); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /day18/atomic2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 320 5 | 6 | __device__ int lane_id(){ 7 | return threadIdx.x & 31; 8 | } 9 | 10 | // incremenets threads by pointers 11 | __device__ int 
atomicIncrement(int * ptr){ 12 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr); 13 | int leader = __ffs(mask) -1; 14 | int res; 15 | if(lane_id() == leader){ 16 | res = atomicAdd(ptr,__popc(mask)); // add on ptr number of active threads 17 | } 18 | __shfl_sync(mask,res,leader); 19 | return *ptr; 20 | } 21 | 22 | __global__ void testatomicIncrement(int *d_ptr, int *d_results){ 23 | int val = atomicIncrement(d_ptr); 24 | d_results[threadIdx.x] = val; 25 | } 26 | 27 | 28 | 29 | int main() { 30 | int *d_ptr, *d_results; 31 | int h_ptr = 100; 32 | int h_results[N]; 33 | 34 | cudaMalloc(&d_ptr, sizeof(int)); 35 | cudaMalloc(&d_results, N * sizeof(int)); 36 | 37 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice); 38 | 39 | testatomicIncrement<<<1, N>>>(d_ptr, d_results); 40 | 41 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); 42 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost); 43 | 44 | printf("Final value of ptr: %d\n", h_ptr); 45 | printf("Old values returned by each thread:\n"); 46 | for (int i = 0; i < N; i++) { 47 | printf("Thread %2d -> %d\n", i, h_results[i]); 48 | } 49 | 50 | cudaFree(d_ptr); 51 | cudaFree(d_results); 52 | 53 | return 0; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /day18/wrap.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | __device__ int lane_id() { 7 | return threadIdx.x & 31; 8 | } 9 | 10 | __device__ float reduceMax(float val) { 11 | // threads of 32 we perform reduction on them 12 | for (int offset = 16; offset > 0; offset /= 2) { 13 | float temp = __shfl_xor_sync(0xFFFFFFFF, val, offset); 14 | val = fmaxf(val, temp); 15 | } 16 | return val; 17 | } 18 | 19 | __device__ float atomicMaxFloat(float *addr, float value) { 20 | // give the adres of input 21 | // save old adress 22 | int *addr_as_int = (int*)addr; 23 | int old = *addr_as_int; 24 | int assumed; 25 | do { 26 | assumed = old; 27 | float old_val = __int_as_float(assumed); 28 | if (old_val >= value) { 29 | return old_val; 30 | } 31 | old = atomicCAS(addr_as_int, assumed, __float_as_int(fmaxf(old_val, value))); 32 | } while (assumed != old); 33 | return __int_as_float(old); 34 | } 35 | 36 | __global__ void MaxValue(float *data, float *max_value, int N) { 37 | int tx = threadIdx.x; 38 | int bx = blockIdx.x; 39 | 40 | extern __shared__ float reduction[]; 41 | 42 | float block_max = -INFINITY; 43 | 44 | for (int i = bx * blockDim.x + tx; i < N; i += gridDim.x * blockDim.x) { 45 | block_max = fmaxf(block_max, data[i]); 46 | } 47 | 48 | block_max = reduceMax(block_max); 49 | 50 | reduction[tx] = block_max; 51 | __syncthreads(); 52 | 53 | if (tx == 0) { 54 | float final_max = -INFINITY; 55 | for (int i = 0; i < blockDim.x; ++i) { 56 | final_max = fmaxf(final_max, reduction[i]); 57 | } 58 | atomicMaxFloat(max_value, final_max); 59 | } 60 | } 61 | 62 | int main() { 63 | int N = 1024; 64 | float *host_data = (float*)malloc(N * sizeof(float)); 65 | float host_result = -INFINITY; 66 | 67 | for (int i = 0; i < N; ++i) { 68 | host_data[i] = rand()%10000; 69 | if (host_data[i] > host_result) { 70 | host_result = host_data[i]; 71 | } 72 | } 73 | 74 | float *device_data, *device_result; 75 | cudaMalloc(&device_data, N * sizeof(float)); 76 | cudaMalloc(&device_result, sizeof(float)); 77 | 78 | cudaMemcpy(device_data, host_data, N * sizeof(float), cudaMemcpyHostToDevice); 79 | cudaMemcpy(device_result, 
&host_result, sizeof(float), cudaMemcpyHostToDevice); 80 | 81 | int threadsPerBlock = 256; 82 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 83 | 84 | MaxValue<<>>(device_data, device_result, N); 85 | 86 | cudaMemcpy(&host_result, device_result, sizeof(float), cudaMemcpyDeviceToHost); 87 | 88 | std::cout << "Max value: " << host_result << std::endl; 89 | 90 | free(host_data); 91 | cudaFree(device_data); 92 | cudaFree(device_result); 93 | 94 | return 0; 95 | } -------------------------------------------------------------------------------- /day20/rope.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define BLOCK_SIZE 256 7 | #define theta 10000.0f 8 | #define STRINGFY(str) #str 9 | #define TORCH_BINDING_COMMON_EXTENSION(func) \ 10 | m.def(STRINGFY(func), &func, STRINGFY(func)); 11 | 12 | __global__ void rope_kernel(float* x, float* out, int N){ 13 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | float4 x_v = reinterpret_cast(&(x[idx * 4]))[0]; 15 | 16 | int token_pos = idx / N; 17 | int token_idx = idx % N; 18 | 19 | float exp_f_v = 1.0f / powf(theta, token_idx * 2 / (N * 4)); 20 | float exp_s_v = 1.0f / powf(theta, ((token_idx * 2) + 1) / (N * 4)); 21 | 22 | float sin_f_v = sinf(token_pos / exp_f_v); 23 | float cos_f_v = cosf(token_pos / exp_f_v); 24 | 25 | float sin_s_v = sinf(token_pos / exp_s_v); 26 | float cos_s_v = cosf(token_pos / exp_s_v); 27 | float4 out_v; 28 | 29 | out_v.x = x_v.x * cos_f_v - x_v.y * sin_f_v; 30 | out_v.y = x_v.x * sin_f_v + x_v.y * cos_f_v; 31 | out_v.z = x_v.z * cos_s_v - x_v.w * sin_s_v; 32 | out_v.w = x_v.z * sin_s_v + x_v.w * cos_s_v; 33 | 34 | reinterpret_cast(&(out[idx * 4]))[0] = out_v; 35 | } 36 | 37 | void rope(torch::Tensor x, torch::Tensor out) { 38 | int seq_len = x.size(0); 39 | int hidden_size = x.size(1); 40 | 41 | int N = (int)(hidden_size/4); 42 | 43 | dim3 grid((seq_len * N + BLOCK_SIZE - 1) / BLOCK_SIZE); 44 | dim3 block(BLOCK_SIZE); 45 | 46 | rope_kernel<<>>(x.data_ptr(), out.data_ptr(), N); 47 | } 48 | 49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 50 | TORCH_BINDING_COMMON_EXTENSION(rope) 51 | } -------------------------------------------------------------------------------- /day20/test_rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from torch.utils.cpp_extension import load 4 | print(torch.__version__) # Verifică versiunea PyTorch 5 | print(torch.cuda.is_available()) # Dacă e False, PyTorch nu vede CUDA 6 | print(torch.version.cuda) # Verifică versiunea CUDA detectată de 7 | lib = load( 8 | name="rope", 9 | sources=["rope.cu"], 10 | extra_cuda_cflags=[ "-O3", 11 | "--use_fast_math", 12 | ], 13 | extra_cflags=["-std=c++17"], 14 | ) 15 | 16 | def benchmark(func, x, out=None, iters=20): 17 | torch.cuda.synchronize() 18 | start = time.time() 19 | for _ in range(iters): 20 | if out is not None: 21 | func(x, out) 22 | else: 23 | _ = func(x) 24 | torch.cuda.synchronize() 25 | return (time.time() - start) * 1000 / iters 26 | 27 | def naive_rope(x, theta=10000.0): 28 | dim = x.shape[-1] 29 | seq_len = x.shape[-2] 30 | x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) 31 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim)).cuda() 32 | freqs = torch.outer(torch.arange(seq_len, device='cuda'), freqs) 33 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) 34 | return torch.view_as_real(x_ * 
freqs_cis).flatten(1).type_as(x) 35 | 36 | sizes = [(4096, 512), (4096, 1024), (8192, 512), (8192, 1024)] 37 | for M, N in sizes: 38 | print(f"Testing M={M}, N={N}") 39 | x = torch.randn((M, N), device='cuda', dtype=torch.float32).contiguous() 40 | out = torch.zeros_like(x) 41 | 42 | t_naive = benchmark(naive_rope, x) 43 | naive_out = naive_rope(x) 44 | 45 | t_cuda = benchmark(lib.rope, x, out) 46 | 47 | # Compute the maximum absolute difference 48 | max_diff = torch.max(torch.abs(naive_out - out)).item() 49 | 50 | print(f"Naive: {t_naive:.4f}ms, CUDA f32: {t_cuda:.4f}ms") 51 | print(f"Max difference: {max_diff:.6f}") 52 | print("-" * 60) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /day23/kernel.ptx: -------------------------------------------------------------------------------- 1 | .version 6.0 2 | .target sm_50 3 | .address_size 64 4 | 5 | .visible .entry vectorAdd( 6 | .param .u64 param_A, 7 | .param .u64 param_B, 8 | .param .u64 param_C, 9 | .param .u32 param_N 10 | ) 11 | { 12 | .reg .pred %p<2>; 13 | .reg .s32 %r<6>; 14 | .reg .f32 %f<4>; 15 | .reg .u64 %rd<10>; 16 | 17 | ld.param.u64 %rd1, [param_A]; 18 | ld.param.u64 %rd2, [param_B]; 19 | ld.param.u64 %rd3, [param_C]; 20 | ld.param.u32 %r1, [param_N]; 21 | 22 | mov.u32 %r2, %tid.x; 23 | mov.u32 %r3, %ctaid.x; 24 | mov.u32 %r4, %ntid.x; 25 | mad.lo.s32 %r5, %r3, %r4, %r2; 26 | 27 | setp.ge.s32 %p1, %r5, %r1; 28 | @%p1 bra EXIT; 29 | 30 | cvt.u64.s32 %rd4, %r5; 31 | mul.wide.s32 %rd5, %r5, 4; 32 | add.u64 %rd6, %rd1, %rd5; 33 | add.u64 %rd7, %rd2, %rd5; 34 | add.u64 %rd8, %rd3, %rd5; 35 | 36 | ld.global.f32 %f1, [%rd6]; 37 | ld.global.f32 %f2, [%rd7]; 38 | 39 | add.f32 %f3, %f1, %f2; 40 | 41 | st.global.f32 [%rd8], %f3; 42 | 43 | EXIT: 44 | ret; 45 | } 46 | -------------------------------------------------------------------------------- /day24/GeGLU.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | 3 | __global__ void GLUKernel(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) { 4 | int row = blockIdx.x * blockDim.x + threadIdx.x; 5 | int col = blockIdx.y * blockDim.y + threadIdx.y; 6 | 7 | if (row < M && col < K) { 8 | float sum1 = b[col]; 9 | float sum2 = c[col]; 10 | 11 | for (int i = 0; i < N; i++) { 12 | sum1 += x[row * N + i] * W[i * K + col]; 13 | sum2 += x[row * N + i] * V[i * K + col]; 14 | } 15 | 16 | float gate = 1.0f / (1.0f + expf(-sum1)); 17 | out[row * K + col] = gate * sum2; 18 | } 19 | } 20 | 21 | extern "C" void launchGLU(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) { 22 | dim3 blockSize(16, 16); 23 | dim3 gridSize((M + 15) / 16, (K + 15) / 16); 24 | 25 | GLUKernel<<>>(x, W, V, b, c, out, M, N, K); 26 | cudaDeviceSynchronize(); 27 | } 28 | -------------------------------------------------------------------------------- /day26/gradientdescent.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day26/gradientdescent.out -------------------------------------------------------------------------------- /day27/kmeans.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day27/kmeans.out -------------------------------------------------------------------------------- /day28/sample.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void update_x_kernel( 6 | float *x, const float *noise, const float *predicted_noise, 7 | float sqrt_alpha, float sqrt_alpha_hat, float beta, float alpha, 8 | int numel) 9 | { 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | if (idx < numel) 12 | { 13 | x[idx] = (1.0f / sqrt_alpha) *(x[idx] - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise[idx]) +sqrt(beta) * noise[idx]; 14 | } 15 | } 16 | 17 | torch::Tensor update_x(torch::Tensor x, torch::Tensor noise, torch::Tensor predicted_noise, 18 | torch::Tensor sqrt_alpha, torch::Tensor sqrt_alpha_hat, 19 | torch::Tensor beta, torch::Tensor alpha) 20 | { 21 | int numel = x.numel(); 22 | float sqrt_alpha_val = sqrt_alpha.item(); 23 | float sqrt_alpha_hat_val = sqrt_alpha_hat.item(); 24 | float beta_val = beta.item(); 25 | float alpha_val = alpha.item(); 26 | 27 | const int threads = 1024; 28 | const int blocks = (numel + threads - 1) / threads; 29 | 30 | update_x_kernel<<>>( 31 | x.data_ptr(), noise.data_ptr(), predicted_noise.data_ptr(), 32 | sqrt_alpha_val, sqrt_alpha_hat_val, beta_val, alpha_val, numel); 33 | 34 | cudaError_t err = cudaGetLastError(); 35 | if (err != cudaSuccess) { 36 | printf("CUDA error: %s\n", cudaGetErrorString(err)); 37 | throw std::runtime_error(cudaGetErrorString(err)); 38 | } 39 | 40 | cudaDeviceSynchronize(); 41 | 42 | return x; 43 | } 44 | 45 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 46 | { 47 | m.def("update_x", &update_x, "CUDA kernel for updating x"); 48 | } 49 | -------------------------------------------------------------------------------- /day28/test_sample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | 5 | lib = load( 6 | name="update_x", 7 | sources=["sample.cu"], 8 | extra_cuda_cflags=[ "-O3", 9 | "--use_fast_math", 10 | ], 11 | extra_cflags=["-std=c++17"], 12 | ) 13 | 14 | 15 | print("Loaded ") 16 | 17 | size = 10**6 18 | device = "cuda" 19 | 20 | x = torch.randn(size, device=device) 21 | noise = torch.randn(size, device=device) 22 | predicted_noise = torch.randn(size, device=device) 23 | alpha = torch.tensor(0.9, device=device) 24 | beta = torch.tensor(0.1, device=device) 25 | alpha_hat = torch.tensor(0.81, device=device) 26 | 27 | sqrt_alpha = torch.sqrt(alpha) 28 | sqrt_alpha_hat = torch.sqrt(1 - alpha_hat) 29 | 30 | torch.cuda.synchronize() 31 | start = time.time() 32 | x_cuda = lib.update_x(x.clone(), noise, predicted_noise, sqrt_alpha, sqrt_alpha_hat, beta, alpha) 33 | torch.cuda.synchronize() 34 | time_cuda = time.time() - start 35 | 36 | torch.cuda.synchronize() 37 | start = time.time() 38 | x_torch = 1 / sqrt_alpha * (x - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise) + torch.sqrt(beta) * noise 39 | torch.cuda.synchronize() 40 | time_torch = time.time() - start 41 | 42 | print(f"CUDA Kernel Time: {time_cuda:.6f}s") 43 | print(f"PyTorch Time: {time_torch:.6f}s") 44 | print(f"Speedup: {time_torch / time_cuda:.2f}x") 45 | -------------------------------------------------------------------------------- /day29/pi.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __device__ float randomFloat(unsigned int *seed) { 6 | *seed = (*seed * 1664525u + 1013904223u); 7 | return (float)(*seed & 0x00FFFFFF) / (float)0x01000000; 8 | } 9 | 10 | __global__ void 
monteCarloPi(int iterations, unsigned long long *d_count) { 11 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 12 | unsigned int seed = tid; 13 | unsigned int local_count = 0; 14 | 15 | for (int i = 0; i < iterations; i++) { 16 | float x = randomFloat(&seed); 17 | float y = randomFloat(&seed); 18 | if (x * x + y * y <= 1.0f) 19 | local_count++; 20 | } 21 | 22 | atomicAdd(d_count, (unsigned long long)local_count); 23 | } 24 | 25 | int main() { 26 | int iterations = 10000; 27 | int threadsPerBlock = 256; 28 | int blocks = 256; 29 | 30 | unsigned long long totalPoints = (unsigned long long)iterations * threadsPerBlock * blocks; 31 | 32 | unsigned long long host_count = 0; 33 | unsigned long long *d_count; 34 | cudaMalloc((void**)&d_count, sizeof(unsigned long long)); 35 | cudaMemset(d_count, 0, sizeof(unsigned long long)); 36 | 37 | monteCarloPi<<>>(iterations, d_count); 38 | cudaDeviceSynchronize(); 39 | 40 | cudaMemcpy(&host_count, d_count, sizeof(unsigned long long), cudaMemcpyDeviceToHost); 41 | 42 | float pi = 4.0f * (float)host_count / (float)totalPoints; 43 | printf("Estimated Pi = %f\n", pi); 44 | 45 | cudaFree(d_count); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /day30/kernelHisto.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define BLOCK_SIZE 16 // 16x16 thread block 5 | #define HIST_SIZE 256 // Grayscale histogram bins 6 | 7 | __global__ void histogram_equalization(unsigned char *d_img, unsigned char *d_out, int width, int height) { 8 | __shared__ unsigned int hist_shared[HIST_SIZE]; // Shared memory for histogram 9 | __shared__ float cdf_shared[HIST_SIZE]; // Shared memory for CDF 10 | 11 | int tx = threadIdx.x, ty = threadIdx.y; 12 | int x = blockIdx.x * blockDim.x + tx; 13 | int y = blockIdx.y * blockDim.y + ty; 14 | 15 | int index = y * width + x; 16 | 17 | // Initialize shared histogram 18 | if (tx < HIST_SIZE / BLOCK_SIZE && ty == 0) { 19 | hist_shared[tx * BLOCK_SIZE] = 0; 20 | } 21 | __syncthreads(); 22 | 23 | // First pass: compute local histogram using atomic operations 24 | if (x < width && y < height) { 25 | atomicAdd(&hist_shared[d_img[index]], 1); 26 | } 27 | __syncthreads(); 28 | 29 | // Merge local histograms into global memory 30 | __shared__ unsigned int hist_global[HIST_SIZE]; 31 | if (tx == 0 && ty == 0) { 32 | for (int i = 0; i < HIST_SIZE; i++) { 33 | atomicAdd(&hist_global[i], hist_shared[i]); 34 | } 35 | } 36 | __syncthreads(); 37 | 38 | // Compute CDF (Cumulative Distribution Function) 39 | if (tx == 0 && ty == 0) { 40 | float sum = 0; 41 | for (int i = 0; i < HIST_SIZE; i++) { 42 | sum += hist_global[i]; 43 | cdf_shared[i] = sum; 44 | } 45 | 46 | // Normalize the CDF 47 | float min_cdf = cdf_shared[0]; 48 | for (int i = 0; i < HIST_SIZE; i++) { 49 | cdf_shared[i] = ((cdf_shared[i] - min_cdf) / (width * height - min_cdf)) * 255.0f; 50 | } 51 | } 52 | __syncthreads(); 53 | 54 | // Apply equalization 55 | if (x < width && y < height) { 56 | d_out[index] = (unsigned char)cdf_shared[d_img[index]]; 57 | } 58 | } -------------------------------------------------------------------------------- /day32/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_DIR := $(CURDIR) 2 | 3 | COLOR_RESET := \033[0m 4 | COLOR_GREEN := \033[32m 5 | COLOR_YELLOW := \033[33m 6 | COLOR_BLUE := \033[34m 7 | COLOR_RED := \033[31m 8 | 9 | HIP_GPU_TARGET := gfx90a 10 | 11 | all: build 12 | 13 | 
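# Example invocation (same pattern as the help target at the bottom of this file):
#   make dir=matmul_kernels program=kernel_1
#   make run dir=matmul_kernels program=kernel_1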
build: $(PROJECT_DIR)/$(dir)/$(program).out 14 | 15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cpp 16 | @echo "$(COLOR_YELLOW)Building HIP program $(program) in directory $(dir)...$(COLOR_RESET)" 17 | @hipcc --offload-arch=$(HIP_GPU_TARGET) -O3 -o $@ $< -lrocblas 18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)" 19 | 20 | run: $(PROJECT_DIR)/$(dir)/$(program).out 21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)" 22 | @./$(dir)/$(program).out 23 | 24 | # Target: Separate rocprof command for kernel profiling 25 | rocprof: $(PROJECT_DIR)/$(dir)/$(program).out 26 | @echo "$(COLOR_BLUE)Running rocprof for kernel trace on $(program).out in directory $(dir)...$(COLOR_RESET)" 27 | @mkdir -p $(PROJECT_DIR)/$(dir)/output 28 | @echo "$(COLOR_GREEN)Kernel profiling completed for $(program).out in $(dir)$(COLOR_RESET)" 29 | 30 | # Target: Generate ISA assembly files and kernel resource usage analysis. 31 | isa: 32 | @echo "$(COLOR_BLUE)Generating ISA and kernel resource usage for $(program) in directory $(dir)...$(COLOR_RESET)" 33 | @mkdir -p $(PROJECT_DIR)/$(dir)/isa_output 34 | @hipcc -c --save-temps=obj -O3 -Rpass-analysis=kernel-resource-usage --offload-arch=$(HIP_GPU_TARGET) -o $(PROJECT_DIR)/$(dir)/isa_output/$(program).o $(PROJECT_DIR)/$(dir)/$(program).cpp 35 | @echo "$(COLOR_GREEN)ISA and resource analysis files saved in $(dir)/isa_output$(COLOR_RESET)" 36 | 37 | clean: 38 | @echo "$(COLOR_RED)Cleaning up .out and ISA files in directory $(dir)...$(COLOR_RESET)" 39 | @rm -f $(PROJECT_DIR)/$(dir)/*.out 40 | @rm -rf $(PROJECT_DIR)/$(dir)/isa_output 41 | @rm -rf $(PROJECT_DIR)/$(dir)/output 42 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)" 43 | 44 | cleanall: 45 | @echo "$(COLOR_RED)Cleaning up all .out and ISA files in all directories...$(COLOR_RESET)" 46 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \; 47 | @find $(PROJECT_DIR) -type d -name "isa_output" -exec rm -rf {} \; 48 | @find $(PROJECT_DIR) -type d -name "output" -exec rm -rf {} \; 49 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)" 50 | 51 | help: 52 | @echo "$(COLOR_BLUE)Usage instructions for HIP Makefile:$(COLOR_RESET)" 53 | @echo "" 54 | @echo "$(COLOR_YELLOW)make dir= program=$(COLOR_RESET) # Build the HIP program .cpp in directory " 55 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory " 56 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory " 57 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 58 | @echo "$(COLOR_YELLOW)make isa dir= program=$(COLOR_RESET) # Generate ISA assembly files and kernel resource usage analysis" 59 | @echo "" 60 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)" 61 | @echo "$(COLOR_GREEN)make dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 62 | @echo "$(COLOR_GREEN)make run dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 63 | @echo "$(COLOR_GREEN)make isa dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 64 | -------------------------------------------------------------------------------- /day32/matmul_kernels/kernel_1/kernel_1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define HIP_CHECK(status) \ 6 | { \ 7 | hipError_t err = status; \ 8 | if (err != hipSuccess) { \ 9 | std::cerr << "HIP error: " 
<< hipGetErrorString(err) \ 10 | << " at line " << __LINE__ << std::endl; \ 11 | exit(err); \ 12 | } \ 13 | } 14 | 15 | __global__ void kernel(float *A, float *B, float *C, int N, int M, int K, float alpha, float beta) { 16 | int row = blockDim.y * blockIdx.y + threadIdx.y; 17 | int col = blockDim.x * blockIdx.x + threadIdx.x; 18 | 19 | if (row < M && col < N) { 20 | float sum = 0.0f; 21 | for (int k = 0; k < K; ++k) { 22 | sum += A[row * K + k] * B[k * N + col]; 23 | } 24 | C[row * N + col] = alpha * sum + beta * C[row * N + col]; 25 | } 26 | } 27 | 28 | // int main() { 29 | // float *A, *B, *C; 30 | // float *d_A, *d_B, *d_C; 31 | 32 | // float alpha, beta; 33 | 34 | // // For simplicity, we use a square matrix. 35 | // int SIZE = 100; 36 | // size_t mem_size = SIZE * SIZE * sizeof(float); 37 | 38 | // alpha = 1.0f; 39 | // beta = 0.0f; 40 | 41 | // A = (float*)malloc(mem_size); 42 | // B = (float*)malloc(mem_size); 43 | // C = (float*)malloc(mem_size); 44 | 45 | // for (int i = 0; i < SIZE * SIZE; ++i) { 46 | // A[i] = i%3; 47 | // B[i] = i%3; 48 | // C[i] = 0.0f; 49 | // } 50 | 51 | // HIP_CHECK(hipMalloc(&d_A, mem_size)); 52 | // HIP_CHECK(hipMalloc(&d_B, mem_size)); 53 | // HIP_CHECK(hipMalloc(&d_C, mem_size)); 54 | 55 | // HIP_CHECK(hipMemcpy(d_A, A, mem_size, hipMemcpyHostToDevice)); 56 | // HIP_CHECK(hipMemcpy(d_B, B, mem_size, hipMemcpyHostToDevice)); 57 | // HIP_CHECK(hipMemcpy(d_C, C, mem_size, hipMemcpyHostToDevice)); 58 | 59 | // dim3 threadsPerBlock(16, 16); 60 | // dim3 blocksPerGrid((SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x, 61 | // (SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y); 62 | 63 | // hipLaunchKernelGGL(kernel, blocksPerGrid, threadsPerBlock, 0, 0, 64 | // d_A, d_B, d_C, SIZE, SIZE, SIZE, alpha, beta); 65 | 66 | // HIP_CHECK(hipDeviceSynchronize()); 67 | // HIP_CHECK(hipMemcpy(C, d_C, mem_size, hipMemcpyDeviceToHost)); 68 | 69 | // std::cout << "Result matrix C (first 10 elements):" << std::endl; 70 | // for (int i = 0; i < 10; ++i) { 71 | // std::cout << C[i] << " "; 72 | // } 73 | // std::cout << std::endl; 74 | 75 | // HIP_CHECK(hipFree(d_A)); 76 | // HIP_CHECK(hipFree(d_B)); 77 | // HIP_CHECK(hipFree(d_C)); 78 | // free(A); 79 | // free(B); 80 | // free(C); 81 | 82 | // return 0; 83 | // } 84 | -------------------------------------------------------------------------------- /day32/matmul_kernels/kernel_2/kernel_2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Macro to check HIP errors. 
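// Note on the tiled kernel below: the shared-memory loads of As/Bs are not
// bounds-checked, so the kernel assumes N is a multiple of TILESIZE (32) and
// that blockDim matches TILESIZE; the main() further down uses N = 1024 with
// TILESIZE x TILESIZE blocks, which satisfies both assumptions.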
5 | #define CHECK_HIP_ERROR(error) \ 6 | { \ 7 | if ((error) != hipSuccess) \ 8 | { \ 9 | std::cerr << "HIP error: " << hipGetErrorString(error) \ 10 | << " at line " << __LINE__ << std::endl; \ 11 | exit(EXIT_FAILURE); \ 12 | } \ 13 | } 14 | 15 | #define TILESIZE 32 16 | 17 | __global__ void kernel(const float *A, const float *B, float *C, int N) 18 | { 19 | __shared__ float As[TILESIZE][TILESIZE]; 20 | __shared__ float Bs[TILESIZE][TILESIZE]; 21 | 22 | int col = blockIdx.x * blockDim.x + threadIdx.x; 23 | int row = blockIdx.y * blockDim.y + threadIdx.y; 24 | 25 | float sum = 0.0f; 26 | 27 | for (int t = 0; t < N; t += TILESIZE) 28 | { 29 | Bs[threadIdx.y][threadIdx.x] = B[(t + threadIdx.y) * N + col]; 30 | As[threadIdx.y][threadIdx.x] = A[row * N + t + threadIdx.x]; 31 | 32 | __syncthreads(); 33 | 34 | for (int k = 0; k < TILESIZE; k++) 35 | { 36 | sum += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 37 | } 38 | 39 | __syncthreads(); 40 | } 41 | if (row < N && col < N) 42 | { 43 | C[row * N + col] = sum; 44 | } 45 | } 46 | 47 | int main(){ 48 | float *A, *B, *C; 49 | float *d_A, *d_B, *d_C; 50 | 51 | int N = 1024; // Size of the matrix 52 | 53 | size_t size = N*N* sizeof(float); 54 | 55 | // Allocate host memory 56 | A = (float *)malloc(size); 57 | B = (float *)malloc(size); 58 | C = (float *)malloc(size); 59 | 60 | for(int i = 0 ; i < N * N ; i++){ 61 | A[i] = i; 62 | B[i] = i; 63 | } 64 | 65 | CHECK_HIP_ERROR(hipMalloc((void**)&d_A,size)); 66 | CHECK_HIP_ERROR(hipMalloc((void**)&d_B,size)); 67 | CHECK_HIP_ERROR(hipMalloc((void**)&d_C,size)); 68 | 69 | CHECK_HIP_ERROR(hipMemcpy(d_A, A, size, hipMemcpyHostToDevice)); 70 | CHECK_HIP_ERROR(hipMemcpy(d_B, B, size, hipMemcpyHostToDevice)); 71 | 72 | dim3 Threads(TILESIZE, TILESIZE); 73 | dim3 Blocks((N+Threads.x-1)/Threads.x, (N+Threads.y-1)/Threads.y); 74 | hipLaunchKernelGGL(kernel, Blocks, Threads, 0, 0, d_A, d_B, d_C, N); 75 | 76 | CHECK_HIP_ERROR(hipMemcpy(C, d_C, size, hipMemcpyDeviceToHost)); 77 | 78 | // Check the result 79 | for(int i = 0 ; i < 10 ; i++){ 80 | for(int j = 0 ; j < 10 ; j++){ 81 | std::cout << C[i*N+j] << " "; 82 | } 83 | std::cout << std::endl; 84 | } 85 | 86 | CHECK_HIP_ERROR(hipFree(d_A)); 87 | CHECK_HIP_ERROR(hipFree(d_B)); 88 | CHECK_HIP_ERROR(hipFree(d_C)); 89 | 90 | free(A); 91 | free(B); 92 | free(C); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /day33/load_in_pytorch/kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Your HIP kernel remains the same. 5 | extern "C" __global__ void kernel_addition(const float *A, const float *B, float *C, size_t N) { 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx < N) { 8 | C[idx] = A[idx] + B[idx]; 9 | } 10 | } 11 | 12 | // Host wrapper function that launches the kernel. 13 | // This function will be callable from Python. 14 | extern "C" void launch_kernel_addition(const float *A, const float *B, float *C, size_t N, 15 | int grid_x, int grid_y, int grid_z, 16 | int block_x, int block_y, int block_z) { 17 | // Create dim3 objects for grid and block dimensions. 18 | dim3 grid(grid_x, grid_y, grid_z); 19 | dim3 block(block_x, block_y, block_z); 20 | 21 | // Launch the kernel with the provided configuration. 22 | hipLaunchKernelGGL(kernel_addition, grid, block, 0, 0, A, B, C, N); 23 | 24 | // Wait for the kernel to finish. 
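// hipDeviceSynchronize() below blocks the host, so by the time this wrapper
// returns to the Python caller the output buffer C already holds the result.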
25 | hipDeviceSynchronize(); 26 | } 27 | -------------------------------------------------------------------------------- /day33/load_in_pytorch/kernel.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day33/load_in_pytorch/kernel.so -------------------------------------------------------------------------------- /day33/load_in_pytorch/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ctypes 3 | import time 4 | 5 | # Load the shared object 6 | lib = ctypes.CDLL('./kernel.so') 7 | 8 | # Specify the argument types for the host wrapper function. 9 | lib.launch_kernel_addition.argtypes = [ 10 | ctypes.c_void_p, # pointer to A 11 | ctypes.c_void_p, # pointer to B 12 | ctypes.c_void_p, # pointer to C 13 | ctypes.c_size_t, # N 14 | ctypes.c_int, # grid_x 15 | ctypes.c_int, # grid_y 16 | ctypes.c_int, # grid_z 17 | ctypes.c_int, # block_x 18 | ctypes.c_int, # block_y 19 | ctypes.c_int # block_z 20 | ] 21 | lib.launch_kernel_addition.restype = None 22 | 23 | N = 1000 24 | 25 | # Create input tensors on the ROCm device. 26 | A = torch.randn(N, device='cuda', dtype=torch.float32) 27 | B = torch.randn(N, device='cuda', dtype=torch.float32) 28 | C = torch.empty(N, device='cuda', dtype=torch.float32) 29 | 30 | # Get pointers to the tensor data. 31 | a_ptr = A.data_ptr() 32 | b_ptr = B.data_ptr() 33 | c_ptr = C.data_ptr() 34 | 35 | # Define block and grid sizes. 36 | block_size = 256 37 | grid_size = (N + block_size - 1) // block_size 38 | 39 | def measure_amd_kernel_time(): 40 | start_amd = time.time() 41 | lib.launch_kernel_addition( 42 | ctypes.c_void_p(a_ptr), 43 | ctypes.c_void_p(b_ptr), 44 | ctypes.c_void_p(c_ptr), 45 | ctypes.c_size_t(N), 46 | ctypes.c_int(grid_size), # grid_x 47 | ctypes.c_int(1), # grid_y 48 | ctypes.c_int(1), # grid_z 49 | ctypes.c_int(block_size), # block_x 50 | ctypes.c_int(1), # block_y 51 | ctypes.c_int(1) # block_z 52 | ) 53 | torch.cuda.synchronize() # Ensure the kernel has finished executing 54 | end_amd = time.time() 55 | return end_amd - start_amd 56 | 57 | def measure_pytorch_time(): 58 | start_pytorch = time.time() 59 | c_pytorch = A + B 60 | end_pytorch = time.time() 61 | return end_pytorch - start_pytorch 62 | 63 | # Run the measurements 5 times and get the lowest time 64 | amd_times = [measure_amd_kernel_time() for _ in range(5)] 65 | pytorch_times = [measure_pytorch_time() for _ in range(5)] 66 | 67 | min_amd_time = min(amd_times) 68 | min_pytorch_time = min(pytorch_times) 69 | 70 | # Verify the result. 
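# C was last written by the HIP kernel inside measure_amd_kernel_time(), so this
# allclose check compares the custom kernel's output against the PyTorch A + B.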
71 | if torch.allclose(C, A + B):
72 | print("Success!")
73 | else:
74 | print("Error in computation.")
75 | 
76 | print(f"Lowest AMD kernel execution time: {min_amd_time} seconds")
77 | print(f"Lowest Pytorch computation time: {min_pytorch_time} seconds")
78 | 
-------------------------------------------------------------------------------- /day34/tensor_lib/test1.cpp: --------------------------------------------------------------------------------
1 | #include <torch/torch.h>
2 | #include <hip/hip_runtime.h>
3 | #include <hiprand/hiprand_kernel.h>
4 | 
5 | __global__ void kernel_noise_image(float *X, float *e, const float *alpha_hat, int N)
6 | {
7 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
8 | __shared__ float sqrt_alphas[2];
9 | 
10 | if (threadIdx.x == 0) {
11 | sqrt_alphas[0] = sqrtf(*alpha_hat);
12 | sqrt_alphas[1] = sqrtf(1.0f - *alpha_hat);
13 | }
14 | 
15 | __syncthreads();
16 | 
17 | if (idx < N) {
18 | hiprandState_t state; hiprand_init(1234ULL, idx, 0, &state); e[idx] = hiprand_normal(&state); // per-thread RNG state for the Gaussian noise
19 | 
20 | X[idx] = sqrt_alphas[0] * X[idx] + sqrt_alphas[1] * e[idx];
21 | }
22 | }
23 | 
24 | torch::Tensor noiseImage(torch::Tensor X, int t, torch::Tensor alpha_hat)
25 | {
26 | torch::Tensor alpha_at_t = alpha_hat.index({t});
27 | 
28 | float *d_X, *d_e, *d_alpha_hat;
29 | int N = X.numel();
30 | 
31 | hipMalloc(&d_X, N * sizeof(float));
32 | hipMalloc(&d_e, N * sizeof(float));
33 | hipMalloc(&d_alpha_hat, sizeof(float));
34 | 
35 | hipMemcpy(d_X, X.data_ptr<float>(), N * sizeof(float), hipMemcpyHostToDevice);
36 | hipMemcpy(d_alpha_hat, alpha_at_t.data_ptr<float>(), sizeof(float), hipMemcpyHostToDevice);
37 | 
38 | int blockSize = 256;
39 | int numBlocks = (N + blockSize - 1) / blockSize;
40 | 
41 | kernel_noise_image<<<numBlocks, blockSize>>>(d_X, d_e, d_alpha_hat, N);
42 | 
43 | hipDeviceSynchronize();
44 | 
45 | hipMemcpy(X.data_ptr<float>(), d_X, N * sizeof(float), hipMemcpyDeviceToHost);
46 | 
47 | hipFree(d_X);
48 | hipFree(d_e);
49 | hipFree(d_alpha_hat);
50 | 
51 | return X;
52 | }
53 | 
54 | int main()
55 | {
56 | torch::Tensor X = torch::rand({1, 3, 64, 64}, torch::kFloat32);
57 | torch::Tensor alpha_hat = torch::rand({1000}, torch::kFloat32);
58 | 
59 | int t = 500;
60 | 
61 | X = noiseImage(X, t, alpha_hat);
62 | 
63 | std::cout << "Noisy image tensor shape: " << X.sizes() << std::endl;
64 | 
65 | return 0;
66 | }
67 | 
-------------------------------------------------------------------------------- /day34/tensor_lib/test1.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day34/tensor_lib/test1.out
-------------------------------------------------------------------------------- /day36/random.cpp: --------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 | #include <cstdlib>
4 | #include <cmath>
5 | 
6 | #define BLOCK_SIZE 256
7 | 
8 | __global__ void reductionKernelOptimized(const float *g_in, float *g_out, int n) {
9 | extern __shared__ float sdata[];
10 | 
11 | unsigned int tid = threadIdx.x;
12 | unsigned int idx = blockIdx.x * (BLOCK_SIZE * 2) + tid;
13 | 
14 | float mySum = 0.0f;
15 | if (idx < n)
16 | mySum = g_in[idx];
17 | if (idx + BLOCK_SIZE < n)
18 | mySum += g_in[idx + BLOCK_SIZE];
19 | 
20 | sdata[tid] = mySum;
21 | __syncthreads();
22 | 
23 | for (unsigned int s = BLOCK_SIZE / 2; s > 32; s >>= 1) {
24 | if (tid < s)
25 | sdata[tid] += sdata[tid + s];
26 | __syncthreads();
27 | }
28 | 
29 | if (tid < 32) {
30 | volatile float *vsmem = sdata;
31 | vsmem[tid] += vsmem[tid + 32];
32 | vsmem[tid] += vsmem[tid + 16];
33 | vsmem[tid] += vsmem[tid + 8];
34 | vsmem[tid] += vsmem[tid +
4]; 35 | vsmem[tid] += vsmem[tid + 2]; 36 | vsmem[tid] += vsmem[tid + 1]; 37 | } 38 | 39 | if (tid == 0) 40 | g_out[blockIdx.x] = sdata[0]; 41 | } 42 | 43 | int main() { 44 | int n = 1 << 20; 45 | size_t size = n * sizeof(float); 46 | 47 | float *h_array = (float*)malloc(size); 48 | for (int i = 0; i < n; i++) { 49 | h_array[i] = 1.0f; 50 | } 51 | 52 | float *d_in, *d_out; 53 | hipMalloc(&d_in, size); 54 | int numBlocks = (n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2); 55 | hipMalloc(&d_out, numBlocks * sizeof(float)); 56 | 57 | hipMemcpy(d_in, h_array, size, hipMemcpyHostToDevice); 58 | 59 | size_t sharedMemSize = BLOCK_SIZE * sizeof(float); 60 | hipLaunchKernelGGL(reductionKernelOptimized, dim3(numBlocks), dim3(BLOCK_SIZE), sharedMemSize, 0, d_in, d_out, n); 61 | hipDeviceSynchronize(); 62 | 63 | float *h_partialSums = (float*)malloc(numBlocks * sizeof(float)); 64 | hipMemcpy(h_partialSums, d_out, numBlocks * sizeof(float), hipMemcpyDeviceToHost); 65 | 66 | float sum = 0.0f; 67 | for (int i = 0; i < numBlocks; i++) { 68 | sum += h_partialSums[i]; 69 | } 70 | printf("Reduction result: %f (expected %f)\n", sum, (float)n); 71 | 72 | free(h_array); 73 | free(h_partialSums); 74 | hipFree(d_in); 75 | hipFree(d_out); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /day37/MultiStreams/MHA.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define HEADS 8 6 | #define SEQ_LEN 128 7 | #define DIM 768 // head dimension 8 | 9 | __global__ void addition(const float* query, const float* key, const float* value, 10 | float* output, int seq_len, int dim, int head_id) 11 | { 12 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | int total = seq_len * dim; // work items per head 14 | if (idx < total) { 15 | int seq = idx / dim; // which seq | on row 16 | int d = idx % dim; // pos in seq | on col 17 | 18 | int offset = head_id * (seq_len * dim) + seq * dim; 19 | output[offset + d] = query[offset + d] + key[offset + d] + value[offset + d]; 20 | } 21 | } 22 | 23 | int main(){ 24 | size_t total_elements = HEADS * SEQ_LEN * DIM; 25 | size_t size = total_elements * sizeof(float); 26 | 27 | // Create one HIP stream per head. 
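// Giving every head its own stream lets its host-to-device copies, kernel launch, and
// device-to-host copy overlap with the other heads' work instead of serializing on one stream.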
28 | hipStream_t streams[HEADS]; 29 | for (int i = 0; i < HEADS; i++){ 30 | hipStreamCreate(&streams[i]); 31 | } 32 | 33 | float *key = (float*)malloc(size); 34 | float *value = (float*)malloc(size); 35 | float *query = (float*)malloc(size); 36 | float *output= (float*)malloc(size); 37 | 38 | for (size_t i = 0; i < total_elements; i++){ 39 | key[i] = 3.0f; 40 | value[i] = 5.0f; 41 | query[i] = 6.0f; 42 | } 43 | 44 | float *d_key, *d_value, *d_query, *d_output; 45 | hipMalloc(&d_key, size); 46 | hipMalloc(&d_value, size); 47 | hipMalloc(&d_query, size); 48 | hipMalloc(&d_output, size); 49 | 50 | size_t headSize = SEQ_LEN * DIM * sizeof(float); 51 | 52 | // [HEADS][SEQ_LEN][DIM] 53 | for (int head = 0; head < HEADS; head++){ 54 | int offset = head * SEQ_LEN * DIM; 55 | hipMemcpyAsync(d_key + offset, key + offset, headSize, hipMemcpyHostToDevice, streams[head]); 56 | hipMemcpyAsync(d_value + offset, value + offset, headSize, hipMemcpyHostToDevice, streams[head]); 57 | hipMemcpyAsync(d_query + offset, query + offset, headSize, hipMemcpyHostToDevice, streams[head]); 58 | } 59 | 60 | int threadsPerBlock = 256; // threads per block 16x16 layout 61 | int totalWork = SEQ_LEN * DIM; // elements in a head 62 | int blocks = (totalWork + threadsPerBlock - 1) / threadsPerBlock; 63 | 64 | for (int head = 0; head < HEADS; head++){ 65 | hipLaunchKernelGGL(addition, dim3(blocks), dim3(threadsPerBlock), 0, streams[head], 66 | d_query, d_key, d_value, d_output, SEQ_LEN, DIM, head); 67 | } 68 | 69 | for (int head = 0; head < HEADS; head++){ 70 | int offset = head * SEQ_LEN * DIM; 71 | hipMemcpyAsync(output + offset, d_output + offset, headSize, 72 | hipMemcpyDeviceToHost, streams[head]); 73 | } 74 | 75 | hipDeviceSynchronize(); 76 | 77 | 78 | for (int i = 0; i < HEADS; i++){ 79 | hipStreamDestroy(streams[i]); 80 | } 81 | 82 | hipFree(d_key); 83 | hipFree(d_value); 84 | hipFree(d_query); 85 | hipFree(d_output); 86 | free(key); 87 | free(value); 88 | free(query); 89 | free(output); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /day37/MultiStreams/MHA.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/MHA.out -------------------------------------------------------------------------------- /day37/MultiStreams/results.copy_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "CopyDeviceToHost",8,894880,111860,52.82395164336985 3 | "CopyHostToDevice",24,799200,33300,47.17604835663015 4 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/results.db -------------------------------------------------------------------------------- /day37/MultiStreams/results.hip_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "hipStreamCreate",8,287069631,35883703,96.91385734531264 3 | "hipMemcpyAsync",32,6984316,218259,2.357884399407559 4 | "hipStreamDestroy",8,1219926,152490,0.41184340511392464 5 | "hipLaunchKernel",8,587473,73434,0.19832914515510996 6 | 
"hipMalloc",4,186051,46512,0.06281026665949475 7 | "hipFree",4,147901,36975,0.049930939630563304 8 | "hipDeviceSynchronize",1,12841,12841,0.004335083574797083 9 | "__hipPushCallConfiguration",8,1710,213,0.0005772909362902432 10 | "__hipPopCallConfiguration",8,1280,160,0.0004321242096207668 11 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.hsa_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "hsa_queue_create",4,47570153,11892538,77.44727724772702 3 | "hsa_amd_memory_pool_allocate",24,6784269,282677,11.045227501499935 4 | "hsa_signal_wait_scacquire",74,1694809,22902,2.7592583632207988 5 | "hsa_amd_memory_async_copy",32,1068866,33402,1.7401827873597333 6 | "hsa_amd_memory_pool_free",20,837405,41870,1.363349350666012 7 | "hsa_agent_get_info",65,764114,11755,1.2440268755677468 8 | "hsa_executable_load_agent_code_object",2,604473,302236,0.9841210310962273 9 | "hsa_amd_agents_allow_access",20,537142,26857,0.8745018204040375 10 | "hsa_signal_create",547,500153,914,0.8142813426999574 11 | "hsa_executable_freeze",2,332962,166481,0.5420836112710775 12 | "hsa_signal_load_relaxed",1496,123321,82,0.20077454191637648 13 | "hsa_signal_destroy",546,102760,188,0.1672999077799146 14 | "hsa_code_object_reader_create_from_memory",2,97010,48505,0.15793853691834872 15 | "hsa_amd_signal_async_handler",32,82190,2568,0.13381062106297373 16 | "hsa_isa_get_info_alt",2,74801,37400,0.12178085248973718 17 | "hsa_executable_iterate_symbols",16,55820,3488,0.09087856025958381 18 | "hsa_executable_create_alt",2,24260,12130,0.039496844713319657 19 | "hsa_iterate_agents",1,24220,24220,0.039431722133413116 20 | "hsa_amd_pointer_info",128,21370,166,0.03479173831507177 21 | "hsa_executable_symbol_get_info",260,15970,61,0.026000190027688167 22 | "hsa_signal_store_screlease",48,15030,313,0.024469809399884357 23 | "hsa_amd_agent_iterate_memory_pools",4,13870,3467,0.022581254582594544 24 | "hsa_amd_profiling_get_async_copy_time",32,13490,421,0.021962590073482367 25 | "hsa_queue_load_read_index_relaxed",48,12480,260,0.0203182449308421 26 | "hsa_amd_profiling_set_profiler_enabled",4,9170,2292,0.014929351443575484 27 | "hsa_amd_profiling_get_dispatch_time",16,7110,444,0.011575538578388408 28 | "hsa_executable_get_symbol_by_name",16,6530,408,0.010631261169743502 29 | "hsa_queue_add_write_index_screlease",48,5490,114,0.008938074092173327 30 | "hsa_signal_silent_store_relaxed",80,5350,66,0.008710145062500419 31 | "hsa_amd_profiling_async_copy_enable",8,4790,598,0.007798428943808787 32 | "hsa_amd_memory_pool_get_info",27,4010,148,0.006528538635631156 33 | "hsa_queue_load_read_index_scacquire",48,3000,62,0.00488419349299089 34 | "hsa_amd_memory_copy_engine_status",2,2180,1090,0.0035491806049067127 35 | "hsa_agent_iterate_isas",1,1860,1860,0.0030281999656543513 36 | "hsa_amd_agent_memory_pool_get_info",9,1470,163,0.0023932548115655357 37 | "hsa_system_get_info",4,370,92,0.000602383864135543 38 | "hsa_system_get_major_extension_table",1,360,360,0.0005861032191589067 39 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "addition(float const*, float const*, float const*, float*, int, int, int)",8,42080,5260,100.0 3 | 
-------------------------------------------------------------------------------- /day38/myreduction.cpp: --------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 | #include <cfloat> // for FLT_MAX
4 | #include <vector>
5 | #ifndef WARP_SIZE
6 | #define WARP_SIZE 64
7 | #endif
8 | 
9 | #define HIPCHECK(error) \
10 | { \
11 | if ((error) != hipSuccess) \
12 | { \
13 | std::cerr << "HIP error: " << hipGetErrorString(error) \
14 | << " at line " << __LINE__ << std::endl; \
15 | exit(EXIT_FAILURE); \
16 | } \
17 | }
18 | 
19 | 
20 | template <typename scalar_t>
21 | __global__ void reduce_max_1d(const scalar_t *__restrict__ input,
22 | scalar_t *__restrict__ output,
23 | int n)
24 | {
25 | extern __shared__ float sdata[]; // one slot per warp
26 | const uint32_t tid = threadIdx.x;
27 | const uint32_t i = blockIdx.x * (blockDim.x * 2) + tid;
28 | const uint32_t lane = tid % WARP_SIZE;
29 | const uint32_t warp_id = tid / WARP_SIZE;
30 | float max_val = -FLT_MAX;
31 | if (i < n)
32 | max_val = input[i];
33 | if (i + blockDim.x < n)
34 | max_val = fmaxf(max_val, input[i + blockDim.x]);
35 | 
36 | for (uint32_t offset = WARP_SIZE / 2; offset > 0; offset /= 2) // warp-level reduction with shuffles
37 | {
38 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
39 | }
40 | 
41 | 
42 | if (lane == 0)
43 | {
44 | sdata[warp_id] = max_val;
45 | }
46 | __syncthreads();
47 | 
48 | 
49 | const uint32_t numWarps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
50 | if (tid < numWarps)
51 | {
52 | max_val = sdata[lane];
53 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
54 | {
55 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
56 | }
57 | if (lane == 0)
58 | sdata[tid] = max_val;
59 | }
60 | __syncthreads();
61 | 
62 | if (tid == 0)
63 | output[blockIdx.x] = sdata[0]; // per-block maximum
64 | }
65 | 
66 | 
67 | int main()
68 | {
69 | const int n = 102400;
70 | std::vector<float> h_input(n, 1.0f);
71 | h_input[500] = 133.0f;
72 | 
73 | const int threadsPerBlock = 256;
74 | const int blocks = (n + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2);
75 | const size_t sharedMemSize = ((threadsPerBlock + WARP_SIZE - 1) / WARP_SIZE) * sizeof(float);
76 | 
77 | float *d_input;
78 | float *d_output;
79 | HIPCHECK(hipMalloc(&d_input, n * sizeof(float)));
80 | HIPCHECK(hipMalloc(&d_output, blocks * sizeof(float))); // one partial maximum per block
81 | 
82 | HIPCHECK(hipMemcpy(d_input, h_input.data(), n * sizeof(float), hipMemcpyHostToDevice));
83 | 
84 | hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_max_1d<float>), dim3(blocks), dim3(threadsPerBlock), sharedMemSize, 0, d_input, d_output, n);
85 | 
86 | std::vector<float> h_partial(blocks);
87 | HIPCHECK(hipMemcpy(h_partial.data(), d_output, blocks * sizeof(float), hipMemcpyDeviceToHost));
88 | 
89 | float h_output = -FLT_MAX;
90 | for (int b = 0; b < blocks; b++)
91 | if (h_partial[b] > h_output) h_output = h_partial[b];
92 | 
93 | HIPCHECK(hipFree(d_input));
94 | HIPCHECK(hipFree(d_output));
95 | 
96 | std::cout << "Maximum value: " << h_output << std::endl;
97 | 
98 | return 0;
99 | }
-------------------------------------------------------------------------------- /day42/mat_mul.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | import triton
4 | import triton.language as tl
5 | 
6 | 
7 | DEVICE = torch.device("cuda:0")
8 | 
9 | 
10 | @triton.jit
11 | def matmul_kernel(
12 | a_ptr, b_ptr, c_ptr,
13 | M, N, K,
14 | stride_am, stride_ak,
15 | stride_bk, stride_bn,
16 | stride_cm, stride_cn,
17 | BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, #
18 | GROUP_SIZE_M: tl.constexpr, #
19 | ACTIVATION: tl.constexpr #
20 | ):
21 | pid = tl.program_id(axis=0)
22 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
23 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
24 |
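# Grouped ("swizzled") program ordering: consecutive program IDs first walk down a
# GROUP_SIZE_M-tall column of C tiles before moving to the next column, so programs that run
# concurrently reuse the same tiles of A and B from L2 cache.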
num_pid_in_group = GROUP_SIZE_M * num_pid_n 25 | 26 | group_id = pid // num_pid_in_group 27 | first_pid_m = group_id * GROUP_SIZE_M 28 | 29 | group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 30 | 31 | pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) 32 | pid_n = (pid % num_pid_in_group) // group_size_m 33 | 34 | offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M 35 | offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N 36 | 37 | offs_k = tl.arange(0, BLOCK_SIZE_K) 38 | 39 | a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) 40 | b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) 41 | 42 | accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) 43 | for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): 44 | 45 | a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) 46 | b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) 47 | 48 | accumulator = tl.dot(a, b, accumulator) 49 | 50 | a_ptrs += BLOCK_SIZE_K * stride_ak 51 | b_ptrs += BLOCK_SIZE_K * stride_bk 52 | if ACTIVATION == "leaky_relu": 53 | accumulator = leaky_relu(accumulator) 54 | c = accumulator.to(tl.float16) 55 | 56 | offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) 57 | offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) 58 | 59 | c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] 60 | c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) 61 | 62 | tl.store(c_ptrs, c, mask=c_mask) 63 | 64 | 65 | @triton.jit 66 | def leaky_relu(x): 67 | return tl.where(x >= 0, x, 0.01 * x) 68 | 69 | 70 | def matmul(a, b, activation=""): 71 | assert a.shape[1] == b.shape[0], "Incompatible dimensions" 72 | assert a.is_contiguous(), "Matrix A must be contiguous" 73 | M, K = a.shape 74 | K, N = b.shape 75 | 76 | c = torch.empty((M, N), device=a.device, dtype=torch.float16) 77 | 78 | grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) 79 | 80 | matmul_kernel[grid]( 81 | a, b, c, 82 | M, N, K, 83 | a.stride(0), a.stride(1), 84 | b.stride(0), b.stride(1), 85 | c.stride(0), c.stride(1), 86 | ACTIVATION=activation 87 | ) 88 | return c 89 | 90 | -------------------------------------------------------------------------------- /day42/mat_mul_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def get_1d_offest(size,n_prev_chunks): 8 | return n_prev_chunks * size + tl.arange(0,size) 9 | 10 | @triton.jit 11 | def get_2d_offest(offs_0,offs_1,stride_0,stride_1): 12 | return tl.expand_dims(offs_0,1)*stride_0 + tl.expand_dims(offs_1,0)*stride_1 13 | 14 | @triton.jit 15 | def get_1d_mask(offs,max): 16 | return offs 2 | 3 | namespace cg = cooperative_groups; 4 | 5 | template 6 | __device__ T reduce_sum(cg::thread_block_tile<32>& group, T val) { 7 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 8 | T temp = group.shfl_down(val, offset); 9 | val += temp; 10 | } 11 | return val; 12 | } 13 | 14 | template 15 | __device__ T reduce_max(cg::thread_block_tile<32>& group, T val) { 16 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 17 | T temp = group.shfl_down(val, offset); 18 | val = max(val, temp); 19 | } 20 | return val; 21 | } 22 | 23 | template 24 | __device__ T reduce_sum(cg::thread_block_tile<64>& group, T val) { 25 | for (int offset = group.size()/2; offset > 0; 
offset /= 2) { 26 | T temp = group.shfl_down(val, offset); 27 | val += temp; 28 | } 29 | return val; 30 | } 31 | 32 | template 33 | __device__ T reduce_max(cg::thread_block_tile<64>& group, T val) { 34 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 35 | T temp = group.shfl_down(val, offset); 36 | val = max(val, temp); 37 | } 38 | return val; 39 | } 40 | 41 | __device__ cg::thread_block this_thread_block() { 42 | return cg::this_thread_block(); 43 | } 44 | 45 | template 46 | __device__ cg::thread_block_tile tiled_partition(cg::thread_block& block) { 47 | return cg::tiled_partition(block); 48 | } -------------------------------------------------------------------------------- /day48/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def gelu_kernel( 7 | x_ptr, 8 | output_ptr, 9 | n_elements, 10 | BLOCK_SIZE: tl.constexpr, 11 | ): 12 | pid = tl.program_id(axis=0) 13 | block_start = pid * BLOCK_SIZE 14 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 15 | 16 | mask = offsets < n_elements 17 | 18 | x = tl.load(x_ptr + offsets, mask=mask) 19 | 20 | coeff1 = 0.7978845608028654 21 | coeff2 = 0.044715 22 | x_cubed = x * x * x 23 | inner = coeff1 * (x + coeff2 * x_cubed) 24 | tanh = tl.math.tanh(inner) 25 | output = 0.5 * x * (1.0 + tanh) 26 | 27 | tl.store(output_ptr + offsets, output, mask=mask) 28 | 29 | def fused_gelu(x: torch.Tensor): 30 | output = torch.empty_like(x) 31 | 32 | n_elements = x.numel() 33 | 34 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 35 | 36 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024) 37 | 38 | return output 39 | 40 | if __name__ == "__main__": 41 | torch.manual_seed(0) 42 | 43 | x = torch.randn(1000000, device='cuda') 44 | 45 | triton_output = fused_gelu(x) 46 | 47 | torch_output = torch.nn.functional.gelu(x) 48 | 49 | print(f"Maximum absolute error: {torch.max(torch.abs(triton_output - torch_output)):.2e}") 50 | print(f"Results match: {torch.allclose(triton_output, torch_output, atol=1e-5)}") -------------------------------------------------------------------------------- /day49/kernel.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | import time 5 | 6 | @triton.jit 7 | def fused_bias_skip_gelu_scale_kernel( 8 | x_ptr, 9 | bias_ptr, 10 | skip_ptr, 11 | gamma_ptr, 12 | y_ptr, 13 | n_elements: tl.constexpr 14 | ): 15 | BLOCK_SIZE = 1024 16 | pid = tl.program_id(0) 17 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < n_elements 19 | x = tl.load(x_ptr + offsets, mask=mask) 20 | bias = tl.load(bias_ptr + offsets, mask=mask) 21 | skip = tl.load(skip_ptr + offsets, mask=mask) 22 | gamma = tl.load(gamma_ptr + offsets, mask=mask) 23 | temp = x + bias + skip 24 | gelu = 0.5 * temp * (1.0 + tl.tanh(0.7978845608028654 * (temp + 0.044715 * temp * temp * temp))) 25 | out = gelu * gamma 26 | tl.store(y_ptr + offsets, out, mask=mask) 27 | 28 | def test_fused_kernel(): 29 | n_elements = 2048 30 | BLOCK_SIZE = 1024 31 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32) 32 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32) 33 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32) 34 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32) 35 | y = torch.empty_like(x) 36 | grid = ((n_elements + BLOCK_SIZE - 1) // 
BLOCK_SIZE,) 37 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 38 | torch.cuda.synchronize() 39 | temp = x + bias + skip 40 | gelu = 0.5 * temp * (1.0 + torch.tanh(0.7978845608028654 * (temp + 0.044715 * temp ** 3))) 41 | ref = gelu * gamma 42 | if torch.allclose(y, ref, atol=1e-6): 43 | print("Test passed! Kernel output matches reference.") 44 | else: 45 | print("Test failed! Maximum absolute error:", (y - ref).abs().max().item()) 46 | 47 | def benchmark_kernel(): 48 | n_elements = 2048 49 | BLOCK_SIZE = 1024 50 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32) 51 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32) 52 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32) 53 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32) 54 | y = torch.empty_like(x) 55 | grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE,) 56 | for _ in range(10): 57 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 58 | torch.cuda.synchronize() 59 | n_iter = 100 60 | start = time.time() 61 | for _ in range(n_iter): 62 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 63 | torch.cuda.synchronize() 64 | end = time.time() 65 | avg_time_ms = (end - start) / n_iter * 1000 66 | print(f"Average kernel time over {n_iter} iterations: {avg_time_ms:.3f} ms") 67 | 68 | if __name__ == "__main__": 69 | test_fused_kernel() 70 | benchmark_kernel() 71 | -------------------------------------------------------------------------------- /day50/tritonnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def gelu_kernel( 7 | x_ptr, 8 | output_ptr, 9 | n_elements, 10 | BLOCK_SIZE: tl.constexpr, 11 | ): 12 | pid = tl.program_id(axis=0) 13 | block_start = pid * BLOCK_SIZE 14 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 15 | mask = offsets < n_elements 16 | x = tl.load(x_ptr + offsets, mask=mask) 17 | sqrt_2_over_pi = tl.sqrt(2.0 / tl.math.pi) 18 | cdf = 0.5 * (1.0 + tl.tanh(sqrt_2_over_pi * (x + 0.044715 * (x ** 3)))) 19 | output = x * cdf 20 | tl.store(output_ptr + offsets, output, mask=mask) 21 | 22 | @triton.jit 23 | def fused_add_multiply_kernel( 24 | a_ptr, b_ptr, c_ptr, output_ptr, 25 | n_elements, 26 | BLOCK_SIZE: tl.constexpr, 27 | ): 28 | pid = tl.program_id(axis=0) 29 | block_start = pid * BLOCK_SIZE 30 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 31 | mask = offsets < n_elements 32 | a = tl.load(a_ptr + offsets, mask=mask) 33 | b = tl.load(b_ptr + offsets, mask=mask) 34 | c = tl.load(c_ptr + offsets, mask=mask) 35 | output = (a + b) * c 36 | tl.store(output_ptr + offsets, output, mask=mask) 37 | 38 | class GELUTriton(torch.autograd.Function): 39 | @staticmethod 40 | def forward(ctx, x): 41 | x = x.contiguous() 42 | output = torch.empty_like(x) 43 | n_elements = output.numel() 44 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 45 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024) 46 | ctx.save_for_backward(x) 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, grad_output): 51 | x, = ctx.saved_tensors 52 | with torch.enable_grad(): 53 | x = x.detach().requires_grad_() 54 | with torch.cuda.amp.autocast(): 55 | output = GELUTriton.apply(x) 56 | grad_input = torch.autograd.grad( 57 | output, x, grad_output, create_graph=True 58 | )[0] 59 | return grad_input 60 | 61 | def fused_add_multiply(a, b, c): 62 
| assert a.shape == b.shape == c.shape 63 | a, b, c = a.contiguous(), b.contiguous(), c.contiguous() 64 | output = torch.empty_like(a) 65 | n_elements = output.numel() 66 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 67 | fused_add_multiply_kernel[grid](a, b, c, output, n_elements, BLOCK_SIZE=1024) 68 | return output 69 | 70 | class TritonNN(torch.nn.Module): 71 | def __init__(self, in_features, hidden_size, out_features): 72 | super().__init__() 73 | self.fc1 = torch.nn.Linear(in_features, hidden_size) 74 | self.fc2 = torch.nn.Linear(hidden_size, out_features) 75 | 76 | def forward(self, x): 77 | x = self.fc1(x) 78 | x = GELUTriton.apply(x) 79 | 80 | residual = x 81 | a = x 82 | b = torch.ones_like(x) * 0.5 83 | c = torch.ones_like(x) * 1.5 84 | x = fused_add_multiply(a, b, c) 85 | x += residual 86 | 87 | return self.fc2(x) 88 | 89 | if __name__ == "__main__": 90 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 91 | model = TritonNN(784, 256, 10).to(device) 92 | x = torch.randn(32, 784).to(device) 93 | output = model(x) 94 | print("Output shape:", output.shape) 95 | print("Output values:", output[0, :5]) -------------------------------------------------------------------------------- /day52/functionsused.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import numpy as np 3 | 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def init_matrix(matrix, seed: tl.constexpr): 8 | idx = tl.arange(0, matrix.shape[0]) 9 | matrix[idx] = tl.random(seed + idx) 10 | 11 | @triton.jit 12 | def add_matrices(a, b, result): 13 | idx = tl.arange(0, a.shape[0]) 14 | result[idx] = a[idx] + b[idx] 15 | 16 | @triton.jit 17 | def multiply_matrices(a, b, result): 18 | idx = tl.arange(0, a.shape[0]) 19 | result[idx] = a[idx] * b[idx] 20 | 21 | @triton.jit 22 | def transpose_matrix(matrix, result): 23 | idx = tl.arange(0, matrix.shape[0]) 24 | idy = tl.arange(0, matrix.shape[1]) 25 | result[idy, idx] = matrix[idx, idy] 26 | 27 | @triton.jit 28 | def matmul_kernel(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): 29 | pid = tl.program_id(0) 30 | row = pid // N 31 | col = pid % N 32 | 33 | acc = 0.0 34 | for k in range(K): 35 | acc += a[row, k] * b[k, col] 36 | 37 | c[row, col] = acc 38 | 39 | if __name__ == "__main__": 40 | 41 | M, N, K = 128, 128, 128 42 | a = np.random.rand(M, K).astype(np.float32) 43 | b = np.random.rand(K, N).astype(np.float32) 44 | c = np.zeros((M, N), dtype=np.float32) 45 | 46 | a_dev = triton.testing.to_device(a) 47 | b_dev = triton.testing.to_device(b) 48 | c_dev = triton.testing.to_device(c) 49 | 50 | grid = (M * N,) 51 | matmul_kernel[grid](a_dev, b_dev, c_dev, M, N, K) 52 | 53 | c = c_dev.cpu() 54 | print(c) -------------------------------------------------------------------------------- /day54/softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def softmax_kernel( 7 | output_ptr, input_ptr, 8 | input_row_stride, output_row_stride, 9 | n_cols, 10 | BLOCK_SIZE: tl.constexpr 11 | ): 12 | row_idx = tl.program_id(0) 13 | row_start = row_idx * input_row_stride 14 | 15 | col_offsets = tl.arange(0, BLOCK_SIZE) 16 | input_ptrs = input_ptr + row_start + col_offsets 17 | row_mask = col_offsets < n_cols 18 | 19 | row = tl.load(input_ptrs, mask=row_mask, other=-float('inf')) 20 | row_minus_max = row - tl.max(row, axis=0) 21 | numerator = tl.exp(row_minus_max) 22 | 
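# Subtracting the per-row maximum above keeps tl.exp from overflowing; the shift cancels out
# once the exponentials are normalized by their sum below.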
denominator = tl.sum(numerator, axis=0) 23 | softmax_output = numerator / denominator 24 | 25 | output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets 26 | tl.store(output_ptrs, softmax_output, mask=row_mask) 27 | 28 | def triton_softmax(x): 29 | n_rows, n_cols = x.shape 30 | BLOCK_SIZE = triton.next_power_of_2(n_cols) 31 | 32 | y = torch.empty_like(x) 33 | assert x.is_cuda and y.is_cuda 34 | 35 | num_warps = 4 36 | if BLOCK_SIZE >= 2048: 37 | num_warps = 8 38 | if BLOCK_SIZE >= 4096: 39 | num_warps = 16 40 | 41 | softmax_kernel[(n_rows,)]( 42 | y, x, 43 | x.stride(0), y.stride(0), 44 | n_cols, 45 | BLOCK_SIZE=BLOCK_SIZE, 46 | num_warps=num_warps 47 | ) 48 | return y 49 | 50 | x = torch.randn(10000, 1000, device='cuda') 51 | triton_result = triton_softmax(x) 52 | torch_result = torch.softmax(x, axis=1) 53 | 54 | print(f"Max error: {torch.max(torch.abs(triton_result - torch_result)):.2e}") -------------------------------------------------------------------------------- /day57/main.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | @triton.jit 5 | def fused_linear_xentropy_forward( 6 | input_ptr, weight_ptr, bias_ptr, target_ptr, loss_ptr, 7 | batch_size, in_features, out_features, 8 | stride_input_batch, stride_input_feature, 9 | stride_weight_out, stride_weight_in, 10 | stride_bias_out, 11 | BLOCK_SIZE_IN: tl.constexpr, 12 | BLOCK_SIZE_OUT: tl.constexpr, 13 | ): 14 | pid = tl.program_id(0) 15 | if pid >= batch_size: 16 | return 17 | 18 | input_row = input_ptr + pid * stride_input_batch 19 | target = tl.load(target_ptr + pid) 20 | 21 | logits = tl.zeros((BLOCK_SIZE_OUT,), dtype=tl.float32) 22 | 23 | for i in range(0, in_features, BLOCK_SIZE_IN): 24 | input_offsets = i + tl.arange(0, BLOCK_SIZE_IN) 25 | input_mask = input_offsets < in_features 26 | current_input = tl.load(input_row + input_offsets, mask=input_mask, other=0.0) 27 | 28 | weight_offsets = (i + tl.arange(0, BLOCK_SIZE_IN))[None, :] * stride_weight_in + \ 29 | tl.arange(0, BLOCK_SIZE_OUT)[:, None] * stride_weight_out 30 | weight_mask = (input_mask[None, :]) & (tl.arange(0, BLOCK_SIZE_OUT)[:, None] < out_features) 31 | current_weight = tl.load(weight_ptr + weight_offsets, mask=weight_mask, other=0.0) 32 | 33 | logits += tl.sum(current_input[None, :] * current_weight, axis=1) 34 | 35 | bias_offsets = tl.arange(0, BLOCK_SIZE_OUT) * stride_bias_out 36 | bias_mask = tl.arange(0, BLOCK_SIZE_OUT) < out_features 37 | bias = tl.load(bias_ptr + bias_offsets, mask=bias_mask, other=0.0) 38 | logits += bias 39 | 40 | max_logit = tl.max(logits, axis=0) 41 | exp_logits = tl.exp(logits - max_logit) 42 | sum_exp = tl.sum(exp_logits, axis=0) 43 | log_sum_exp = tl.log(sum_exp) 44 | log_probs = logits - max_logit - log_sum_exp 45 | 46 | target_mask = tl.arange(0, BLOCK_SIZE_OUT) == target 47 | contribution = -tl.sum(log_probs * target_mask, axis=0) 48 | tl.atomic_add(loss_ptr, contribution / batch_size) 49 | 50 | def fused_linear_cross_entropy( 51 | input: torch.Tensor, 52 | weight: torch.Tensor, 53 | bias: torch.Tensor, 54 | target: torch.Tensor 55 | ) -> torch.Tensor: 56 | assert input.is_cuda and weight.is_cuda and bias.is_cuda and target.is_cuda 57 | batch_size, in_features = input.shape 58 | out_features, _ = weight.shape 59 | 60 | loss = torch.zeros(1, device=input.device, dtype=torch.float32) 61 | 62 | BLOCK_SIZE_IN = 128 63 | BLOCK_SIZE_OUT = triton.next_power_of_2(out_features) 64 | if BLOCK_SIZE_OUT > 4096: 65 | raise 
ValueError("Too many output features for this kernel implementation") 66 | 67 | grid = (batch_size,) 68 | fused_linear_xentropy_forward[grid]( 69 | input_ptr=input, 70 | weight_ptr=weight, 71 | bias_ptr=bias, 72 | target_ptr=target, 73 | loss_ptr=loss, 74 | batch_size=batch_size, 75 | in_features=in_features, 76 | out_features=out_features, 77 | stride_input_batch=input.stride(0), 78 | stride_input_feature=input.stride(1), 79 | stride_weight_out=weight.stride(0), 80 | stride_weight_in=weight.stride(1), 81 | stride_bias_out=bias.stride(0), 82 | BLOCK_SIZE_IN=BLOCK_SIZE_IN, 83 | BLOCK_SIZE_OUT=BLOCK_SIZE_OUT, 84 | ) 85 | return loss -------------------------------------------------------------------------------- /day58/layer_norm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void layer_norm_kernel( 4 | float* output, 5 | const float* input, 6 | const float* gamma, 7 | const float* beta, 8 | int batch_size, 9 | int hidden_size, 10 | float epsilon) 11 | { 12 | extern __shared__ float shared[]; 13 | int batch_idx = blockIdx.x; 14 | int tid = threadIdx.x; 15 | 16 | if (batch_idx >= batch_size) return; 17 | 18 | float* sum = shared; 19 | float* sum_sq = &shared[blockDim.x]; 20 | 21 | float thread_sum = 0.0f; 22 | for (int i = tid; i < hidden_size; i += blockDim.x) { 23 | float val = input[batch_idx * hidden_size + i]; 24 | thread_sum += val; 25 | } 26 | sum[tid] = thread_sum; 27 | __syncthreads(); 28 | 29 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 30 | if (tid < stride) { 31 | sum[tid] += sum[tid + stride]; 32 | } 33 | __syncthreads(); 34 | } 35 | float mean = sum[0] / hidden_size; 36 | 37 | float thread_sum_sq = 0.0f; 38 | for (int i = tid; i < hidden_size; i += blockDim.x) { 39 | float val = input[batch_idx * hidden_size + i]; 40 | float diff = val - mean; 41 | thread_sum_sq += diff * diff; 42 | } 43 | sum_sq[tid] = thread_sum_sq; 44 | __syncthreads(); 45 | 46 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 47 | if (tid < stride) { 48 | sum_sq[tid] += sum_sq[tid + stride]; 49 | } 50 | __syncthreads(); 51 | } 52 | float variance = sum_sq[0] / hidden_size + epsilon; 53 | float inv_std = rsqrtf(variance); 54 | 55 | for (int i = tid; i < hidden_size; i += blockDim.x) { 56 | float val = input[batch_idx * hidden_size + i]; 57 | float normalized = (val - mean) * inv_std; 58 | output[batch_idx * hidden_size + i] = normalized * gamma[i] + beta[i]; 59 | } 60 | } 61 | 62 | void layer_norm_hip( 63 | float* output, 64 | const float* input, 65 | const float* gamma, 66 | const float* beta, 67 | int batch_size, 68 | int hidden_size, 69 | float epsilon, 70 | hipStream_t stream) 71 | { 72 | dim3 blocks(batch_size); 73 | dim3 threads(256); 74 | size_t shared_mem = 2 * threads.x * sizeof(float); 75 | 76 | hipLaunchKernelGGL( 77 | layer_norm_kernel, 78 | blocks, threads, shared_mem, stream, 79 | output, input, gamma, beta, 80 | batch_size, hidden_size, epsilon 81 | ); 82 | } -------------------------------------------------------------------------------- /day60/fused.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def _fused_skip_act_norm_dropout_kernel( 7 | input_ptr, skip_ptr, output_ptr, 8 | weight_ptr, bias_ptr, 9 | M, N, 10 | stride_input, stride_skip, stride_output, 11 | dropout_p, seed, 12 | eps, 13 | is_training, 14 | BLOCK_SIZE: tl.constexpr, 15 | ): 16 | pid = 
tl.program_id(0) 17 | 18 | offsets = pid * stride_input + tl.arange(0, BLOCK_SIZE) 19 | mask = tl.arange(0, BLOCK_SIZE) < N 20 | 21 | input = tl.load(input_ptr + offsets, mask=mask, other=0.0) 22 | skip = tl.load(skip_ptr + offsets, mask=mask, other=0.0) 23 | 24 | summed = input + skip 25 | 26 | mean = tl.sum(summed, axis=0) / N 27 | centered = summed - mean 28 | var = tl.sum(centered * centered, axis=0) / N 29 | inv_std = 1.0 / tl.sqrt(var + eps) 30 | 31 | normalized = centered * inv_std 32 | 33 | if weight_ptr is not None: 34 | weight = tl.load(weight_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=1.0) 35 | normalized *= weight 36 | if bias_ptr is not None: 37 | bias = tl.load(bias_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=0.0) 38 | normalized += bias 39 | 40 | gelu = normalized * 0.5 * (1.0 + tl.erf(normalized / tl.sqrt(2.0))) 41 | 42 | if is_training: 43 | dropout_mask = tl.rand(seed, offsets) > dropout_p 44 | gelu = tl.where(dropout_mask, gelu / (1 - dropout_p), 0.0) 45 | 46 | tl.store(output_ptr + offsets, gelu, mask=mask) 47 | 48 | class FusedSkipNormActDropout(torch.autograd.Function): 49 | @staticmethod 50 | def forward(ctx, input, skip, weight, bias, p, training, eps): 51 | assert input.shape == skip.shape 52 | M, N = input.shape 53 | output = torch.empty_like(input) 54 | 55 | BLOCK_SIZE = triton.next_power_of_2(N) 56 | 57 | seed = torch.randint(0, 2**31, (1,)).item() 58 | 59 | grid = (M,) 60 | _fused_skip_act_norm_dropout_kernel[grid]( 61 | input, skip, output, 62 | weight if weight is not None else None, 63 | bias if bias is not None else None, 64 | M, N, 65 | input.stride(0), skip.stride(0), output.stride(0), 66 | dropout_p=p, 67 | seed=seed, 68 | eps=eps, 69 | is_training=training, 70 | BLOCK_SIZE=BLOCK_SIZE, 71 | ) 72 | 73 | ctx.training = training 74 | ctx.p = p 75 | ctx.eps = eps 76 | ctx.save_for_backward(input, skip, weight, bias, output) 77 | 78 | return output 79 | 80 | @staticmethod 81 | def backward(ctx, grad_output): 82 | raise NotImplementedError("Backward not implemented for this fused operation") 83 | 84 | def fused_skip_norm_act_dropout( 85 | input: torch.Tensor, 86 | skip: torch.Tensor, 87 | weight: torch.Tensor = None, 88 | bias: torch.Tensor = None, 89 | p: float = 0.5, 90 | training: bool = False, 91 | eps: float = 1e-5 92 | ) -> torch.Tensor: 93 | return FusedSkipNormActDropout.apply(input, skip, weight, bias, p, training, eps) -------------------------------------------------------------------------------- /day64/main.py: -------------------------------------------------------------------------------- 1 | import torch, time 2 | import torch.nn.functional as F 3 | import triton 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def geglu_kernel(input_ptr, output_ptr, numel: tl.constexpr, D: tl.constexpr, BLOCK_SIZE: tl.constexpr): 8 | pid = tl.program_id(0) 9 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 10 | mask = offsets < numel 11 | 12 | row = offsets // D 13 | col = offsets % D 14 | base_offset = row * (2 * D) 15 | x = tl.load(input_ptr + base_offset + col, mask=mask) 16 | gate = tl.load(input_ptr + base_offset + D + col, mask=mask) 17 | 18 | t = 0.7978845608 * (gate + 0.044715 * gate * gate * gate) 19 | exp_2t = tl.exp(2 * t) 20 | tanh_t = (exp_2t - 1.0) / (exp_2t + 1.0) 21 | gelu_gate = 0.5 * gate * (1.0 + tanh_t) 22 | out = x * gelu_gate 23 | 24 | tl.store(output_ptr + offsets, out, mask=mask) 25 | 26 | def fused_geglu(input_tensor): 27 | N, twoD = input_tensor.shape 28 | D = twoD // 2 29 | output = torch.empty((N, D), 
device=input_tensor.device, dtype=input_tensor.dtype) 30 | numel = N * D 31 | BLOCK_SIZE = 256 32 | grid = lambda meta: ((numel + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],) 33 | geglu_kernel[grid](input_tensor, output, numel, D, BLOCK_SIZE) 34 | return output 35 | 36 | def torch_geglu(input_tensor): 37 | x, gate = input_tensor.chunk(2, dim=-1) 38 | return x * F.gelu(gate) 39 | 40 | input_tensor = torch.randn(8192, 8192, device='cuda') 41 | 42 | _ = fused_geglu(input_tensor) 43 | _ = torch_geglu(input_tensor) 44 | 45 | torch.cuda.synchronize() 46 | start = time.time() 47 | for _ in range(100): 48 | _ = fused_geglu(input_tensor) 49 | torch.cuda.synchronize() 50 | fused_time = time.time() - start 51 | 52 | torch.cuda.synchronize() 53 | start = time.time() 54 | for _ in range(100): 55 | _ = torch_geglu(input_tensor) 56 | torch.cuda.synchronize() 57 | torch_time = time.time() - start 58 | 59 | print("Fused Triton kernel time: {:.6f} sec".format(fused_time)) 60 | print("Torch baseline time: {:.6f} sec".format(torch_time)) 61 | -------------------------------------------------------------------------------- /day67/lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def lora_kernel( 7 | y_ptr, x_ptr, w_ptr, a_ptr, b_ptr, 8 | M, N, K, R, 9 | stride_ym, stride_yn, 10 | stride_xm, stride_xk, 11 | stride_wk, stride_wn, 12 | stride_ak, stride_ar, 13 | stride_br, stride_bn, 14 | BLOCK_SIZE_M: tl.constexpr, 15 | BLOCK_SIZE_N: tl.constexpr, 16 | BLOCK_SIZE_K: tl.constexpr, 17 | BLOCK_SIZE_R: tl.constexpr, 18 | ): 19 | pid = tl.program_id(0) 20 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) 21 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) 22 | pid_m = pid // num_pid_n 23 | pid_n = pid % num_pid_n 24 | 25 | offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) 26 | offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) 27 | offs_k = tl.arange(0, BLOCK_SIZE_K) 28 | offs_r = tl.arange(0, BLOCK_SIZE_R) 29 | 30 | y_ptrs = y_ptr + offs_m[:, None] * stride_ym + offs_n[None, :] * stride_yn 31 | mask_m = (offs_m < M)[:, None] 32 | mask_n = (offs_n < N)[None, :] 33 | mask_y = mask_m & mask_n 34 | 35 | acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) 36 | 37 | for k in range(0, K, BLOCK_SIZE_K): 38 | x_ptrs = x_ptr + offs_m[:, None] * stride_xm + (k + offs_k)[None, :] * stride_xk 39 | mask_x = (offs_m < M)[:, None] & ((k + offs_k) < K)[None, :] 40 | x = tl.load(x_ptrs, mask=mask_x, other=0.0) 41 | 42 | w_ptrs = w_ptr + (k + offs_k)[:, None] * stride_wk + offs_n[None, :] * stride_wn 43 | mask_w = ((k + offs_k) < K)[:, None] & (offs_n < N)[None, :] 44 | w = tl.load(w_ptrs, mask=mask_w, other=0.0) 45 | 46 | ab = tl.zeros((BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=tl.float32) 47 | for r in range(0, R, BLOCK_SIZE_R): 48 | a_ptrs = a_ptr + (k + offs_k)[:, None] * stride_ak + (r + offs_r)[None, :] * stride_ar 49 | mask_a = ((k + offs_k) < K)[:, None] & ((r + offs_r) < R)[None, :] 50 | a = tl.load(a_ptrs, mask=mask_a, other=0.0) 51 | 52 | b_ptrs = b_ptr + (r + offs_r)[:, None] * stride_br + offs_n[None, :] * stride_bn 53 | mask_b = ((r + offs_r) < R)[:, None] & (offs_n < N)[None, :] 54 | b = tl.load(b_ptrs, mask=mask_b, other=0.0) 55 | 56 | ab += tl.dot(a.to(tl.float32), b.to(tl.float32)) 57 | 58 | w_eff = w.to(tl.float32) + ab 59 | acc += tl.dot(x.to(tl.float32), w_eff) 60 | 61 | tl.store(y_ptrs, acc.to(tl.float16), mask=mask_y) 62 | 63 | def lora_matmul(x, W, A, B): 64 | M, K = x.shape 65 
| _, N = W.shape 66 | R = A.shape[1] 67 | y = torch.empty((M, N), device=x.device, dtype=x.dtype) 68 | 69 | BLOCK_SIZE_M = 64 70 | BLOCK_SIZE_N = 64 71 | BLOCK_SIZE_K = 32 72 | BLOCK_SIZE_R = 32 73 | 74 | grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N, meta['BLOCK_SIZE_N']),) 75 | 76 | lora_kernel[grid]( 77 | y, x, W, A, B, 78 | M, N, K, R, 79 | y.stride(0), y.stride(1), 80 | x.stride(0), x.stride(1), 81 | W.stride(0), W.stride(1), 82 | A.stride(0), A.stride(1), 83 | B.stride(0), B.stride(1), 84 | BLOCK_SIZE_M=BLOCK_SIZE_M, 85 | BLOCK_SIZE_N=BLOCK_SIZE_N, 86 | BLOCK_SIZE_K=BLOCK_SIZE_K, 87 | BLOCK_SIZE_R=BLOCK_SIZE_R, 88 | ) 89 | return y 90 | -------------------------------------------------------------------------------- /day68/adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def adam_fp8_kernel( 7 | param_ptr, grad_ptr, m_ptr, v_ptr, lr_ptr, 8 | beta1, beta2, eps, step, 9 | BLOCK_SIZE: tl.constexpr 10 | ): 11 | pid = tl.program_id(axis=0) 12 | offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 13 | mask = offset < tl.numel(param_ptr) 14 | 15 | param = tl.load(param_ptr + offset, mask=mask, other=0.0).to(tl.float16) 16 | grad = tl.load(grad_ptr + offset, mask=mask, other=0.0).to(tl.float16) 17 | m = tl.load(m_ptr + offset, mask=mask, other=0.0).to(tl.float16) 18 | v = tl.load(v_ptr + offset, mask=mask, other=0.0).to(tl.float16) 19 | lr = tl.load(lr_ptr + offset, mask=mask, other=0.0).to(tl.float16) 20 | 21 | m_new = beta1 * m + (1 - beta1) * grad 22 | v_new = beta2 * v + (1 - beta2) * grad * grad 23 | m_hat = m_new / (1 - beta1 ** step) 24 | v_hat = v_new / (1 - beta2 ** step) 25 | update = m_hat / (tl.sqrt(v_hat) + eps) 26 | param_new = param - lr * update 27 | 28 | param_new_fp8 = param_new.to(tl.float8_e4m3) 29 | m_new_fp8 = m_new.to(tl.float8_e4m3) 30 | v_new_fp8 = v_new.to(tl.float8_e4m3) 31 | 32 | tl.store(param_ptr + offset, param_new_fp8, mask=mask) 33 | tl.store(m_ptr + offset, m_new_fp8, mask=mask) 34 | tl.store(v_ptr + offset, v_new_fp8, mask=mask) 35 | 36 | def adam_fp8(param, grad, m, v, lr, beta1=0.9, beta2=0.999, eps=1e-8, step=1): 37 | BLOCK_SIZE = 1024 38 | n = param.numel() 39 | grid = lambda meta: (triton.cdiv(n, meta['BLOCK_SIZE']),) 40 | adam_fp8_kernel[grid]( 41 | param, grad, m, v, lr, 42 | beta1, beta2, eps, step, 43 | BLOCK_SIZE=BLOCK_SIZE 44 | ) 45 | -------------------------------------------------------------------------------- /day69/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def reduce_kernel(K, V, A_ptr, b_ptr, N: tl.constexpr, D: tl.constexpr): 7 | acc_A = tl.zeros([D, D], dtype=tl.float32) 8 | acc_b = tl.zeros([D], dtype=tl.float32) 9 | for j in range(N): 10 | k = tl.load(K + j * D) 11 | v = tl.load(V + j * D) 12 | k_phi = tl.relu(k) + 1.0 13 | for i in range(D): 14 | for jj in range(D): 15 | acc_A[i, jj] += k_phi[i] * v[jj] 16 | acc_b += k_phi 17 | tl.store(A_ptr, acc_A) 18 | tl.store(b_ptr, acc_b) 19 | 20 | @triton.jit 21 | def attention_kernel(Q, A_ptr, b_ptr, Out, N: tl.constexpr, D: tl.constexpr): 22 | pid = tl.program_id(0) 23 | q = tl.load(Q + pid * D) 24 | q_phi = tl.relu(q) + 1.0 25 | out_vec = tl.zeros([D], dtype=tl.float32) 26 | for i in range(D): 27 | a_row = tl.load(A_ptr + i * D) 28 | out_vec[i] = tl.dot(q_phi, a_row) 29 | denom = tl.dot(q_phi, 
tl.load(b_ptr)) 30 | tl.store(Out + pid * D, out_vec / denom) 31 | 32 | def linear_attention(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor: 33 | assert Q.is_cuda and K.is_cuda and V.is_cuda, "Input tensors must be on CUDA" 34 | N, D = Q.shape 35 | A = torch.empty((D, D), device='cuda', dtype=torch.float32) 36 | b = torch.empty((D,), device='cuda', dtype=torch.float32) 37 | reduce_kernel[(1,)](K, V, A, b, N, D) 38 | Out = torch.empty_like(Q) 39 | attention_kernel[(N,)](Q, A, b, Out, N, D) 40 | return Out 41 | 42 | if __name__ == "__main__": 43 | N = 1024 44 | D = 64 45 | Q = torch.randn((N, D), device='cuda', dtype=torch.float32) 46 | K = torch.randn((N, D), device='cuda', dtype=torch.float32) 47 | V = torch.randn((N, D), device='cuda', dtype=torch.float32) 48 | Out = linear_attention(Q, K, V) 49 | print("Output shape:", Out.shape) 50 | -------------------------------------------------------------------------------- /day72/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def sgd_kernel( 7 | param_ptr, 8 | grad_ptr, 9 | momentum_ptr, 10 | lr, 11 | weight_decay, 12 | momentum_factor, 13 | dampening, 14 | nesterov, 15 | n_elements, 16 | BLOCK_SIZE: tl.constexpr, 17 | ): 18 | pid = tl.program_id(axis=0) 19 | block_start = pid * BLOCK_SIZE 20 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 21 | mask = offsets < n_elements 22 | params = tl.load(param_ptr + offsets, mask=mask) 23 | grads = tl.load(grad_ptr + offsets, mask=mask) 24 | if weight_decay != 0.0: 25 | grads = grads + weight_decay * params 26 | if momentum_factor != 0.0: 27 | momentum_buf = tl.load(momentum_ptr + offsets, mask=mask) 28 | momentum_buf = momentum_factor * momentum_buf + (1.0 - dampening) * grads 29 | tl.store(momentum_ptr + offsets, momentum_buf, mask=mask) 30 | if nesterov: 31 | grads = grads + momentum_factor * momentum_buf 32 | else: 33 | grads = momentum_buf 34 | params = params - lr * grads 35 | tl.store(param_ptr + offsets, params, mask=mask) 36 | 37 | def sgd_update( 38 | params, 39 | grads, 40 | momentum_buffer=None, 41 | lr=0.01, 42 | weight_decay=0.0, 43 | momentum=0.0, 44 | dampening=0.0, 45 | nesterov=False, 46 | ): 47 | n_elements = params.numel() 48 | if momentum != 0.0 and momentum_buffer is None: 49 | momentum_buffer = torch.zeros_like(params) 50 | BLOCK_SIZE = 1024 51 | grid = (n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE 52 | sgd_kernel[grid, 1]( 53 | params.data_ptr(), 54 | grads.data_ptr(), 55 | momentum_buffer.data_ptr() if momentum != 0.0 else 0, 56 | lr, 57 | weight_decay, 58 | momentum, 59 | dampening, 60 | 1 if nesterov else 0, 61 | n_elements, 62 | BLOCK_SIZE, 63 | ) 64 | return params, momentum_buffer 65 | 66 | def example(): 67 | params = torch.randn(10000, device='cuda') 68 | grads = torch.randn(10000, device='cuda') 69 | momentum_buffer = torch.zeros_like(params) 70 | updated_params, updated_momentum = sgd_update( 71 | params, 72 | grads, 73 | momentum_buffer, 74 | lr=0.01, 75 | weight_decay=0.0001, 76 | momentum=0.9, 77 | nesterov=True 78 | ) 79 | print(f"Updated {params.shape} parameters using Triton SGD kernel") 80 | 81 | if __name__ == "__main__": 82 | example() 83 | -------------------------------------------------------------------------------- /day73/code.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import numpy as np 4 | 5 | @triton.jit 6 | 
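# Deterministic DDIM update (eta = 0): first recover x0_hat = (x_t - sqrt(1 - alpha_t) * eps) / sqrt(alpha_t),
# then step back with x_{t-1} = sqrt(alpha_t_prev) * x0_hat + sqrt(1 - alpha_t_prev) * eps.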
def ddim_step_kernel( 7 | x_ptr, 8 | eps_ptr, 9 | out_ptr, 10 | alpha_t: tl.constexpr, 11 | alpha_t_prev: tl.constexpr, 12 | n_elements: tl.constexpr, 13 | BLOCK_SIZE: tl.constexpr = 1024 14 | ): 15 | pid = tl.program_id(0) 16 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 17 | mask = offsets < n_elements 18 | 19 | x = tl.load(x_ptr + offsets, mask=mask) 20 | eps = tl.load(eps_ptr + offsets, mask=mask) 21 | 22 | sqrt_alpha_t = tl.sqrt(alpha_t) 23 | sqrt_one_minus_alpha_t = tl.sqrt(1 - alpha_t) 24 | sqrt_alpha_t_prev = tl.sqrt(alpha_t_prev) 25 | sqrt_one_minus_alpha_t_prev = tl.sqrt(1 - alpha_t_prev) 26 | 27 | x0 = (x - sqrt_one_minus_alpha_t * eps) / sqrt_alpha_t 28 | new_x = sqrt_alpha_t_prev * x0 + sqrt_one_minus_alpha_t_prev * eps 29 | 30 | tl.store(out_ptr + offsets, new_x, mask=mask) 31 | 32 | def ddim_sampling_step(x: np.ndarray, eps: np.ndarray, alpha_t: float, alpha_t_prev: float): 33 | x = np.ascontiguousarray(x.astype(np.float32)) 34 | eps = np.ascontiguousarray(eps.astype(np.float32)) 35 | out = np.empty_like(x) 36 | 37 | n_elements = x.size 38 | grid = (triton.cdiv(n_elements, 1024),) 39 | 40 | ddim_step_kernel[grid]( 41 | x_ptr=x, 42 | eps_ptr=eps, 43 | out_ptr=out, 44 | alpha_t=alpha_t, 45 | alpha_t_prev=alpha_t_prev, 46 | n_elements=n_elements, 47 | BLOCK_SIZE=1024 48 | ) 49 | return out 50 | 51 | if __name__ == '__main__': 52 | N = 4096 53 | x = np.random.randn(N).astype(np.float32) 54 | eps = np.random.randn(N).astype(np.float32) 55 | alpha_t = 0.9 56 | alpha_t_prev = 0.85 57 | 58 | x_prev = ddim_sampling_step(x, eps, alpha_t, alpha_t_prev) 59 | print("Updated sample:", x_prev) 60 | -------------------------------------------------------------------------------- /day74/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def relu_device_fn(x): 7 | return tl.maximum(0.0, x) 8 | 9 | @triton.jit 10 | def swish_device_fn(x): 11 | return x * tl.sigmoid(x) 12 | 13 | @triton.jit 14 | def gelu_device_fn(x): 15 | return 0.5 * x * (1.0 + tl.tanh(0.7978845608 * (x + 0.044715 * x * x * x))) 16 | 17 | def create_activation_kernel(device_fn): 18 | @triton.jit 19 | def kernel(x_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): 20 | pid = tl.program_id(axis=0) 21 | block_start = pid * BLOCK_SIZE 22 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 23 | mask = offsets < n_elements 24 | x = tl.load(x_ptr + offsets, mask=mask) 25 | output = device_fn(x) 26 | tl.store(output_ptr + offsets, output, mask=mask) 27 | return kernel 28 | 29 | def create_activation_function(kernel, name): 30 | @triton.autotune( 31 | configs=[ 32 | triton.Config({'BLOCK_SIZE': 128}), 33 | triton.Config({'BLOCK_SIZE': 256}), 34 | triton.Config({'BLOCK_SIZE': 512}), 35 | triton.Config({'BLOCK_SIZE': 1024}), 36 | ], 37 | key=['n_elements'], 38 | ) 39 | def activation_fn(x): 40 | n_elements = x.numel() 41 | output = torch.empty_like(x) 42 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 43 | kernel[grid]( 44 | x.data_ptr(), 45 | output.data_ptr(), 46 | n_elements, 47 | ) 48 | return output 49 | activation_fn.__name__ = name 50 | return activation_fn 51 | 52 | relu_kernel = create_activation_kernel(relu_device_fn) 53 | swish_kernel = create_activation_kernel(swish_device_fn) 54 | gelu_kernel = create_activation_kernel(gelu_device_fn) 55 | 56 | relu = create_activation_function(relu_kernel, "relu") 57 | swish = create_activation_function(swish_kernel, 
"swish") 58 | gelu = create_activation_function(gelu_kernel, "gelu") 59 | 60 | def example(): 61 | x = torch.randn(1024, 1024, device='cuda') 62 | y_relu = relu(x) 63 | y_swish = swish(x) 64 | y_gelu = gelu(x) 65 | print(f"Input shape: {x.shape}") 66 | print(f"ReLU output shape: {y_relu.shape}") 67 | print(f"Swish output shape: {y_swish.shape}") 68 | print(f"GELU output shape: {y_gelu.shape}") 69 | torch_relu = torch.nn.functional.relu(x) 70 | torch_gelu = torch.nn.functional.gelu(x) 71 | torch_swish = torch.nn.functional.silu(x) 72 | print(f"ReLU max error: {(y_relu - torch_relu).abs().max().item()}") 73 | print(f"Swish max error: {(y_swish - torch_swish).abs().max().item()}") 74 | print(f"GELU max error: {(y_gelu - torch_gelu).abs().max().item()}") 75 | 76 | if __name__ == "__main__": 77 | example() 78 | -------------------------------------------------------------------------------- /day76/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def moe_kernel( 7 | input_ptr, 8 | gate_weight_ptr, 9 | experts_ptr, 10 | output_ptr, 11 | num_tokens, 12 | hidden_size, 13 | num_experts, 14 | top_k, 15 | input_token_stride, 16 | input_hidden_stride, 17 | expert_stride, 18 | expert_hidden_stride, 19 | BLOCK_SIZE: tl.constexpr, 20 | ): 21 | token_idx = tl.program_id(0) 22 | if token_idx >= num_tokens: 23 | return 24 | 25 | input_offset = token_idx * input_token_stride 26 | input = tl.load(input_ptr + input_offset + tl.arange(0, BLOCK_SIZE) * input_hidden_stride, 27 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 28 | 29 | gate_logits = tl.zeros((num_experts,), dtype=tl.float32) 30 | for expert in range(num_experts): 31 | gate_w = tl.load(gate_weight_ptr + expert * hidden_size + tl.arange(0, BLOCK_SIZE), 32 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 33 | logit = tl.sum(input * gate_w) 34 | gate_logits = tl.store(gate_logits + expert, logit) 35 | 36 | max_logit = tl.max(gate_logits) 37 | exp_logits = tl.exp(gate_logits - max_logit) 38 | sum_exp = tl.sum(exp_logits) 39 | probs = exp_logits / sum_exp 40 | 41 | topk_values, topk_indices = tl.topk(probs, top_k) 42 | 43 | output = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) 44 | for i in range(top_k): 45 | expert_idx = topk_indices[i] 46 | weight = topk_values[i] 47 | 48 | expert_offset = expert_idx * expert_stride 49 | expert_output = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) 50 | for j in range(hidden_size): 51 | w = tl.load(experts_ptr + expert_offset + j * expert_hidden_stride + tl.arange(0, BLOCK_SIZE), 52 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 53 | expert_output += input[j] * w 54 | 55 | output += weight * expert_output 56 | 57 | tl.store(output_ptr + token_idx * input_token_stride + tl.arange(0, BLOCK_SIZE) * input_hidden_stride, 58 | output, mask=tl.arange(0, BLOCK_SIZE) < hidden_size) 59 | 60 | def moe_layer(input: torch.Tensor, gate: torch.Tensor, experts: torch.Tensor, top_k: int): 61 | assert experts.shape[0] >= top_k, "Number of experts must be >= top_k" 62 | output = torch.empty_like(input) 63 | hidden_size = input.size(1) 64 | num_tokens = input.size(0) 65 | num_experts = gate.size(1) 66 | 67 | # Ensure block size is a power of two for optimal performance 68 | BLOCK_SIZE = triton.next_power_of_2(hidden_size) 69 | if BLOCK_SIZE > 4096: 70 | BLOCK_SIZE = 4096 71 | 72 | moe_kernel[(num_tokens,)]( 73 | input_ptr=input, 74 | gate_weight_ptr=gate, 75 | experts_ptr=experts, 76 
| output_ptr=output, 77 | num_tokens=num_tokens, 78 | hidden_size=hidden_size, 79 | num_experts=num_experts, 80 | top_k=top_k, 81 | input_token_stride=input.stride(0), 82 | input_hidden_stride=input.stride(1), 83 | expert_stride=experts.stride(0), 84 | expert_hidden_stride=experts.stride(2), 85 | BLOCK_SIZE=BLOCK_SIZE, 86 | ) 87 | return output -------------------------------------------------------------------------------- /day77/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | import time 5 | 6 | # A simplified RetNet kernel: a decaying cumulative sum. 7 | # Given an input sequence x and a decay factor alpha, 8 | # it computes: y[0] = x[0] and for i>0, y[i] = x[i] + alpha * y[i-1] 9 | # Note: This kernel assumes the sequence length (N) is known at compile-time. 10 | @triton.jit 11 | def retnet_kernel(x_ptr, y_ptr, N: tl.constexpr, alpha: tl.constexpr): 12 | # we use a single program (grid = (1,)) to process the full sequence sequentially. 13 | acc = tl.zeros([1], dtype=tl.float32) 14 | # Process each element in sequence. 15 | for i in range(N): 16 | # Load the i-th element from input. 17 | x_val = tl.load(x_ptr + i) 18 | # Compute the recurrent relation. 19 | acc = x_val + alpha * acc 20 | # Store the result. 21 | tl.store(y_ptr + i, acc) 22 | 23 | # A CPU reference implementation for testing correctness and timing. 24 | def retnet_cpu(x, alpha): 25 | y = torch.empty_like(x) 26 | acc = 0.0 27 | for i in range(x.shape[0]): 28 | acc = x[i].item() + alpha * acc 29 | y[i] = acc 30 | return y 31 | 32 | def main(): 33 | # Parameters 34 | N = 1024 # Sequence length (must match the kernel compile-time constant) 35 | alpha = 0.9 36 | # Create a random input tensor on the GPU. 37 | x = torch.randn(N, device='cuda', dtype=torch.float32) 38 | y = torch.empty_like(x) 39 | 40 | # Define a grid that launches one program instance (since the kernel is sequential). 41 | grid = lambda meta: (1,) 42 | 43 | # Warm-up: launch the kernel once to compile and warm up. 44 | retnet_kernel[grid](x, y, N, alpha) 45 | torch.cuda.synchronize() 46 | 47 | # Time the Triton kernel using CUDA events. 48 | start_event = torch.cuda.Event(enable_timing=True) 49 | end_event = torch.cuda.Event(enable_timing=True) 50 | start_event.record() 51 | retnet_kernel[grid](x, y, N, alpha) 52 | end_event.record() 53 | torch.cuda.synchronize() 54 | triton_time = start_event.elapsed_time(end_event) # milliseconds 55 | 56 | # Run the CPU version for comparison. 57 | x_cpu = x.cpu() 58 | start = time.time() 59 | y_cpu = retnet_cpu(x_cpu, alpha) 60 | cpu_time = (time.time() - start) * 1000 # convert to ms 61 | 62 | # Verify correctness. 
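The sequential recurrence above, y[i] = x[i] + alpha * y[i-1], can also be cross-checked against its closed form y[i] = sum over j <= i of alpha^(i-j) * x[j]. Below is a minimal sketch of that check, assuming a dense (N, N) decay matrix is acceptable for N = 1024; the helper name `retnet_closed_form` is an illustrative assumption, not part of the original file.

```python
import torch

def retnet_closed_form(x: torch.Tensor, alpha: float) -> torch.Tensor:
    # y[i] = sum_{j <= i} alpha**(i - j) * x[j]: one matmul with a
    # lower-triangular decay matrix instead of the sequential loop.
    n = x.shape[0]
    i = torch.arange(n, device=x.device).unsqueeze(1)   # (N, 1) row index
    j = torch.arange(n, device=x.device).unsqueeze(0)   # (1, N) column index
    exponent = (i - j).clamp(min=0).float()             # avoid negative powers
    decay = torch.where(j <= i, alpha ** exponent,
                        torch.zeros((), device=x.device))
    return decay @ x

# e.g. torch.allclose(retnet_closed_form(x, alpha), y, atol=1e-4)
```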
63 | y_ref = y_cpu.to(device='cuda') 64 | if torch.allclose(y, y_ref, atol=1e-5): 65 | print("Results match.") 66 | else: 67 | print("Results differ!") 68 | 69 | print("Triton kernel time (ms):", triton_time) 70 | print("CPU cumulative sum time (ms):", cpu_time) 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /day79/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def quantize_kernel(input_ptr, output_ptr, n_elements, scale, BLOCK_SIZE: tl.constexpr): 7 | 8 | pid = tl.program_id(0) 9 | block_start = pid * BLOCK_SIZE 10 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 11 | 12 | mask = offsets < n_elements 13 | 14 | x = tl.load(input_ptr + offsets, mask=mask) 15 | 16 | x_scaled = x * scale 17 | 18 | x_rounded = tl.round(x_scaled) 19 | 20 | x_clamped = tl.max(tl.min(x_rounded, 127), -128) 21 | 22 | tl.store(output_ptr + offsets, tl.cast(x_clamped, tl.int8), mask=mask) 23 | 24 | def quantize(input_tensor, scale): 25 | 26 | assert input_tensor.is_cuda, "Input tensor must be on a CUDA device" 27 | n_elements = input_tensor.numel() 28 | output_tensor = torch.empty_like(input_tensor, dtype=torch.int8) 29 | 30 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 31 | quantize_kernel[grid](input_tensor, output_tensor, n_elements, scale, BLOCK_SIZE=1024) 32 | 33 | return output_tensor 34 | 35 | if __name__ == '__main__': 36 | 37 | input_tensor = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32) 38 | scale = 127.0 39 | output_tensor = quantize(input_tensor, scale) 40 | print("Quantization complete. Output tensor:") 41 | print(output_tensor) 42 | -------------------------------------------------------------------------------- /day80/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def rwkv_kernel( 7 | output_ptr, 8 | k_ptr, 9 | v_ptr, 10 | w_ptr, 11 | n_time: tl.constexpr, 12 | n_channels: tl.constexpr, 13 | stride_time: tl.constexpr, 14 | stride_batch: tl.constexpr 15 | ): 16 | pid = tl.program_id(0) 17 | batch = pid // n_channels 18 | channel = pid % n_channels 19 | 20 | w = tl.load(w_ptr + channel) 21 | 22 | max_val = -1e30 23 | numerator = 0.0 24 | denominator = 0.0 25 | 26 | for t in range(n_time): 27 | offset = batch * stride_batch + t * stride_time + channel 28 | 29 | cur_k = tl.load(k_ptr + offset) 30 | cur_v = tl.load(v_ptr + offset) 31 | 32 | m = tl.maximum(max_val, cur_k) 33 | 34 | exp_max_diff = tl.exp(max_val - m) 35 | exp_k_diff = tl.exp(cur_k - m) 36 | 37 | numerator = numerator * exp_max_diff + cur_v * exp_k_diff 38 | denominator = denominator * exp_max_diff + exp_k_diff 39 | 40 | result = numerator / denominator 41 | tl.store(output_ptr + offset, result) 42 | 43 | max_val = m + w 44 | 45 | def rwkv_forward(k: torch.Tensor, v: torch.Tensor, w: torch.Tensor) -> torch.Tensor: 46 | assert k.is_cuda and v.is_cuda and w.is_cuda, "All tensors must be on CUDA." 
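For checking `rwkv_forward`, a plain-PyTorch reference that mirrors the streaming max-trick recurrence of `rwkv_kernel` above can be useful: it is vectorised over batch and channel, with an explicit Python loop over time. The function name `rwkv_forward_reference` is an illustrative assumption; it is a sketch for verification, not part of the original file.

```python
import torch

def rwkv_forward_reference(k: torch.Tensor, v: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Mirrors the per-(batch, channel) recurrence of rwkv_kernel, step by step.
    B, T, C = k.shape
    out = torch.empty_like(v)
    max_val = torch.full((B, C), -1e30, device=k.device, dtype=k.dtype)
    num = torch.zeros((B, C), device=k.device, dtype=k.dtype)
    den = torch.zeros((B, C), device=k.device, dtype=k.dtype)
    for t in range(T):
        cur_k, cur_v = k[:, t, :], v[:, t, :]
        m = torch.maximum(max_val, cur_k)          # running stabiliser
        exp_max = torch.exp(max_val - m)
        exp_k = torch.exp(cur_k - m)
        num = num * exp_max + cur_v * exp_k
        den = den * exp_max + exp_k
        out[:, t, :] = num / den
        max_val = m + w                            # w (C,) broadcasts over (B, C)
    return out

# e.g. torch.allclose(rwkv_forward(k, v, w), rwkv_forward_reference(k, v, w), atol=1e-5)
```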
47 | B, T, C = k.shape 48 | 49 | output = torch.empty_like(v) 50 | 51 | stride_time = k.stride(1) 52 | stride_batch = k.stride(0) 53 | 54 | grid = (B * C,) 55 | 56 | rwkv_kernel[grid]( 57 | output_ptr=output, 58 | k_ptr=k, 59 | v_ptr=v, 60 | w_ptr=w, 61 | n_time=T, 62 | n_channels=C, 63 | stride_time=stride_time, 64 | stride_batch=stride_batch, 65 | ) 66 | return output 67 | 68 | if __name__ == '__main__': 69 | B = 2 # batch size 70 | T = 128 # sequence length 71 | C = 256 # number of channels 72 | 73 | k_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32) 74 | v_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32) 75 | w_tensor = torch.randn(C, device='cuda', dtype=torch.float32) * 0.1 76 | 77 | output_tensor = rwkv_forward(k_tensor, v_tensor, w_tensor) 78 | print("Output shape:", output_tensor.shape) 79 | print("Output sample:", output_tensor[0, :5, :5]) 80 | -------------------------------------------------------------------------------- /day81/main.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | 4 | @triton.jit 5 | def fused_layernorm_ff_dropout_kernel( 6 | x_ptr, out_ptr, 7 | gamma_ptr, beta_ptr, 8 | weight1_ptr, bias1_ptr, 9 | weight2_ptr, bias2_ptr, 10 | seed, 11 | dropout_p: tl.constexpr, 12 | N: tl.constexpr, 13 | M: tl.constexpr, 14 | BLOCK: tl.constexpr 15 | ): 16 | row_idx = tl.program_id(0) 17 | row_offset = row_idx * N 18 | 19 | x = tl.load(x_ptr + row_offset + tl.arange(0, N)) 20 | mean = tl.sum(x, axis=0) / N 21 | diff = x - mean 22 | var = tl.sum(diff * diff, axis=0) / N 23 | norm = diff * tl.rsqrt(var + 1e-5) 24 | 25 | gamma = tl.load(gamma_ptr + tl.arange(0, N)) 26 | beta = tl.load(beta_ptr + tl.arange(0, N)) 27 | norm = norm * gamma + beta 28 | 29 | hidden = tl.zeros([M], dtype=x.dtype) 30 | for i in range(0, N, BLOCK): 31 | block_range = i + tl.arange(0, BLOCK) 32 | norm_block = norm[block_range] 33 | weight1_block = tl.load( 34 | weight1_ptr + i * M + tl.arange(0, BLOCK)[:, None] * M + tl.arange(0, M), 35 | mask=(i + tl.arange(0, BLOCK))[:, None] < N, other=0.0 36 | ) 37 | hidden += tl.dot(norm_block, weight1_block) 38 | 39 | bias1 = tl.load(bias1_ptr + tl.arange(0, M)) 40 | hidden += bias1 41 | 42 | SQRT_2_OVER_PI = 0.7978845608028654 43 | gelu_hidden = 0.5 * hidden * (1.0 + tl.tanh(SQRT_2_OVER_PI * (hidden + 0.044715 * hidden * hidden * hidden))) 44 | 45 | prng = tl.arange(0, M) + row_idx * M + seed 46 | rand_vals = ((1103515245 * prng + 12345) & 0x7fffffff) / 2147483647.0 47 | dropout_mask = rand_vals > dropout_p 48 | dropout_scale = 1.0 / (1.0 - dropout_p) 49 | dropped = gelu_hidden * dropout_mask * dropout_scale 50 | 51 | out = tl.zeros([N], dtype=x.dtype) 52 | for j in range(0, M, BLOCK): 53 | block_range = j + tl.arange(0, BLOCK) 54 | dropped_block = dropped[block_range] 55 | weight2_block = tl.load( 56 | weight2_ptr + j * N + tl.arange(0, BLOCK)[:, None] * N + tl.arange(0, N), 57 | mask=(j + tl.arange(0, BLOCK))[:, None] < M, other=0.0 58 | ) 59 | out += tl.dot(dropped_block, weight2_block) 60 | 61 | bias2 = tl.load(bias2_ptr + tl.arange(0, N)) 62 | out += bias2 63 | 64 | tl.store(out_ptr + row_offset + tl.arange(0, N), out) 65 | -------------------------------------------------------------------------------- /day82/rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | import math 5 | 6 | @triton.jit 7 | def rope_kernel(q_ptr, cos_ptr, 
sin_ptr, stride_q0, stride_q1, stride_cos0, stride_cos1, seq_len: tl.constexpr, head_half: tl.constexpr, BLOCK_SEQ: tl.constexpr, BLOCK_HD: tl.constexpr): 8 | 9 | pid_seq = tl.program_id(0) 10 | pid_hd = tl.program_id(1) 11 | 12 | seq_offset = pid_seq * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ) 13 | hd_offset = pid_hd * BLOCK_HD + tl.arange(0, BLOCK_HD) 14 | 15 | mask_seq = seq_offset < seq_len 16 | mask_hd = hd_offset < head_half 17 | 18 | q_ptrs = q_ptr + seq_offset[:, None] * stride_q0 + hd_offset[None, :] * (2 * stride_q1) 19 | 20 | q0 = tl.load(q_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 21 | q1 = tl.load(q_ptrs + stride_q1, mask=mask_seq[:, None] & mask_hd[None, :]) 22 | 23 | cos_ptrs = cos_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1 24 | sin_ptrs = sin_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1 25 | 26 | cos_val = tl.load(cos_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 27 | sin_val = tl.load(sin_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 28 | 29 | out0 = q0 * cos_val - q1 * sin_val 30 | out1 = q0 * sin_val + q1 * cos_val 31 | 32 | tl.store(q_ptrs, out0, mask=mask_seq[:, None] & mask_hd[None, :]) 33 | tl.store(q_ptrs + stride_q1, out1, mask=mask_seq[:, None] & mask_hd[None, :]) 34 | 35 | def apply_rope(q, cos, sin, BLOCK_SEQ=64, BLOCK_HD=32): 36 | 37 | seq_len, head_dim = q.shape 38 | assert head_dim % 2 == 0 39 | head_half = head_dim // 2 40 | 41 | grid = ((seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ, (head_half + BLOCK_HD - 1) // BLOCK_HD) 42 | 43 | q_contig = q.contiguous() 44 | 45 | rope_kernel[grid](q_contig, cos, sin, q_contig.stride(0), q_contig.stride(1), cos.stride(0), cos.stride(1), seq_len, head_half, BLOCK_SEQ, BLOCK_HD) 46 | return q_contig 47 | 48 | if __name__ == "__main__": 49 | torch.manual_seed(0) 50 | device = 'cuda' 51 | 52 | seq_len = 128 53 | head_dim = 64 54 | 55 | q = torch.randn(seq_len, head_dim, device=device, dtype=torch.float32) 56 | 57 | positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1) 58 | dim_idx = torch.arange(head_dim // 2, device=device, dtype=torch.float32).unsqueeze(0) 59 | inv_freq = 1.0 / (10000 ** (dim_idx / (head_dim // 2))) 60 | theta = positions * inv_freq 61 | 62 | cos = torch.cos(theta) 63 | sin = torch.sin(theta) 64 | 65 | q_transformed = apply_rope(q, cos, sin) 66 | print("Transformed q:") 67 | print(q_transformed) 68 | -------------------------------------------------------------------------------- /day84/kernel.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | 5 | @triton.jit 6 | def fp8_gemm_kernel( 7 | a_ptr, b_ptr, c_ptr, 8 | M, N, K, 9 | stride_am, stride_ak, 10 | stride_bk, stride_bn, 11 | stride_cm, stride_cn, 12 | scale_a, scale_b, scale_c, 13 | BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr 14 | ): 15 | pid_m = tl.program_id(0) 16 | pid_n = tl.program_id(1) 17 | 18 | rm = tl.arange(0, BLOCK_M) 19 | rn = tl.arange(0, BLOCK_N) 20 | offm = pid_m * BLOCK_M + rm 21 | offn = pid_n * BLOCK_N + rn 22 | 23 | acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) 24 | 25 | for k in range(0, K, BLOCK_K): 26 | offk = k + tl.arange(0, BLOCK_K) 27 | 28 | a = tl.load( 29 | a_ptr + offm[:, None] * stride_am + offk[None, :] * stride_ak, 30 | mask=(offm[:, None] < M) & (offk[None, :] < K), 31 | other=0, 32 | ) 33 | b = tl.load( 34 | b_ptr + offk[:, None] * stride_bk + offn[None, :] * stride_bn, 35 | mask=(offk[:, None] < 
K) & (offn[None, :] < N), 36 | other=0, 37 | ) 38 | 39 | a_fp32 = tl.cast(a, tl.float32) * scale_a 40 | b_fp32 = tl.cast(b, tl.float32) * scale_b 41 | 42 | acc += tl.dot(a_fp32, b_fp32) 43 | 44 | c_fp8 = tl.round(acc / scale_c) 45 | c_fp8 = tl.max(tl.min(c_fp8, 127), -128) 46 | 47 | tl.store( 48 | c_ptr + offm[:, None] * stride_cm + offn[None, :] * stride_cn, 49 | c_fp8.to(tl.int8), 50 | mask=(offm[:, None] < M) & (offn[None, :] < N) 51 | ) 52 | 53 | def fp8_gemm(a: torch.Tensor, b: torch.Tensor, 54 | scale_a: float, scale_b: float, scale_c: float, 55 | BLOCK_M: int = 64, BLOCK_N: int = 64, BLOCK_K: int = 32) -> torch.Tensor: 56 | assert a.dtype == torch.int8 and b.dtype == torch.int8 57 | M, K = a.shape 58 | K2, N = b.shape 59 | assert K == K2 60 | 61 | c = torch.empty((M, N), device=a.device, dtype=torch.int8) 62 | 63 | grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N)) 64 | 65 | fp8_gemm_kernel[grid]( 66 | a, b, c, 67 | M, N, K, 68 | a.stride(0), a.stride(1), 69 | b.stride(0), b.stride(1), 70 | c.stride(0), c.stride(1), 71 | scale_a, scale_b, scale_c, 72 | BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K 73 | ) 74 | return c 75 | 76 | if __name__ == "__main__": 77 | torch.manual_seed(0) 78 | M, K, N = 128, 256, 64 79 | 80 | a_fp8 = torch.randint(-128, 127, (M, K), device='cuda', dtype=torch.int8) 81 | b_fp8 = torch.randint(-128, 127, (K, N), device='cuda', dtype=torch.int8) 82 | 83 | scale_a, scale_b, scale_c = 0.1, 0.1, 0.05 84 | 85 | c_fp8 = fp8_gemm(a_fp8, b_fp8, scale_a, scale_b, scale_c) 86 | print("GEMM result (FP8 stored as int8):", c_fp8) 87 | -------------------------------------------------------------------------------- /day85/TensorMatMul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void tensorMatrixMultKernel( 5 | const float* A, 6 | const float* B, 7 | float* C, 8 | size_t B_dim, 9 | size_t I_dim, 10 | size_t J_dim, 11 | size_t L_dim, 12 | size_t K_dim 13 | ) { 14 | 15 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | 18 | int total_elements = B_dim * I_dim * J_dim * K_dim; 19 | if (idx < total_elements) { 20 | int k = idx % K_dim; 21 | int j = (idx / K_dim) % J_dim; 22 | int i = (idx / (K_dim * J_dim)) % I_dim; 23 | int b = idx / (K_dim * J_dim * I_dim); 24 | 25 | 26 | size_t c_idx = ((b * I_dim + i) * J_dim + j) * K_dim + k; 27 | 28 | 29 | float sum = 0.0f; 30 | 31 | 32 | size_t a_base = ((b * I_dim + i) * J_dim + j) * L_dim; 33 | 34 | for (int l = 0; l < L_dim; l++) { 35 | sum += A[a_base + l] * B[l * K_dim + k]; 36 | } 37 | 38 | C[c_idx] = sum; 39 | } 40 | } 41 | 42 | extern "C" void solution(const float* A, const float* B, float* C, size_t b, size_t i, size_t j, size_t l, size_t k) { 43 | 44 | size_t total_elements = b * i * j * k; 45 | 46 | 47 | int threadsPerBlock = 256; 48 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock; 49 | 50 | 51 | tensorMatrixMultKernel<<>>(A, B, C, b, i, j, l, k); 52 | } -------------------------------------------------------------------------------- /day86/hard_sigmoid.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | __global__ void hard_sigmoid_kernel(const float* input, float* output, size_t total_elements) { 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx >= total_elements) return; 8 | 9 | float x = input[idx]; 10 | if (x <= -3.0f) 11 | output[idx] = 0.0f; 12 | else if (x >= 3.0f) 13 | output[idx] = 1.0f; 14 | else 15 | 
output[idx] = (x + 3.0f) / 6.0f; 16 | } 17 | 18 | 19 | extern "C" void solution(const float* input, float* output, size_t n, size_t m) { 20 | 21 | size_t total_elements = n * m; 22 | 23 | const int threadsPerBlock = 256; 24 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock; 25 | 26 | hard_sigmoid_kernel<<>>(input, output, total_elements); 27 | 28 | 29 | } 30 | -------------------------------------------------------------------------------- /day87/SymMatMul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BLOCK_SIZE 16 4 | 5 | __global__ void matrixMulKernel(const float* A, const float* B, float* C, size_t n) { 6 | size_t row = blockIdx.y * blockDim.y + threadIdx.y; 7 | size_t col = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (row < n && col < n) { 10 | float sum = 0.0f; 11 | for (size_t k = 0; k < n; k++) { 12 | sum += A[row * n + k] * B[k * n + col]; 13 | } 14 | C[row * n + col] = sum; 15 | } 16 | } 17 | 18 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) { 19 | dim3 block(BLOCK_SIZE, BLOCK_SIZE); 20 | dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE); 21 | 22 | matrixMulKernel<<>>(input_a, input_b, output_c, n); 23 | 24 | 25 | cudaDeviceSynchronize(); 26 | } 27 | -------------------------------------------------------------------------------- /day88/MSE.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) { 5 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 6 | if (idx < numElements) { 7 | float diff = predictions[idx] - targets[idx]; 8 | float sq_diff = diff * diff; 9 | 10 | atomicAdd(sum, sq_diff); 11 | } 12 | } 13 | 14 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t* shape, size_t ndim) { 15 | 16 | size_t* hostShape = new size_t[ndim]; 17 | cudaMemcpy(hostShape, shape, ndim * sizeof(size_t), cudaMemcpyDeviceToHost); 18 | 19 | size_t numElements = 1; 20 | for (size_t i = 0; i < ndim; i++) { 21 | numElements *= hostShape[i]; 22 | } 23 | delete[] hostShape; 24 | 25 | 26 | float init = 0.0f; 27 | cudaMemcpy(output, &init, sizeof(float), cudaMemcpyHostToDevice); 28 | 29 | 30 | int threadsPerBlock = 256; 31 | int blocks = (numElements + threadsPerBlock - 1) / threadsPerBlock; 32 | mseKernel<<>>(predictions, targets, numElements, output); 33 | cudaDeviceSynchronize(); 34 | 35 | float hostSum = 0.0f; 36 | cudaMemcpy(&hostSum, output, sizeof(float), cudaMemcpyDeviceToHost); 37 | 38 | float mse = hostSum / numElements; 39 | 40 | cudaMemcpy(output, &mse, sizeof(float), cudaMemcpyHostToDevice); 41 | } 42 | -------------------------------------------------------------------------------- /day89/LTMM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define BLOCK_SIZE 16 5 | 6 | 7 | __global__ 8 | void lowerTriangularMultiplyKernel(const float* A, const float* B, float* C, size_t n) { 9 | int row = blockIdx.y * blockDim.y + threadIdx.y; 10 | int col = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | if (row < n && col < n) { 13 | if (col > row) { 14 | C[row * n + col] = 0.0f; 15 | } else { 16 | float sum = 0.0f; 17 | for (int k = col; k <= row; k++) { 18 | sum += A[row * n + k] * B[k * n + col]; 19 | } 20 | C[row * n + col] = sum; 21 
| } 22 | } 23 | } 24 | 25 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) { 26 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 27 | dim3 gridDim((n + blockDim.x - 1) / blockDim.x, 28 | (n + blockDim.y - 1) / blockDim.y); 29 | 30 | lowerTriangularMultiplyKernel<<<gridDim, blockDim>>>(input_a, input_b, output_c, n); 31 | 32 | cudaDeviceSynchronize(); 33 | } 34 | -------------------------------------------------------------------------------- /day90/FrobeniusNorm.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | 4 | __global__ void calculateSumOfSquares(const float* X, float* partialSums, size_t size) { 5 | extern __shared__ float sharedData[]; 6 | 7 | 8 | unsigned int tid = threadIdx.x; 9 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | 12 | sharedData[tid] = 0.0f; 13 | 14 | 15 | while (i < size) { 16 | sharedData[tid] += X[i] * X[i]; 17 | i += blockDim.x * gridDim.x; 18 | } 19 | 20 | 21 | __syncthreads(); 22 | 23 | 24 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { 25 | if (tid < s) { 26 | sharedData[tid] += sharedData[tid + s]; 27 | } 28 | __syncthreads(); 29 | } 30 | 31 | 32 | if (tid == 0) { 33 | partialSums[blockIdx.x] = sharedData[0]; 34 | } 35 | } 36 | 37 | 38 | __global__ void normalizeByFrobeniusNorm(const float* X, float* Y, size_t size, float frobeniusNorm) { 39 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 40 | 41 | if (i < size) { 42 | Y[i] = X[i] / frobeniusNorm; 43 | } 44 | } 45 | 46 | extern "C" void solution(const float* X, float* Y, size_t size) { 47 | 48 | int blockSize = 256; 49 | int gridSize = (size + blockSize - 1) / blockSize; 50 | int maxBlocks = 1024; 51 | 52 | if (gridSize > maxBlocks) { 53 | gridSize = maxBlocks; 54 | } 55 | 56 | float* d_partialSums; 57 | cudaMalloc(&d_partialSums, gridSize * sizeof(float)); 58 | 59 | calculateSumOfSquares<<<gridSize, blockSize, blockSize * sizeof(float)>>>(X, d_partialSums, size); 60 | 61 | float* h_partialSums = new float[gridSize]; 62 | cudaMemcpy(h_partialSums, d_partialSums, gridSize * sizeof(float), cudaMemcpyDeviceToHost); 63 | 64 | float sumOfSquares = 0.0f; 65 | for (int i = 0; i < gridSize; i++) { 66 | sumOfSquares += h_partialSums[i]; 67 | } 68 | 69 | float frobeniusNorm = sqrt(sumOfSquares); 70 | 71 | if (frobeniusNorm < 1e-10) { 72 | frobeniusNorm = 1.0f; 73 | } 74 | 75 | normalizeByFrobeniusNorm<<<(size + blockSize - 1) / blockSize, blockSize>>>(X, Y, size, frobeniusNorm); 76 | 77 | delete[] h_partialSums; 78 | cudaFree(d_partialSums); 79 | } -------------------------------------------------------------------------------- /day91/Hinge_Loss.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | 4 | __global__ void hingeKernel(const float* predictions, const float* targets, float* output, size_t n) { 5 | 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | 8 | if (idx < n) { 9 | float prod = predictions[idx] * targets[idx]; 10 | output[idx] = fmaxf(0.0f, 1.0f - prod); 11 | } 12 | } 13 | 14 | 15 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t n) { 16 | // I found this to be the best configuration for the kernel (H100) 17 | const int blockSize = 256; 18 | const int gridSize = (n + blockSize - 1) / blockSize; 19 | 20 | hingeKernel<<<gridSize, blockSize>>>(predictions, targets, output, n); 21 | 22 | 23 | } -------------------------------------------------------------------------------- /day92/1D_Convolution.cu:
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ 4 | void conv1d(const float* A, 5 | const float* B, 6 | float* C, 7 | size_t N, 8 | size_t K) 9 | { 10 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; 11 | int radius = int(K/2); 12 | 13 | if (i < N) { 14 | float sum = 0.0f; 15 | for (int j = 0; j < int(K); ++j) { 16 | int idx = int(i) + j - radius; 17 | if (idx >= 0 && idx < int(N)) { 18 | sum += A[idx] * B[j]; 19 | } 20 | } 21 | C[i] = sum; 22 | } 23 | } 24 | 25 | extern "C" 26 | void solution(const float* A, 27 | const float* B, 28 | float* C, 29 | size_t N, 30 | size_t K) 31 | { 32 | int threads = 1024; 33 | int blocks = int((N + threads - 1) / threads); 34 | 35 | conv1d<<>>(A, B, C, N, K); 36 | } 37 | -------------------------------------------------------------------------------- /day93/RMS_Normalization.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define EPSILON 1e-5f 5 | 6 | __global__ void compute_rms(const float* X, float* rms, size_t B, size_t N) { 7 | extern __shared__ float sdata[]; 8 | size_t row = blockIdx.x; 9 | size_t tid = threadIdx.x; 10 | const float* row_ptr = X + row * N; 11 | 12 | float sum = 0.0f; 13 | for (size_t i = tid; i < N; i += blockDim.x) { 14 | float v = row_ptr[i]; 15 | sum += v * v; 16 | } 17 | sdata[tid] = sum; 18 | __syncthreads(); 19 | 20 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { 21 | if (tid < s) { 22 | sdata[tid] += sdata[tid + s]; 23 | } 24 | __syncthreads(); 25 | } 26 | 27 | if (tid == 0) { 28 | float mean_sq = sdata[0] / static_cast(N); 29 | rms[row] = sqrtf(mean_sq + EPSILON); 30 | } 31 | } 32 | 33 | __global__ void normalize_rms(const float* X, float* Y, const float* rms, size_t B, size_t N) { 34 | size_t row = blockIdx.x; 35 | size_t tid = threadIdx.x; 36 | float r = rms[row]; 37 | const float* row_in = X + row * N; 38 | float* row_out = Y + row * N; 39 | 40 | for (size_t i = tid; i < N; i += blockDim.x) { 41 | row_out[i] = row_in[i] / r; 42 | } 43 | } 44 | 45 | extern "C" void solution(const float* X, float* Y, size_t B, size_t N) { 46 | int threads = (N < 256) ? 
int(N) : 256; 47 | size_t shared_mem_size = threads * sizeof(float); 48 | 49 | float* d_rms = nullptr; 50 | cudaMalloc(&d_rms, B * sizeof(float)); 51 | 52 | compute_rms<<<B, threads, shared_mem_size>>>(X, d_rms, B, N); 53 | 54 | normalize_rms<<<B, threads>>>(X, Y, d_rms, B, N); 55 | 56 | cudaFree(d_rms); 57 | } 58 | -------------------------------------------------------------------------------- /day94/ELU.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day94/ELU.cu -------------------------------------------------------------------------------- /day95/2D_Max_Pooling.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <cfloat> // for FLT_MAX 3 | #include <cstddef> // for size_t 4 | 5 | 6 | __global__ 7 | void maxpool2d_kernel(const float* __restrict__ input, 8 | int H, int W, 9 | int kernel_size, int stride, int padding, int dilation, 10 | int H_out, int W_out, 11 | float* __restrict__ output) 12 | { 13 | 14 | int out_y = blockIdx.y * blockDim.y + threadIdx.y; 15 | int out_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (out_y >= H_out || out_x >= W_out) return; 17 | 18 | 19 | float max_val = -FLT_MAX; 20 | for (int m = 0; m < kernel_size; ++m) { 21 | int in_y = out_y * stride + m * dilation - padding; 22 | for (int n = 0; n < kernel_size; ++n) { 23 | int in_x = out_x * stride + n * dilation - padding; 24 | 25 | if (in_y >= 0 && in_y < H && in_x >= 0 && in_x < W) { 26 | float v = input[in_y * W + in_x]; 27 | if (v > max_val) max_val = v; 28 | } 29 | } 30 | } 31 | output[out_y * W_out + out_x] = max_val; 32 | } 33 | 34 | 35 | extern "C" 36 | void solution(const float* input, 37 | int kernel_size, 38 | int stride, 39 | int padding, 40 | int dilation, 41 | float* output, 42 | size_t H, 43 | size_t W) 44 | { 45 | 46 | int H_out = (int)(( (int)H + 2*padding 47 | - dilation*(kernel_size-1) 48 | - 1 ) / stride) + 1; 49 | int W_out = (int)(( (int)W + 2*padding 50 | - dilation*(kernel_size-1) 51 | - 1 ) / stride) + 1; 52 | 53 | 54 | const int Bx = 16, By = 16; 55 | dim3 block(Bx, By); 56 | dim3 grid( (W_out + Bx - 1) / Bx, 57 | (H_out + By - 1) / By ); 58 | 59 | maxpool2d_kernel<<<grid, block>>>( 60 | input, 61 | (int)H, (int)W, 62 | kernel_size, stride, padding, dilation, 63 | H_out, W_out, 64 | output 65 | ); 66 | 67 | 68 | } 69 | -------------------------------------------------------------------------------- /day96/Product_Over_Dimension.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <vector> 3 | 4 | 5 | __global__ 6 | void prod_reduce_kernel(const float* __restrict__ input, 7 | float* __restrict__ output, 8 | size_t M, 9 | size_t S_d, 10 | size_t N) 11 | { 12 | 13 | size_t out_idx = blockIdx.x; 14 | 15 | size_t m = out_idx / N; 16 | size_t n = out_idx - m * N; 17 | 18 | 19 | const float* base = input + (m * S_d) * N + n; 20 | 21 | 22 | double prod = 1.0; 23 | for (size_t k = threadIdx.x; k < S_d; k += blockDim.x) { 24 | prod *= static_cast<double>( base[k * N] ); 25 | } 26 | 27 | 28 | constexpr unsigned FULL_MASK = 0xffffffffu; 29 | for (int offset = warpSize/2; offset > 0; offset >>= 1) { 30 | prod *= __shfl_down_sync(FULL_MASK, prod, offset); 31 | } 32 | 33 | 34 | __shared__ double warp_prod[1024/32]; 35 | int lane = threadIdx.x & (warpSize - 1); 36 | int wid = threadIdx.x >> 5; 37 | if (lane == 0) warp_prod[wid] = prod; 38 | __syncthreads(); 39 | 40 | 41 | if (wid == 0) { 42 | double block_prod = (lane < ((blockDim.x+31)/32)) 43 | ?
warp_prod[lane] 44 | : 1.0; 45 | for (int offset = ((blockDim.x+31)/32)/2; offset > 0; offset >>= 1) { 46 | block_prod *= __shfl_down_sync(FULL_MASK, block_prod, offset); 47 | } 48 | if (lane == 0) { 49 | 50 | output[out_idx] = static_cast<float>(block_prod); 51 | } 52 | } 53 | } 54 | 55 | 56 | extern "C" 57 | void solution(const float* input, 58 | int dim, 59 | float* output, 60 | size_t* shape, 61 | size_t ndim) 62 | { 63 | 64 | std::vector<size_t> hshape(ndim); 65 | cudaMemcpy(hshape.data(), shape, ndim*sizeof(size_t), 66 | cudaMemcpyDeviceToHost); 67 | 68 | 69 | size_t M = 1, N = 1; 70 | for (int i = 0; i < dim; ++i) M *= hshape[i]; 71 | for (int i = dim+1; i < (int)ndim; ++i) N *= hshape[i]; 72 | size_t S_d = hshape[dim]; 73 | 74 | size_t total_outputs = M * N; 75 | if (total_outputs == 0 || S_d == 0) return; 76 | 77 | 78 | int blk = 1; 79 | while (blk < (int)S_d && blk < 1024) blk <<= 1; 80 | blk = std::max(blk, 32); 81 | blk = std::min(blk, 1024); 82 | 83 | 84 | dim3 grid( total_outputs ); 85 | dim3 block( blk ); 86 | 87 | prod_reduce_kernel<<<grid, block>>>(input, output, M, S_d, N); 88 | 89 | 90 | } 91 | -------------------------------------------------------------------------------- /day97/elu_optim.cu: -------------------------------------------------------------------------------- 1 | // To optimize the code I used: float4 vectorized loads and stores for the FP32 main loop, a scalar tail loop, branchless selects, and expm1f for the negative branch 2 | 3 | #include <cuda_runtime.h> 4 | #include <math.h> 5 | 6 | #define EXPM1f(x) expm1f(x) 7 | 8 | 9 | 10 | __global__ __launch_bounds__(1024, 4) 11 | void elu_fp16(const float* __restrict__ input, 12 | float* __restrict__ output, 13 | size_t total, 14 | float alpha) { 15 | size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 16 | size_t stride = blockDim.x * gridDim.x; 17 | 18 | 19 | size_t vec8 = (total / 8) * 8; 20 | for (size_t base = tid * 8; base < vec8; base += stride * 8) { 21 | 22 | float4 f0 = __ldg((const float4*)(input + base)); 23 | float4 f1 = __ldg((const float4*)(input + base + 4)); 24 | 25 | 26 | f0.x = f0.x > 0.f ? f0.x : alpha * EXPM1f(f0.x); 27 | f0.y = f0.y > 0.f ? f0.y : alpha * EXPM1f(f0.y); 28 | f0.z = f0.z > 0.f ? f0.z : alpha * EXPM1f(f0.z); 29 | f0.w = f0.w > 0.f ? f0.w : alpha * EXPM1f(f0.w); 30 | 31 | f1.x = f1.x > 0.f ? f1.x : alpha * EXPM1f(f1.x); 32 | f1.y = f1.y > 0.f ? f1.y : alpha * EXPM1f(f1.y); 33 | f1.z = f1.z > 0.f ? f1.z : alpha * EXPM1f(f1.z); 34 | f1.w = f1.w > 0.f ? f1.w : alpha * EXPM1f(f1.w); 35 | 36 | 37 | reinterpret_cast<float4*>(output + base)[0] = f0; 38 | reinterpret_cast<float4*>(output + base)[1] = f1; 39 | } 40 | 41 | for (size_t i = vec8 + tid; i < total; i += stride) { 42 | float x = __ldg(&input[i]); 43 | output[i] = x > 0.f ?
x : alpha * EXPM1f(x); 44 | } 45 | } 46 | 47 | extern "C" void solution(const float* input, float* output, size_t n, size_t m, float alpha) { 48 | size_t total = n * m; 49 | const int threads = 1024; 50 | int blocks = (total / 8 + threads - 1) / threads; 51 | blocks = max(blocks, 320); 52 | blocks = min(blocks, 65535); 53 | 54 | elu_fp16<<<blocks, threads>>>(input, output, total, alpha); 55 | } -------------------------------------------------------------------------------- /notes/offsetcudatriton.md: -------------------------------------------------------------------------------- 1 | I want to talk about how offsets are calculated in CUDA and Triton. 2 | 3 | I will start with CUDA because it is easier to explain, in my opinion: 4 | ```c 5 | __global__ void vectorAdd(const float* A, const float* B, float* C, int N){ 6 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx < N){ 8 | C[idx] = A[idx] + B[idx]; 9 | } 10 | } 11 | ``` 12 | 13 | Let's analyze the arguments: 14 | `const float* A` -> a constant pointer of type float to the `A` array 15 | 16 | `const float* B` -> a constant pointer of type float to the `B` array 17 | 18 | `float *C` -> a pointer of type float to the `C` array. Note: `C` is not a pointer to const because we want to modify its elements 19 | 20 | Now let's dive deeper: 21 | `int idx = blockIdx.x * blockDim.x + threadIdx.x` : the block's position in the grid multiplied by the number of threads per block, plus the thread's position within the block 22 | 23 | 24 | Now let's look at the Triton function: 25 | ```python 26 | def __kernelfunction__(input_pointer, output_pointer, N, 27 | BLOCKSIZE: tl.constexpr): 28 | pid = tl.program_id(0) 29 | 30 | offset = pid * BLOCKSIZE + tl.arange(0, BLOCKSIZE) 31 | mask = offset < N 32 | 33 | input_data = tl.load(input_pointer + offset, mask=mask) 34 | output_data = tl.sqrt(input_data) 35 | tl.store(output_pointer + offset, output_data, mask=mask) 36 | ``` 37 | So our `idx` is exactly the `offset` in Triton. 38 | The offset is calculated from the program_id multiplied by the block size, plus the array [0, 1, 2, ..., BLOCKSIZE-1]. 39 | The result can be thought of as an array in which each position is associated with one thread. 40 | -------------------------------------------------------------------------------- /nvidiadocs/addition.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <cuda_runtime.h> 3 | 4 | __global__ void addition(float* A, float* B, float* C){ 5 | int idx = threadIdx.x; 6 | C[idx] = A[idx] + B[idx]; 7 | } 8 | 9 | int main(){ 10 | int N = 10; 11 | addition<<<1,N>>>(A,B,C); // simple addition kernel that will launch N threads 12 | } 13 | 14 | ////////////////////////////////////// 15 | // (x, y) is (x + y Dx); 16 | // (x, y, z) is (x + y Dx + z Dx Dy) 17 | // int i 18 | __global__ void MatAdd(float A[N][N], float B[N][N], 19 | float C[N][N]) 20 | { 21 | int i = threadIdx.x; 22 | int j = threadIdx.y; 23 | C[i][j] = A[i][j] + B[i][j]; 24 | } 25 | 26 | int main() 27 | { 28 | // Kernel invocation with one block of N * N * 1 threads 29 | int numBlocks = 1; // number of blocks 30 | dim3 threadsPerBlock(N, N); // Threads 31 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C); 32 | } 33 | 34 | 35 | ///////////////// 36 | // Kernel definition 37 | __global__ void MatAdd(float A[N][N], float B[N][N], 38 | float C[N][N]) 39 | { 40 | int i = blockIdx.x * blockDim.x + threadIdx.x; 41 | int j = blockIdx.y * blockDim.y + threadIdx.y; 42 | if (i < N && j < N) 43 | C[i][j] = A[i][j] + B[i][j]; 44 | } 45 | 46 | int main() 47 | { 48 | ...
49 | // Kernel invocation 50 | dim3 threadsPerBlock(16, 16); // threadsPerBlock -> how many threads per block 51 | dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y); // number of blocks in the grid 52 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C); 53 | ... 54 | } 55 | //// --------------------------------------------------------------------------------
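To connect the 2D indexing of the MatAdd examples above with the Triton offset pattern from the notes, here is a minimal sketch of the same matrix addition written as a Triton kernel; the kernel and wrapper names and the 16x16 block sizes are illustrative assumptions. The row/column offsets play the role of `blockIdx * blockDim + threadIdx`, and the mask replaces the `if (i < N && j < N)` bounds check.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def matadd_kernel(a_ptr, b_ptr, c_ptr, N,
                  BLOCK_I: tl.constexpr, BLOCK_J: tl.constexpr):
    # Same role as blockIdx.x / blockIdx.y in the CUDA version.
    pid_i = tl.program_id(0)
    pid_j = tl.program_id(1)
    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)       # row indices
    offs_j = pid_j * BLOCK_J + tl.arange(0, BLOCK_J)       # column indices
    mask = (offs_i[:, None] < N) & (offs_j[None, :] < N)   # bounds check
    idx = offs_i[:, None] * N + offs_j[None, :]            # row-major offset
    a = tl.load(a_ptr + idx, mask=mask)
    b = tl.load(b_ptr + idx, mask=mask)
    tl.store(c_ptr + idx, a + b, mask=mask)

def matadd_triton(a: torch.Tensor, b: torch.Tensor,
                  BLOCK_I: int = 16, BLOCK_J: int = 16) -> torch.Tensor:
    N = a.shape[0]
    c = torch.empty_like(a)
    grid = (triton.cdiv(N, BLOCK_I), triton.cdiv(N, BLOCK_J))
    matadd_kernel[grid](a, b, c, N, BLOCK_I=BLOCK_I, BLOCK_J=BLOCK_J)
    return c
```

For square, contiguous float32 tensors on the GPU, `matadd_triton(a, b)` should match `a + b`.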