├── .gitignore ├── Makefile ├── ReadMe.md ├── day01 ├── addition.cu └── printAdd.cu ├── day02 ├── function.cu └── function.py ├── day03 ├── addMatrix.cu ├── addMatrix.py └── anotherMatrix.cu ├── day04 └── layerNorm.cu ├── day05 └── vectorSumTricks.cu ├── day06 ├── AdditionKernel │ ├── additionKernel.cpython-312-x86_64-linux-gnu.so │ ├── additionKernel.cu │ ├── additionKernel.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── additionKernelBinding.cpp │ ├── additionkernel.cpython-312-x86_64-linux-gnu.so │ ├── additionkernel.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── build │ │ ├── lib.linux-x86_64-cpython-312 │ │ │ ├── additionKernel.cpython-312-x86_64-linux-gnu.so │ │ │ └── additionkernel.cpython-312-x86_64-linux-gnu.so │ │ └── temp.linux-x86_64-cpython-312 │ │ │ ├── additionKernel.o │ │ │ └── additionKernelBinding.o │ ├── pythontest.py │ └── setup.py ├── ImportingToPython │ ├── build │ │ ├── lib.linux-x86_64-cpython-312 │ │ │ └── example_kernels.cpython-312-x86_64-linux-gnu.so │ │ └── temp.linux-x86_64-cpython-312 │ │ │ ├── rollcall.o │ │ │ └── rollcallbinding.o │ ├── example_kernels.cpython-312-x86_64-linux-gnu.so │ ├── example_kernels.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── pythontest.py │ ├── rollcall.cu │ ├── rollcallbinding.cpp │ └── setup.py ├── SMBlocks.cu ├── SoftMax.cu ├── TransposeMatrix.cu └── note ├── day07 ├── conv1d.cu ├── globalMemoryCoalescing.cu ├── matmul.cu ├── naive.cu └── pythontest.py ├── day08 ├── idk.cu ├── pmpbook │ ├── chapter3ex.cu │ ├── chapter3matvecmul.cu │ ├── color2gray.cu │ ├── deviceinfo.cu │ ├── imageblur.cu │ └── vecaddition.cu └── selfAttention │ └── selfAttention.cu ├── day09 ├── bind.cpp ├── flashAttention.cu ├── flashAttentionFromTut.cu └── test.py ├── day10 ├── FlashAttention.cpp ├── FlashAttention.cu ├── linking │ ├── simpleKernel.cpp │ ├── simpleKernel.cu │ └── test.py ├── ppmbook │ └── matrixmul.cu ├── setup.py └── test.py ├── day100 └── delta.cu ├── day11 ├── FlashTestPytorch │ ├── FlashAttention.cu │ ├── binding.cpp │ └── test.py ├── LeakyReLU.cu ├── ReLU.cu ├── SoftMax.cu ├── TanH.cu ├── binding.cpp ├── test.py └── testbackward.py ├── day12 ├── NN │ └── kernels.cu ├── softMax.cu └── tileMatrix.cu ├── day13 ├── RMS.cu ├── RMSBetter.cu ├── binding.cpp └── test.py ├── day14 ├── FA2 │ ├── flash.cu │ ├── helper.cu │ ├── helper.cuh │ ├── kernels.cu │ └── kernels.cuh ├── FlashAttention2 │ └── kernel.cu ├── cat.jpg └── conv.cu ├── day15 ├── Attention.cu ├── SMM.cu └── dotproduct.cu ├── day16 ├── attentionbwkd.cu └── test.py ├── day17 ├── cublas1.cu ├── cublas2.cu └── cublas3.cu ├── day18 ├── atomic1.cu ├── atomic2.cu └── wrap.cu ├── day19 └── cublasMM.cu ├── day20 ├── rope.cu └── test_rope.py ├── day21 └── conv.cu ├── day22 ├── persistent2.cu └── persistentKernel.cu ├── day23 ├── kernel.ptx └── main.cu ├── day24 └── GeGLU.cu ├── day25 └── nbody.cu ├── day26 ├── gradientdescent.cu └── gradientdescent.out ├── day27 ├── kmeans.cu └── kmeans.out ├── day28 ├── sample.cu └── test_sample.py ├── day29 └── pi.cu ├── day30 └── kernelHisto.cu ├── day31 └── kernel.cu ├── day32 ├── Makefile └── matmul_kernels │ ├── kernel_1 │ └── kernel_1.cpp │ ├── kernel_2 │ └── kernel_2.cpp │ ├── kernel_3 │ └── kernel_3.cpp │ └── kernel_rocblas │ └── kernel_rocblas.cpp ├── day33 └── load_in_pytorch │ ├── kernel.cpp │ ├── kernel.so │ └── test.py ├── day34 └── tensor_lib │ ├── test1.cpp │ └── test1.out ├── 
day35 └── layernorm.cpp ├── day36 └── random.cpp ├── day37 └── MultiStreams │ ├── MHA.cpp │ ├── MHA.out │ ├── notes.md │ ├── results.copy_stats.csv │ ├── results.db │ ├── results.hip_stats.csv │ ├── results.hsa_stats.csv │ ├── results.json │ ├── results.stats.csv │ └── results.sysinfo.txt ├── day38 └── myreduction.cpp ├── day39 └── advancedcudamm.cu ├── day40 └── flaship.cpp ├── day41 └── MLA.cu ├── day42 ├── mat_mul.py └── mat_mul_2.py ├── day43 └── rope.py ├── day44 ├── average_duration_per_block_size.png ├── benchmark_results.csv ├── duration_vs_total_elements.png └── tritonkernel.py ├── day45 └── cross_entropy │ └── cross_entropy.py ├── day46 └── flash_attention.py ├── day47 ├── hip_cooperative_groups.h └── kernel.cpp ├── day48 └── kernel.py ├── day49 └── kernel.py ├── day50 └── tritonnn.py ├── day51 └── main.py ├── day52 └── functionsused.py ├── day53 └── layer_norm.py ├── day54 └── softmax.py ├── day55 └── ddpm.py ├── day56 └── main.py ├── day57 └── main.py ├── day58 └── layer_norm.cpp ├── day59 └── test.py ├── day60 └── fused.py ├── day61 └── backprop.py ├── day62 └── main.py ├── day63 └── lstm.py ├── day64 └── main.py ├── day65 └── quant.cpp ├── day66 └── kernel.cpp ├── day67 └── lora.py ├── day68 └── adam.py ├── day69 └── main.py ├── day70 └── gla.py ├── day71 └── main.py ├── day72 └── main.py ├── day73 └── code.py ├── day74 └── kernel.py ├── day75 └── kernel.py ├── day76 └── kernel.py ├── day77 └── main.py ├── day78 └── rmsnorm.py ├── day79 └── main.py ├── day80 └── kernel.py ├── day81 └── main.py ├── day82 └── rope.py ├── day83 └── lin.py ├── day84 └── kernel.py ├── day85 └── TensorMatMul.cu ├── day86 └── hard_sigmoid.cu ├── day87 └── SymMatMul.cu ├── day88 └── MSE.cu ├── day89 └── LTMM.cu ├── day90 └── FrobeniusNorm.cu ├── day91 └── Hinge_Loss.cu ├── day92 └── 1D_Convolution.cu ├── day93 └── RMS_Normalization.cu ├── day94 └── ELU.cu ├── day95 └── 2D_Max_Pooling.cu ├── day96 └── Product_Over_Dimension.cu ├── day97 └── elu_optim.cu ├── day98 └── kernel.cpp ├── day99 └── kernel.cpp ├── notes └── offsetcudatriton.md └── nvidiadocs └── addition.cu /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_DIR := $(CURDIR) 2 | 3 | COLOR_RESET := \033[0m 4 | COLOR_GREEN := \033[32m 5 | COLOR_YELLOW := \033[33m 6 | COLOR_BLUE := \033[34m 7 | COLOR_RED := \033[31m 8 | 9 | CUDA_ARCH := sm_89 # Specify CUDA architecture (e.g., sm_89 for RTX 4070) 10 | 11 | all: build 12 | 13 | build: $(PROJECT_DIR)/$(dir)/$(program).out 14 | 15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cu 16 | @echo "$(COLOR_YELLOW)Building program $(program) in directory $(dir)...$(COLOR_RESET)" 17 | @nvcc -arch=$(CUDA_ARCH) -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -o $@ $< -lcuda 18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)" 19 | 20 | run: $(PROJECT_DIR)/$(dir)/$(program).out 21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)" 22 | @./$(dir)/$(program).out 23 | 24 | clean: 25 | @echo "$(COLOR_RED)Cleaning up .out files in directory $(dir)...$(COLOR_RESET)" 26 | @rm -f $(PROJECT_DIR)/$(dir)/*.out 27 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)" 28 | 29 | cleanall: 30 | @echo "$(COLOR_RED)Cleaning up all .out files in all directories...$(COLOR_RESET)" 
31 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \; 32 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)" 33 | 34 | help: 35 | @echo "$(COLOR_BLUE)Usage instructions for Makefile:$(COLOR_RESET)" 36 | @echo "" 37 | @echo "$(COLOR_YELLOW)make dir= program=$(COLOR_RESET) # Build the program .cu in directory " 38 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory " 39 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory " 40 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 41 | @echo "" 42 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)" 43 | @echo "$(COLOR_GREEN)make dir=day1 program=addition$(COLOR_RESET) # Build addition.cu in day1" 44 | @echo "$(COLOR_GREEN)make run dir=day1 program=addition$(COLOR_RESET) # Run addition.out in day1" 45 | @echo "$(COLOR_GREEN)make clean dir=day1$(COLOR_RESET) # Clean up .out files in day1" 46 | @echo "$(COLOR_GREEN)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 47 | -------------------------------------------------------------------------------- /day01/addition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void vectorAdd(const float* A , const float *B, float *C, int N){ 5 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 6 | // so blockIdx.x -> is the ID of thread 7 | // block dim = the size of the window we work on it 8 | // threaidx = 9 | if (idx>>(d_A,d_B,d_C,N); 41 | 42 | cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost); 43 | for(int i =N-10;i>>(N); 20 | 21 | // Wait for the device to finish 22 | cudaDeviceSynchronize(); 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /day02/function.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __device__ float square(int x){ 5 | return x*x; 6 | // __device__ marked function can only be called from anoter device function 7 | // or a kernel method 8 | } 9 | 10 | __global__ void voidKernel(int *input,int *output,int N) { 11 | int i = blockIdx.x * blockDim.x + threadIdx.x; 12 | if (i < N){ 13 | output[i] = square(input[i]); 14 | } 15 | } 16 | 17 | 18 | int main(){ 19 | int N = 10; // size of input and output arrays 20 | int size = N*sizeof(int); // total memory to allocate for the ararys 21 | int *h_input = new int[N]; // alocate memory on the CPU 22 | int *h_output = new int[N]; // alocate memory on the CPU 23 | 24 | for(int i = 0;i>>(d_input, d_output, N); cudaMemcpy(h_output,d_output,size,cudaMemcpyDeviceToHost); 38 | 39 | std::cout << "Squared array: "; 40 | for (int i = 0; i < N; i++) { 41 | std::cout << h_output[i] << " "; 42 | } 43 | std::cout << std::endl; 44 | 45 | delete[] h_input; 46 | delete[] h_output; 47 | cudaFree(d_input); 48 | cudaFree(d_output); 49 | 50 | return 0; 51 | } -------------------------------------------------------------------------------- /day02/function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def __kernelfunction__(input_pointer, output_pointer, N, 7 | BLOCKSIZE: tl.constexpr): 8 | pid = tl.program_id(0) # Get the program (block) ID 9 | 10 | offset = pid * BLOCKSIZE + tl.arange(0, BLOCKSIZE) 11 | mask = offset < N 12 | 13 | input_data = 
tl.load(input_pointer + offset, mask=mask) 14 | output_data = tl.sqrt(input_data) 15 | tl.store(output_pointer + offset, output_data, mask=mask) 16 | 17 | def main(): 18 | N = 10 19 | 20 | input_data = torch.arange(0, N, dtype=torch.float32) 21 | print("Input data:", input_data) 22 | 23 | output_data = torch.empty_like(input_data) 24 | 25 | input_ptr = input_data.to("cuda") 26 | output_ptr = output_data.to("cuda") 27 | 28 | BLOCKSIZE = 256 29 | 30 | GRID = (triton.cdiv(N, BLOCKSIZE),) 31 | 32 | __kernelfunction__[GRID](input_ptr, output_ptr, N, BLOCKSIZE=BLOCKSIZE) 33 | 34 | output_data = output_ptr.cpu() 35 | print("Output data:", output_data) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /day03/addMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void printMatrix(const float *Matrix, const int size = 16) { 7 | int rootSize = sqrt(size); 8 | for (int i = 0; i < rootSize; i++) { 9 | for (int j = 0; j < rootSize; j++) { 10 | std::cout << Matrix[i * rootSize + j] << " "; 11 | } 12 | std::cout << "\n"; 13 | } 14 | } 15 | 16 | __global__ void matrixAddCUDA(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, 17 | const int sizeX, const int sizeY) { 18 | int col = blockIdx.x * blockDim.x + threadIdx.x; 19 | int row = blockIdx.y * blockDim.y + threadIdx.y; 20 | 21 | if (row < sizeY && col < sizeX) { 22 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col]; 23 | } 24 | } 25 | 26 | void matrixAddCPU(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, int sizeX, int sizeY) { 27 | for (int row = 0; row < sizeY; row++) { 28 | for (int col = 0; col < sizeX; col++) { 29 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col]; 30 | } 31 | } 32 | } 33 | 34 | void compareExecutionTime(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, 35 | const int sizeX, const int sizeY) { 36 | const int matrixSize = sizeX * sizeY; 37 | const int matrixBytes = sizeof(float) * matrixSize; 38 | 39 | float *gpu_A, *gpu_B, *gpu_C; 40 | cudaMalloc((void **)&gpu_A, matrixBytes); 41 | cudaMalloc((void **)&gpu_B, matrixBytes); 42 | cudaMalloc((void **)&gpu_C, matrixBytes); 43 | 44 | cudaMemcpy(gpu_A, Matrix_A, matrixBytes, cudaMemcpyHostToDevice); 45 | cudaMemcpy(gpu_B, Matrix_B, matrixBytes, cudaMemcpyHostToDevice); 46 | 47 | int BLOCK_SIZE = 32; 48 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 49 | dim3 gridDim((sizeX + BLOCK_SIZE - 1) / BLOCK_SIZE, (sizeY + BLOCK_SIZE - 1) / BLOCK_SIZE); 50 | 51 | auto startCPU = std::chrono::high_resolution_clock::now(); 52 | matrixAddCPU(Matrix_A, Matrix_B, Matrix_C, sizeX, sizeY); 53 | auto endCPU = std::chrono::high_resolution_clock::now(); 54 | 55 | auto startCUDA = std::chrono::high_resolution_clock::now(); 56 | matrixAddCUDA<<>>(gpu_A, gpu_B, gpu_C, sizeX, sizeY); 57 | cudaDeviceSynchronize(); 58 | auto endCUDA = std::chrono::high_resolution_clock::now(); 59 | 60 | cudaMemcpy(Matrix_C, gpu_C, matrixBytes, cudaMemcpyDeviceToHost); 61 | 62 | std::chrono::duration cpuDuration = endCPU - startCPU; 63 | std::chrono::duration cudaDuration = endCUDA - startCUDA; 64 | std::cout << "CPU Execution Time: " << cpuDuration.count() << " seconds\n"; 65 | std::cout << "CUDA Execution Time: " << cudaDuration.count() << " seconds\n"; 66 | 67 | cudaFree(gpu_A); 68 | cudaFree(gpu_B); 69 | cudaFree(gpu_C); 70 | } 71 | 72 
| int main() { 73 | const int sizeX = 1024*16; 74 | const int sizeY = 1024*16; 75 | const int matrixSize = sizeX * sizeY; 76 | 77 | float *cpu_A = new float[matrixSize]; 78 | float *cpu_B = new float[matrixSize]; 79 | float *cpu_C = new float[matrixSize]; 80 | 81 | for (int i = 0; i < matrixSize; i++) { 82 | cpu_A[i] = 10.0f; 83 | cpu_B[i] = static_cast(i); 84 | } 85 | 86 | compareExecutionTime(cpu_A, cpu_B, cpu_C, sizeX, sizeY); 87 | 88 | delete[] cpu_A; 89 | delete[] cpu_B; 90 | delete[] cpu_C; 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /day03/addMatrix.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import torch 3 | import triton.language as tl 4 | 5 | 6 | @triton.jit 7 | def addMatrix(Matrix_A,Matrix_B,Matrix_C,sizeX,sizeY,BLOCK_SIZE:tl.constexpr): 8 | 9 | pid_x = tl.program_id(0) # we have the rows 10 | pid_y = tl.program_id(1) # we have the collumns 11 | 12 | row_start = pid_x*BLOCK_SIZE 13 | col_start = pid_y*BLOCK_SIZE 14 | 15 | row_indices = row_start + tl.arange(0,BLOCK_SIZE) 16 | col_indices = col_start + tl.arange(0,BLOCK_SIZE) 17 | 18 | row_indices = row_indices[:,None] 19 | col_indices = col_indices[None,:] 20 | 21 | row_mask = row_indices < sizeY 22 | col_mask = col_indices < sizeX 23 | valid_mask = row_mask & col_mask 24 | 25 | flat_indicies = row_indices * sizeX + col_indices 26 | 27 | A = tl.load(Matrix_A + flat_indicies,mask =valid_mask,other=0.0) 28 | B = tl.load(Matrix_B + flat_indicies,mask = valid_mask,other = 0.0) 29 | 30 | C = A+B; 31 | 32 | tl.store(Matrix_C+flat_indicies,C,mask=valid_mask) 33 | 34 | 35 | def test_addMatrix(): 36 | sizeX = 8 37 | sizeY = 8 38 | BLOCK_SIZE = 2 39 | 40 | Matrix_A = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32) 41 | Matrix_B = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32) 42 | Matrix_C = torch.zeros_like(Matrix_A, device='cuda', dtype=torch.float32) 43 | 44 | Matrix_A_flat = Matrix_A.flatten() 45 | Matrix_B_flat = Matrix_B.flatten() 46 | Matrix_C_flat = Matrix_C.flatten() 47 | 48 | grid = (triton.cdiv(sizeX, BLOCK_SIZE), triton.cdiv(sizeY, BLOCK_SIZE)) 49 | addMatrix[grid](Matrix_A_flat, Matrix_B_flat, Matrix_C_flat, sizeX, sizeY, BLOCK_SIZE) 50 | 51 | Matrix_C = Matrix_C_flat.reshape(sizeY, sizeX) 52 | 53 | expected = Matrix_A + Matrix_B 54 | print("Matrix A:\n", Matrix_A) 55 | print("Matrix B:\n", Matrix_B) 56 | print("Matrix C (Triton):\n", Matrix_C) 57 | print("Expected (PyTorch):\n", expected) 58 | assert torch.allclose(Matrix_C, expected), "Triton result does not match PyTorch result!" 
59 | 60 | test_addMatrix() 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /day03/anotherMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __device__ float randomFunction(float x, float y) 5 | { 6 | return x + y * 2; 7 | } 8 | 9 | __global__ void matrixFunction(const float *A, const float *B, float *C, const int size) 10 | { 11 | int i = blockIdx.x * blockDim.x + threadIdx.x; 12 | int j = blockIdx.y * blockDim.y + threadIdx.y; 13 | 14 | if (i < size && j < size) 15 | { 16 | C[i + size * j] = randomFunction(A[i + size * j], B[i + size * j]); 17 | } 18 | } 19 | 20 | int main() 21 | { 22 | int N = 8; 23 | int BLOCK_SIZE = 2; 24 | dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE); 25 | dim3 gridDim(N + BLOCK_SIZE - 1 / BLOCK_SIZE, N + BLOCK_SIZE - 1 / BLOCK_SIZE); 26 | int size = sizeof(float) * N * N; 27 | 28 | float *A,*B,*C; 29 | float *dA,*dB,*dC; 30 | A = new float[N*N]; 31 | B = new float[N*N]; 32 | C = new float[N*N]; 33 | 34 | cudaMalloc((void**)&dA,size); 35 | cudaMalloc((void**)&dB,size); 36 | cudaMalloc((void**)&dC,size); 37 | 38 | for (int i = 0; i < N; ++i) { 39 | for (int j = 0; j < N; ++j) { 40 | A[i + N * j] = 1.0f; 41 | B[i + N * j] = 2.0f; 42 | } 43 | } 44 | 45 | cudaMemcpy(dA,A,size,cudaMemcpyHostToDevice); 46 | cudaMemcpy(dB,B,size,cudaMemcpyHostToDevice); 47 | 48 | // now we have everything set up 49 | matrixFunction<<>>(dA,dB,dC,N); 50 | cudaDeviceSynchronize(); 51 | 52 | cudaMemcpy(C,dC,size,cudaMemcpyDeviceToHost); 53 | 54 | for (int i = 0; i < N*N; i++) { 55 | std::cout << C[i] << " "; 56 | if ((i + 1) % N == 0) std::cout << std::endl; 57 | } 58 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | __global__ void addKernel(T* input, int arraySize) { 6 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 7 | if (idx < arraySize) { 8 | input[idx] += 10; 9 | } 10 | } 11 | 12 | void addition(torch::Tensor& input, int arraySize) { 13 | int threads_per_block = 256; 14 | int blocks = (arraySize + threads_per_block - 1) / threads_per_block; 15 | 16 | AT_DISPATCH_FLOATING_TYPES(input.type(), "arrayAddition", [&]() { 17 | addKernel<<>>(input.data_ptr(), arraySize); 18 | }); 19 | cudaDeviceSynchronize(); 20 | 21 | auto err = cudaGetLastError(); 22 | if (err != cudaSuccess) { 23 | TORCH_CHECK(false, "CUDA error: ", cudaGetErrorString(err)); 24 | } 25 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: additionKernel 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | additionKernel.cu 2 | 
additionKernelBinding.cpp 3 | setup.py 4 | additionKernel.egg-info/PKG-INFO 5 | additionKernel.egg-info/SOURCES.txt 6 | additionKernel.egg-info/dependency_links.txt 7 | additionKernel.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernel.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | additionKernel 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionKernelBinding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void addition(torch::Tensor& input, int arraySize); 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("addition", &addition, "Adds 10 to each element of the tensor"); 7 | } -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: additionkernel 3 | Version: 0.0.0 4 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | additionKernel.cu 2 | additionKernelBinding.cpp 3 | setup.py 4 | additionkernel.egg-info/PKG-INFO 5 | additionkernel.egg-info/SOURCES.txt 6 | additionkernel.egg-info/dependency_links.txt 7 | additionkernel.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/additionkernel.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | additionkernel 2 | -------------------------------------------------------------------------------- /day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o -------------------------------------------------------------------------------- /day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o -------------------------------------------------------------------------------- /day06/AdditionKernel/pythontest.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import additionkernel 3 | 4 | input_tensor = torch.randn(100).cuda() 5 | additionkernel.addition(input_tensor, input_tensor.size(0)) 6 | print("Result after addition:", input_tensor) -------------------------------------------------------------------------------- /day06/AdditionKernel/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='additionkernel', 6 | ext_modules=[ 7 | CUDAExtension( 8 | name='additionkernel', 9 | sources=[ 10 | 'additionKernelBinding.cpp', 11 | 'additionKernel.cu', 12 | ] 13 | ) 14 | ], 15 | cmdclass={ 16 | 'build_ext': BuildExtension 17 | } 18 | ) -------------------------------------------------------------------------------- /day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o -------------------------------------------------------------------------------- /day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: example_kernels 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | rollcall.cu 2 | rollcallbinding.cpp 3 | setup.py 4 | example_kernels.egg-info/PKG-INFO 5 | example_kernels.egg-info/SOURCES.txt 6 | example_kernels.egg-info/dependency_links.txt 7 | example_kernels.egg-info/top_level.txt -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /day06/ImportingToPython/example_kernels.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | example_kernels 2 | -------------------------------------------------------------------------------- /day06/ImportingToPython/pythontest.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import example_kernels 3 | example_kernels.rollcall() -------------------------------------------------------------------------------- /day06/ImportingToPython/rollcall.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void roll_call_kernel() { 5 | const int threadIndex = threadIdx.x; 6 | printf("Thread %d here!\n", threadIndex); 7 | printf("Te iubesc atat de mult: %d \n",threadIndex*1000); 8 | } 9 | 10 | void roll_call_launcher() { 11 | roll_call_kernel<<<1, 5>>>(); 12 | cudaDeviceSynchronize(); 13 | } 14 | 15 | int main() { 16 | roll_call_launcher(); 17 | return 0; 18 | } -------------------------------------------------------------------------------- /day06/ImportingToPython/rollcallbinding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void roll_call_launcher(); 5 | 6 | void roll_call_binding(){ 7 | roll_call_launcher(); 8 | } 9 | 10 | PYBIND11_MODULE(example_kernels, m) { 11 | m.def( 12 | "rollcall", // Name of the Python function to create 13 | &roll_call_binding, // Corresponding C++ function to call 14 | "Launches the roll_call kernel" // Docstring 15 | ); 16 | } 17 | -------------------------------------------------------------------------------- /day06/ImportingToPython/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | __version__ = "0.0.1" 5 | 6 | ext_modules = [ 7 | CUDAExtension('example_kernels', 8 | [ 9 | 'rollcallbinding.cpp', 10 | 'rollcall.cu', 11 | ]) 12 | ] 13 | 14 | setup( 15 | name="example_kernels", 16 | version=__version__, 17 | ext_modules=ext_modules, 18 | cmdclass={"build_ext": BuildExtension} 19 | ) 20 | 
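[Usage sketch, not a file from the repository: the day06 AdditionKernel and ImportingToPython directories above are standard torch.utils.cpp_extension CUDAExtension packages, so they are normally compiled in place (e.g. "python setup.py build_ext --inplace" or "python setup.py install", per setup.py) before pythontest.py is run. The snippet below is a minimal, assumed calling pattern mirroring pythontest.py; the module and function names come from PYBIND11_MODULE / setup.py, everything else is an assumption.]

    import torch              # import torch first so its shared libraries are loaded
    import example_kernels    # module name declared in rollcallbinding.cpp / setup.py

    example_kernels.rollcall()   # launches roll_call_kernel<<<1, 5>>> and synchronizes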
-------------------------------------------------------------------------------- /day06/SMBlocks.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void sm_roll_call() { 5 | const int threadIndex = threadIdx.x; 6 | 7 | uint streamingMultiprocessorId; 8 | asm("mov.u32 %0, %smid;" : "=r"(streamingMultiprocessorId) ); 9 | 10 | printf("Thread %d running on SM %d!\n", threadIndex, streamingMultiprocessorId); 11 | } 12 | 13 | int main() { 14 | sm_roll_call<<<4, 2>>>(); 15 | cudaDeviceSynchronize(); 16 | return 0; 17 | } -------------------------------------------------------------------------------- /day06/SoftMax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void SoftMaxNaive(float *input,float *output,int size){ 5 | int numThreads = blockDim.x; 6 | 7 | 8 | //each thread to compute softmax for this: 9 | int numElementsPerThread = size/numThreads; 10 | 11 | int threadIndex = threadIdx.x; 12 | 13 | int startIndex = threadIndex * numElementsPerThread; 14 | int endIndex = min(size,startIndex* numElementsPerThread); 15 | 16 | 17 | float MaxValue = 0.0; 18 | for (int i = 0; i < size; i++) { 19 | if (input[i] > MaxValue) { 20 | MaxValue = input[i]; 21 | } 22 | } 23 | 24 | float sumExp = 0.0; 25 | for(int i =0;i MaxValue) { 47 | MaxValue = input[i]; 48 | } 49 | } 50 | SharedMaxValue[threadIndex] = MaxValue; 51 | __syncthreads(); 52 | for (int i = 0; i < numThreads; i++) { 53 | if (SharedMaxValue[i] > MaxValue) { 54 | MaxValue = SharedMaxValue[i]; 55 | } 56 | } 57 | 58 | 59 | /// Now we need to calcualte the SumExp 60 | __shared__ float sharedSumExp[numThreads]; 61 | float sumExp = 0.0; 62 | for(int i =startIndex;i 2 | #include 3 | 4 | __global__ void transposeKernel(int *A, int *B) 5 | { 6 | const int idx = threadIdx.x + threadIdx.y * blockDim.x; 7 | // threadIDx.x -> id of the row 8 | // threadIdx.y -> id of the collumn 9 | // BlockDim.x -> the size of the Dimension of the row 10 | // So we will get the idx to be on the element in the flattned matrix 11 | 12 | // 1 2 3 1 2 5 13 | // 2 3 4 -> 2 3 2 14 | // 5 2 1 3 4 1 15 | const int outidx = threadIdx.y + threadIdx.x * blockDim.y; 16 | B[outidx] = A[idx]; 17 | } 18 | 19 | int main() 20 | { 21 | int rows = 3; 22 | int cols = 3; 23 | int sizeMatrix = rows * cols; 24 | int *Matrix = (int *)malloc(sizeof(int) * cols * rows); 25 | for (int i = 0; i < sizeMatrix; i++) 26 | { 27 | Matrix[i] = i; 28 | } 29 | for (int i = 0; i < sizeMatrix; i++) 30 | { 31 | std::cout << Matrix[i] << " "; 32 | if (i % cols == cols - 1) 33 | std::cout << std::endl; 34 | } 35 | 36 | int *MatrixD, *MatrixOut; 37 | cudaMalloc((void **)&MatrixD, sizeMatrix * sizeof(int)); 38 | cudaMalloc((void **)&MatrixOut, sizeMatrix * sizeof(int)); 39 | cudaMemcpy(MatrixD, Matrix, sizeMatrix * sizeof(int), cudaMemcpyHostToDevice); 40 | 41 | dim3 numThreadsPerBlock(rows, cols); 42 | 43 | cudaFuncSetAttribute( 44 | transposeKernel, 45 | cudaFuncAttributePreferredSharedMemoryCarveout, 46 | 20 // Use 20% of combined L1/Shared Memory for Shared Memory 47 | ); 48 | transposeKernel<<<1, numThreadsPerBlock>>>(MatrixD, MatrixOut); 49 | 50 | cudaMemcpy(Matrix, MatrixOut, sizeMatrix * sizeof(float), cudaMemcpyDeviceToHost); 51 | std::cout << "\nTransposed\n"; 52 | for (int i = 0; i < sizeMatrix; i++) 53 | { 54 | std::cout << Matrix[i] << " "; 55 | if (i % rows == rows - 1) 56 | std::cout << std::endl; 57 | } 58 | 59 | cudaFree(MatrixD); 60 | 
cudaFree(MatrixOut); 61 | free(Matrix); 62 | 63 | return 0; 64 | } -------------------------------------------------------------------------------- /day06/note: -------------------------------------------------------------------------------- 1 | I will work more on this day to surprise my biggest supporter in this Journey :D 2 | I will start wit this tutorial : https://tinkerd.net/blog/machine-learning/cuda-basics/ 3 | And later this day will continue with working on the softmax forward + backward -------------------------------------------------------------------------------- /day07/conv1d.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void conv1D(float *X, float *K, float *Y, int input_size, int kernel_size) 5 | { 6 | 7 | extern __shared__ float shared[]; 8 | 9 | int i = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | int radius = kernel_size / 2; 12 | 13 | int sharedIdx = threadIdx.x + radius; // the main element from the conv 14 | // index will start from the radius so that we have left 2 more behind use 15 | /// SO we load in the share memory all the elements our filter will work on the block 16 | if (threadIdx.x < blockDim.x - radius) 17 | { 18 | int left = i - radius; 19 | int right = i + blockDim.x; 20 | 21 | shared[threadIdx.x] = (left >= 0) ? X[left] : 0.0f; 22 | shared[sharedIdx + blockDim.x] = (right < input_size) ? X[right] : 0.0f; 23 | } 24 | 25 | __syncthreads(); 26 | 27 | float sum = 0.0; 28 | for (int j = -radius; j <= radius; j++) 29 | { 30 | sum += shared[sharedIdx + j] * K[radius + j]; 31 | // we iterate from -2 to 2 . so we have -2 -1 0 1 2. Which is normal 32 | // So we have this: 33 | } 34 | 35 | if (i < input_size) 36 | { 37 | Y[i] = sum; 38 | } 39 | } 40 | 41 | int main() 42 | { 43 | int N = 1024; // size of the vector 44 | int BlockSize = 256; // size of the block we use 45 | int GridSize = (N + BlockSize - 1) / BlockSize; // size of the grid we use. 
Also ceil function 46 | 47 | int KernelSize = 5; 48 | float Kernel[KernelSize] = {1.0f, 2.0f, 1.0f, 1.0f, -2.0f}; 49 | int radius = KernelSize / 2; 50 | int SharedMemory = (BlockSize + 2 * radius) * sizeof(float); 51 | 52 | float *Xcpu, *Ycpu; 53 | float *Xgpu, *Ygpu, *Kgpu; 54 | 55 | Xcpu = (float *)malloc(N * sizeof(float)); 56 | Ycpu = (float *)malloc(N * sizeof(float)); 57 | // we already have declared our kernel; 58 | 59 | for (int i = 0; i < N; i++) 60 | { 61 | Xcpu[i] = 1; 62 | } 63 | 64 | // now lets launch this data in the air baby 65 | cudaMalloc((void **)&Xgpu, N * sizeof(float)); 66 | cudaMalloc((void **)&Ygpu, N * sizeof(float)); 67 | cudaMalloc((void **)&Kgpu, KernelSize * sizeof(float)); 68 | cudaMemcpy(Xgpu, Xcpu, N * sizeof(float), cudaMemcpyHostToDevice); 69 | cudaMemcpy(Kgpu, Kernel, KernelSize * sizeof(float), cudaMemcpyHostToDevice); 70 | 71 | conv1D<<>>(Xgpu, Kgpu, Ygpu, N, KernelSize); 72 | 73 | cudaMemcpy(Ycpu, Ygpu, N * sizeof(float), cudaMemcpyDeviceToHost); 74 | 75 | std::cout << "First 10 elements " << std::endl; 76 | for (size_t i = 0; i < 10; i++) 77 | { 78 | std::cout << Xcpu[i] << " "; 79 | } 80 | 81 | std::cout << "\nFirst 10 elements after the convolution op" << std::endl; 82 | for (size_t i = 0; i < 10; i++) 83 | { 84 | std::cout << Ycpu[i] << " "; 85 | } 86 | 87 | free(Xcpu); 88 | free(Ycpu); 89 | cudaFree(Xgpu); 90 | cudaFree(Ygpu); 91 | cudaFree(Kgpu); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /day07/globalMemoryCoalescing.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) 5 | int M = 10; 6 | int N = 10; 7 | 8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1); 9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1 10 | 11 | __global__ void sgemm_naive(int M, int N, int K, float alpha, 12 | const float *A, const float *B, float beta, float *C) 13 | { 14 | const int x = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE); 15 | const int y = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE); 16 | 17 | if (x < M && y 2 | #include 3 | #define BLOCK_SIZE 32 4 | 5 | __global__ void matmulKernel(float *A, float *B, float *C, int dim) 6 | { 7 | int i, j; // i and j indexes 8 | float temp = 0; // temp value 9 | 10 | int row = blockIdx.y * blockDim.y + threadIdx.y; 11 | int col = blockIdx.x * blockDim.x + threadIdx.x; 12 | 13 | __shared__ float ASharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory for shared 14 | __shared__ float BSharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory fro shared 15 | 16 | for (int tileNUM = 0; tileNUM < gridDim.x; tileNUM++) 17 | { 18 | j = tileNUM * BLOCK_SIZE + threadIdx.x; 19 | i = tileNUM * BLOCK_SIZE + threadIdx.y; 20 | 21 | ASharedT[threadIdx.y][threadIdx.x] = A[i * dim + j]; 22 | BSharedT[threadIdx.y][threadIdx.x] = B[i * dim + j]; 23 | 24 | __syncthreads(); 25 | 26 | for (int k = 0; k < BLOCK_SIZE; k++) 27 | { 28 | temp += ASharedT[threadIdx.y][k] * BSharedT[k][threadIdx.x]; 29 | } 30 | 31 | __syncthreads(); 32 | } 33 | C[row * dim + col] = temp; 34 | } 35 | 36 | int main() 37 | { 38 | int N = 1024; 39 | float *Acpu, *Bcpu, *Ccpu; 40 | float *Agpu, *Bgpu, *Cgpu; 41 | 42 | Acpu = (float *)malloc(N * N * sizeof(float)); 43 | Bcpu = (float *)malloc(N * N * sizeof(float)); 44 | Ccpu = (float *)malloc(N * N * sizeof(float)); 45 | 46 | for (int i = 0; i < N * N; i++) 47 | { 48 | Acpu[i] = sin(i); 49 | Bcpu[i] = cos(i); 50 | } 51 | 52 | size_t 
vectorSize = N * N * sizeof(float); 53 | 54 | cudaMalloc((void **)&Agpu, vectorSize); 55 | cudaMalloc((void **)&Bgpu, vectorSize); 56 | cudaMalloc((void **)&Cgpu, vectorSize); 57 | cudaMemcpy(Agpu, Acpu, vectorSize, cudaMemcpyHostToDevice); 58 | cudaMemcpy(Bgpu, Bcpu, vectorSize, cudaMemcpyHostToDevice); 59 | 60 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 61 | dim3 gridDim(N / BLOCK_SIZE, N / BLOCK_SIZE); 62 | 63 | cudaEvent_t start, stop; 64 | cudaEventCreate(&start); 65 | cudaEventCreate(&stop); 66 | cudaEventRecord(start, 0); 67 | 68 | matmulKernel<<>>(Agpu, Bgpu, Cgpu, N); 69 | 70 | cudaEventRecord(stop, 0); 71 | cudaEventSynchronize(stop); 72 | float et; 73 | cudaEventElapsedTime(&et, start, stop); 74 | cudaEventDestroy(start); 75 | cudaEventDestroy(stop); 76 | 77 | cudaMemcpy(Ccpu, Cgpu, vectorSize, cudaMemcpyDeviceToHost); 78 | 79 | printf("GPU time= %f ms\n", et); 80 | 81 | free(Acpu); 82 | free(Bcpu); 83 | free(Ccpu); 84 | cudaFree(Agpu); 85 | cudaFree(Bgpu); 86 | cudaFree(Cgpu); 87 | 88 | return 0; 89 | } -------------------------------------------------------------------------------- /day07/naive.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) 5 | int M = 10; 6 | int N = 10; 7 | 8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1); 9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1 10 | 11 | __global__ void sgemm_naive(int M, int N, int K, float alpha, 12 | const float *A, const float *B, float beta, float *C) 13 | { 14 | const uint x = blockIdx.x * blockDim.x + threadIdx.x; 15 | const uint y = blockIdx.y * blockDim.y + threadIdx.y; 16 | 17 | if (x < M && y 2 | #include 3 | -------------------------------------------------------------------------------- /day08/pmpbook/chapter3matvecmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CUDA_CHECK(err) \ 5 | { \ 6 | cuda_assert((err), __FILE__, __LINE__); \ 7 | } 8 | inline void cuda_assert(cudaError_t code, const char *file, int line) 9 | { 10 | if (code != cudaSuccess) 11 | { 12 | std::cerr << "CUDA Error: " << cudaGetErrorString(code) 13 | << " in " << file << ":" << line << std::endl; 14 | exit(1); 15 | } 16 | } 17 | 18 | __global__ void matrixveckernel(const float *A,const float*b,float*C,const int N){ 19 | // N the size of the NxN A matrix 20 | // N aslo the size of the vector 21 | // we need so that each thread will iterate the row 22 | 23 | int i = blockIdx.x * blockDim.x + threadIdx.x; 24 | // we got 25 | 26 | if(i>>(dA,db,dc,N); 53 | CUDA_CHECK(cudaGetLastError()); 54 | 55 | CUDA_CHECK(cudaMemcpy(c,dc,sizeb,cudaMemcpyDeviceToHost)); 56 | 57 | 58 | CUDA_CHECK(cudaFree(dA)); 59 | CUDA_CHECK(cudaFree(db)); 60 | CUDA_CHECK(cudaFree(dc)); 61 | 62 | } 63 | 64 | int main(){ 65 | int N = 1024; 66 | float *A = new float[N * N]; 67 | float *b = new float[N]; 68 | 69 | for(int i = 0 ;i 2 | #include 3 | 4 | __global__ void color2graykernel(const float* R, const float*G,const float*B,float *O,const int n){ 5 | // assume the matrix is nxn; 6 | 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; // so this will be for collumns 8 | int j = blockIdx.y * blockDim.y + threadIdx.y; // this will be for rows 9 | 10 | 11 | if( i>>(d_r,d_g,d_b,d_o,n); 34 | 35 | float *O = (float*)malloc(size); 36 | cudaMemcpy(O,d_o,size,cudaMemcpyDeviceToHost); 37 | 38 | cudaFree(d_r); 39 | cudaFree(d_g); 40 | cudaFree(d_b); 41 | cudaFree(d_o); 42 | 43 | return O; 44 | } 
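[Reference check, not a file from the repository: a quick host-side way to sanity-check the color2gray kernel above. The 0.21/0.72/0.07 luminosity weights are an assumption — substitute whatever constants the kernel actually uses; "gray" stands for the n*n output buffer copied back from the device.]

    import torch

    n = 64
    R, G, B = (torch.rand(n, n) for _ in range(3))      # host copies of the three channels
    expected = 0.21 * R + 0.72 * G + 0.07 * B            # assumed luminosity weights
    # compare against the kernel result once it has been copied back and reshaped:
    # assert torch.allclose(gray.reshape(n, n), expected, atol=1e-5)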
-------------------------------------------------------------------------------- /day08/pmpbook/deviceinfo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | int dev_count; 7 | cudaGetDeviceCount(&dev_count); 8 | std::cout << "Devices are : " << dev_count << std::endl; 9 | 10 | cudaDeviceProp dev_prop; 11 | for (int i = 0; i < dev_count; ++i) 12 | { 13 | cudaGetDeviceProperties(&dev_prop, i); 14 | } 15 | std::cout << "Max Threads per Block : " << dev_prop.maxThreadsPerBlock << std::endl; 16 | std::cout << "Max Threads per MultiProcessor :" << dev_prop.maxThreadsPerMultiProcessor << std::endl; 17 | std::cout << "Max Blocks per MultiProcessor : " << dev_prop.maxBlocksPerMultiProcessor << std::endl; 18 | std::cout << "Clock rate : " << dev_prop.clockRate << std::endl; 19 | std::cout << "Max Grid Size (X,Y,Z) : (" << dev_prop.maxGridSize[0] << "," << dev_prop.maxGridSize[1] << "," << dev_prop.maxGridSize[2] << ")" << std::endl; 20 | std::cout << "Max Threads Dim (X,Y,Z) : (" << dev_prop.maxThreadsDim[0] << "," << dev_prop.maxThreadsDim[1] << "," << dev_prop.maxThreadsDim[2] << ")" << std::endl; 21 | std::cout << "Max Shared Memory per Block : " << dev_prop.sharedMemPerBlock << std::endl; 22 | std::cout << "Max Shared Memory per MultiProcessor : " << dev_prop.sharedMemPerMultiprocessor << std::endl; 23 | std::cout << "Max Registers per Block : " << dev_prop.regsPerBlock << std::endl; 24 | std::cout << "Max Registers per MultiProcessor : " << dev_prop.regsPerMultiprocessor << std::endl; 25 | std::cout << "Warp Size : " << dev_prop.warpSize << std::endl; 26 | std::cout << "Max Threads per Warp : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl; 27 | std::cout << "Max Warps per MultiProcessor : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl; 28 | std::cout << "Max Warps per Block : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize << std::endl; 29 | std::cout << "Max Warps per Grid : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] << std::endl; 30 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl; 31 | std::cout << "Max Blocks per Device : " << dev_prop.maxBlocksPerMultiProcessor * dev_prop.multiProcessorCount << std::endl; 32 | std::cout << "Max Threads per Device : " << dev_prop.maxThreadsPerBlock * dev_prop.multiProcessorCount << std::endl; 33 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl; 34 | 35 | } -------------------------------------------------------------------------------- /day08/pmpbook/imageblur.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void imageblurkernel(const float *A, float *C, const int sizeArray, const int sizeKernel) 5 | { 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | int j = blockIdx.y * blockDim.y + threadIdx.y; 8 | 9 | int radius = sizeKernel / 2; 10 | 11 | // 1 2 3 2 12 | // 4 5 6 2 13 | // 1 2 3 2 14 | // 5 6 7 2 15 | // 16 | // Sow we lets say we are at index = 1 first element 17 | // we need now to do this : 18 | // we only 
use the blur when if it dosnt overflow 19 | if (i < sizeArray && j < sizeArray) 20 | { 21 | float PixelValue = 0.0; 22 | int pixels = 0; 23 | for (int blurRow = -radius; i <= radius; i++) 24 | { 25 | for (int blurCol = -radius; j <= radius; j++) 26 | { 27 | // so now we are in the kernel 28 | int curRow = i + blurRow; 29 | int curCol = j + blurCol; 30 | 31 | if (curRow < 0 || curRow >= sizeArray || curCol < 0 || curCol >= sizeArray) 32 | { 33 | PixelValue += A[curRow * sizeArray + curCol]; 34 | pixels++; 35 | } 36 | } 37 | } 38 | C[sizeArray * j + i] = PixelValue / pixels; 39 | } 40 | } -------------------------------------------------------------------------------- /day08/pmpbook/vecaddition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | //// CHAPTER 2 DONE 5 | __global__ void addkernel(float *a, float *b, float *c, int N) 6 | { 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (i < N) 9 | { 10 | c[i] = a[i] + b[i]; 11 | } 12 | } 13 | 14 | 15 | void vecAdd(float *A, float *B, float*C,int n){ 16 | int size = n*sizeof(float); 17 | float *d_A, *d_B, *d_C; 18 | 19 | cudaMalloc((void**)&d_A, size); 20 | cudaMalloc((void**)&d_B, size); 21 | cudaMalloc((void**)&d_C, size); 22 | 23 | cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice); 24 | cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice); 25 | 26 | dim3 dimGrid(ceil(n/256.0),1,1); 27 | dim3 dimBlock(256,1,1); 28 | addkernel<<>>(d_A, d_B, d_C, n); 29 | // launches a gri of 4 blocks with 256 threads per block 30 | 31 | cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost); 32 | cudaFree(d_A); 33 | cudaFree(d_B); 34 | cudaFree(d_C); 35 | } -------------------------------------------------------------------------------- /day09/bind.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor forward(torch::Tensor Q, torch::Tensor K, torch::Tensor V); 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("forward", torch::wrap_pybind_function(forward), "forward"); 7 | } -------------------------------------------------------------------------------- /day09/test.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | from torch.utils.cpp_extension import load 6 | 7 | print("LOADING FLASH ATTENTION") 8 | minimal_attn = load(name='minimal_attn', sources=['bind.cpp', 'flashAttentionFromTut.cu'], extra_cuda_cflags=['-O2']) 9 | print("LOADED FLASH ATTENTION") 10 | 11 | batch_size = 16 12 | n_head = 12 13 | seq_len = 64 14 | head_embd = 64 15 | 16 | q = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 17 | k = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 18 | v = torch.randn(batch_size, n_head, seq_len, head_embd).cuda() 19 | 20 | print('=== profiling manual attention ===') 21 | 22 | # Our minimal flash attention aims to be faster than this by avoiding HBM read/writes of N^2 matrices. 
23 | def manual_attn(q, k, v): 24 | att = (q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.size(-1)))) 25 | att = F.softmax(att, dim=-1) 26 | y = att @ v 27 | return y 28 | 29 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 30 | manual_result = manual_attn(q, k, v) 31 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10)) 32 | 33 | print('=== profiling minimal flash attention === ') 34 | 35 | with torch.autograd.profiler.profile(use_cuda=True) as prof: 36 | minimal_result = minimal_attn.forward(q, k, v) 37 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10)) 38 | 39 | print('attn values sanity check:', torch.allclose(minimal_result, manual_result, rtol=0, atol=1e-02)) -------------------------------------------------------------------------------- /day10/FlashAttention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void FlashAttention(torch::Tensor &Q, 5 | torch::Tensor &K, 6 | torch::Tensor &V, 7 | torch::Tensor &O, 8 | torch::Tensor &m, 9 | torch::Tensor &l, 10 | const int seq_len, 11 | const int head_dim, 12 | int Tc, int Tr, int Bc, int Br); 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("FlashAttention", &FlashAttention, "FlashAttention forward"); 16 | } -------------------------------------------------------------------------------- /day10/linking/simpleKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void cuda_simpleKernel(float *A); 5 | 6 | void simpleKernel(at::Tensor A) { 7 | cuda_simpleKernel(A.data_ptr()); 8 | } 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("simplekernel", &simpleKernel, "A simple kernel (CUDA)"); 12 | } 13 | -------------------------------------------------------------------------------- /day10/linking/simpleKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "ATen/ATen.h" 4 | 5 | template 6 | __global__ void simpleKernel(T* A) { 7 | A[threadIdx.x] += 100; 8 | } 9 | 10 | void cuda_simpleKernel(float *A ) { 11 | dim3 blocks(1); 12 | simpleKernel<<>>(A); 13 | } -------------------------------------------------------------------------------- /day10/linking/test.py: -------------------------------------------------------------------------------- 1 | from torch.utils.cpp_extension import load 2 | 3 | simplekernel = load( 4 | name='simplekernel', 5 | sources=['simpleKernel.cpp', 'simpleKernel.cu'], 6 | verbose=True 7 | ) 8 | 9 | # Test kernel 10 | import torch 11 | A = torch.zeros(32, device='cuda', dtype=torch.float32) 12 | simplekernel.simplekernel(A) 13 | print(A) 14 | -------------------------------------------------------------------------------- /day10/ppmbook/matrixmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void matrixmulkernel(float *M,float *N,float *P,int width) 4 | { 5 | int row = blockIdx.y * blockDim.y + threadIdx.y; 6 | int col = blockIdx.x * blockDim.x + threadIdx.x; 7 | 8 | if(row < width && col 2 | #include 3 | using namespace nvcuda; 4 | 5 | // Block layout: one block per (batch, head) 6 | template 7 | __global__ void delta_net_attention( 8 | const half* __restrict__ K, // [B, S, D] 9 | const half* __restrict__ V, // [B, S, D] 10 | const half* __restrict__ Q, // [B, S, D] 11 | half* __restrict__ O, // [B, S, D] 12 | int batch, int seq_len) 13 | { 14 | 
extern __shared__ half shared_mem[]; // size = D*D 15 | half* S = shared_mem; // state matrix S 16 | int b = blockIdx.x; // batch index 17 | 18 | // Initialize S to zero 19 | for (int idx = threadIdx.x; idx < D*D; idx += blockDim.x) { 20 | S[idx] = __float2half(0.0f); 21 | } 22 | __syncthreads(); 23 | 24 | // Loop over sequence length 25 | for (int t = 0; t < seq_len; ++t) { 26 | // Load k_t and v_t into registers 27 | half k_vec[D], v_vec[D]; 28 | #pragma unroll 29 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 30 | int base = (b*seq_len + t)*D; 31 | k_vec[i] = K[base + i]; 32 | v_vec[i] = V[base + i]; 33 | } 34 | __syncthreads(); 35 | 36 | // S += v_vec * k_vec^T — outer-product update 37 | for (int i = threadIdx.y; i < D; i += blockDim.y) { 38 | #pragma unroll 39 | for (int j = threadIdx.x; j < D; j += blockDim.x) { 40 | int idx = i*D + j; 41 | float s = __half2float(S[idx]); 42 | s += __half2float(v_vec[i]) * __half2float(k_vec[j]); 43 | S[idx] = __float2half(s); 44 | } 45 | } 46 | __syncthreads(); 47 | 48 | // Load q_t and compute o_t = S * q_vec 49 | half q_vec[D]; 50 | #pragma unroll 51 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 52 | int base = (b*seq_len + t)*D; 53 | q_vec[i] = Q[base + i]; 54 | } 55 | __syncthreads(); 56 | 57 | #pragma unroll 58 | for (int i = threadIdx.x; i < D; i += blockDim.x) { 59 | float o = 0.0f; 60 | #pragma unroll 61 | for (int j = 0; j < D; ++j) { 62 | o += __half2float(S[i*D + j]) * __half2float(q_vec[j]); 63 | } 64 | int out_idx = (b*seq_len + t)*D + i; 65 | O[out_idx] = __float2half(o); 66 | } 67 | __syncthreads(); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /day11/FlashTestPytorch/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void CudaFlashAttention(const float *Q, 5 | const float *K, 6 | const float *V, 7 | float *O, 8 | float *m, 9 | float *l, 10 | const int seq_len, 11 | const int head_dim, 12 | const int batch_size, 13 | const int nr_heads); 14 | 15 | torch::Tensor FlashAttention(torch::Tensor Q, 16 | torch::Tensor K, 17 | torch::Tensor V) 18 | { 19 | int batch_size = Q.size(0); 20 | int nr_heads = Q.size(1); 21 | int seq_len = Q.size(2); 22 | int head_dim = Q.size(3); 23 | 24 | torch::Tensor m = torch::full({batch_size, nr_heads, seq_len}, 25 | -std::numeric_limits::infinity(),Q.options()); 26 | torch::Tensor l = torch::zeros({batch_size, nr_heads, seq_len},Q.options()); 27 | 28 | torch::Tensor O = torch::zeros_like(Q); 29 | CudaFlashAttention(Q.data_ptr(), K.data_ptr(), V.data_ptr(), O.data_ptr(), m.data_ptr(), l.data_ptr(), seq_len, head_dim, batch_size, nr_heads); 30 | return O; 31 | } 32 | 33 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 34 | { 35 | m.def("FlashAttention", &FlashAttention, "FlashAttention (CUDA)"); 36 | } -------------------------------------------------------------------------------- /day11/FlashTestPytorch/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | 5 | # Load the custom CUDA extension 6 | sources = ["binding.cpp", "FlashAttention.cu"] 7 | flash_attention = load("flash_attention", sources=sources, verbose=True) 8 | print("Custom CUDA extension loaded.") 9 | 10 | def manual_attention(Q, K, V): 11 | batch_size, num_heads, seq_len, head_dim = Q.shape 12 | 13 | attn_scores = torch.matmul(Q, K.transpose(-2, -1)) # [batch, heads, seq_len, 
seq_len] 14 | scale = 1.0 / (head_dim ** 0.5) 15 | attn_scores = attn_scores * scale 16 | attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1) 17 | output = torch.matmul(attn_weights, V) # [batch, heads, seq_len, head_dim] 18 | return output 19 | def test_flash_attention(): 20 | batch_size = 2 21 | num_heads = 4 22 | seq_len = 128 23 | head_dim = 64 24 | 25 | # Create random input tensors 26 | Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device='cuda') 27 | K = torch.randn_like(Q) 28 | V = torch.randn_like(Q) 29 | 30 | # Warmup runs 31 | for _ in range(3): 32 | _ = flash_attention.FlashAttention(Q, K, V) 33 | _ = manual_attention(Q, K, V) 34 | 35 | # Benchmark custom FlashAttention 36 | custom_times = [] 37 | for _ in range(100): 38 | torch.cuda.synchronize() 39 | start = time.time() 40 | _ = flash_attention.FlashAttention(Q, K, V) 41 | torch.cuda.synchronize() 42 | custom_times.append(time.time() - start) 43 | 44 | # Benchmark manual attention 45 | manual_times = [] 46 | for _ in range(100): 47 | torch.cuda.synchronize() 48 | start = time.time() 49 | _ = manual_attention(Q, K, V) 50 | torch.cuda.synchronize() 51 | manual_times.append(time.time() - start) 52 | 53 | # Get fastest iterations 54 | fastest_custom = min(custom_times) * 1000 # Convert to milliseconds 55 | fastest_manual = min(manual_times) * 1000 56 | 57 | # Print performance results 58 | print("\nPerformance results (fastest iteration):") 59 | print(f"Custom FlashAttention: {fastest_custom:.2f} ms") 60 | print(f"Manual PyTorch attention: {fastest_manual:.2f} ms") 61 | print(f"Speedup factor: {fastest_manual / fastest_custom:.2f}x") 62 | 63 | if __name__ == "__main__": 64 | test_flash_attention() -------------------------------------------------------------------------------- /day11/LeakyReLU.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void leakyreluKernel(float*input,float*output,float slope,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = input[index] < 0 ? input[index]*slope : input[index]; 8 | } 9 | 10 | void CudaLeakyReLU(float *A,float*B,float slope ,int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | leakyreluKernel<<>>(A, B,slope,N); 14 | } -------------------------------------------------------------------------------- /day11/ReLU.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void reluKernel(float*input,float*output,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = input[index] < 0 ? 0 : input[index]; 8 | } 9 | 10 | void CudaReLU(float *A,float*B, int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | reluKernel<<>>(A, B,N); 14 | } 15 | 16 | //========================= 17 | 18 | __global__ void reluKernelBackward(float *input, float *grad_input, float *grad_output, int N){ 19 | int index = blockDim.x * blockIdx.x + threadIdx.x; 20 | if(index < N) 21 | grad_input[index] = input[index] < 0 ? 
0 : grad_output[index]; 22 | } 23 | 24 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N){ 25 | int ThreadsPerBlock = 256; 26 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 27 | reluKernelBackward<<>>(A, Gi, Go, N); 28 | } -------------------------------------------------------------------------------- /day11/SoftMax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | inline int prevPow2(int n) { 4 | if (n == 0) return 0; 5 | int prev = 1; 6 | while (prev <= n/2) { 7 | prev *= 2; 8 | } 9 | return prev; 10 | } 11 | 12 | __global__ void softmaxKernel(float *input, float *output, int Dim) { 13 | int batch_idx = blockIdx.x; // Current batch index 14 | int tid = threadIdx.x; // Thread index within the block 15 | 16 | extern __shared__ float shared_data[]; 17 | float max_val = -INFINITY; 18 | for (int i = tid; i < Dim; i += blockDim.x) { 19 | max_val = fmaxf(max_val, input[batch_idx * Dim + i]); 20 | } 21 | 22 | shared_data[tid] = max_val; 23 | __syncthreads(); 24 | 25 | // Reduction for max_val 26 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 27 | if (tid < stride) { 28 | shared_data[tid] = fmaxf(shared_data[tid], shared_data[tid + stride]); 29 | } 30 | __syncthreads(); 31 | } 32 | max_val = shared_data[0]; 33 | 34 | float sum_exp = 0.0f; 35 | for (int i = tid; i < Dim; i += blockDim.x) { 36 | output[batch_idx * Dim + i] = expf(input[batch_idx * Dim + i] - max_val); 37 | sum_exp += output[batch_idx * Dim + i]; 38 | } 39 | 40 | shared_data[tid] = sum_exp; 41 | __syncthreads(); 42 | 43 | // Reduction for sum_exp 44 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 45 | if (tid < stride) { 46 | shared_data[tid] += shared_data[tid + stride]; 47 | } 48 | __syncthreads(); 49 | } 50 | sum_exp = shared_data[0]; 51 | 52 | for (int i = tid; i < Dim; i += blockDim.x) { 53 | output[batch_idx * Dim + i] /= sum_exp; 54 | } 55 | } 56 | 57 | 58 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) { 59 | int max_threads = min(512, Dim); 60 | int threads = prevPow2(max_threads); 61 | if (threads == 0) threads = 1; // Ensure at least 1 thread 62 | size_t shared_mem_size = threads * sizeof(float); 63 | softmaxKernel<<>>(input, output, Dim); 64 | cudaDeviceSynchronize(); // Ensure kernel completion 65 | } -------------------------------------------------------------------------------- /day11/TanH.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void tanhKernel(float*input,float*output,int N){ 5 | int index = blockDim.x * blockIdx.x + threadIdx.x; 6 | if(index < N) 7 | output[index] = tanhf(input[index]); 8 | } 9 | 10 | void CudaTanH(float *A,float*B, int N){ 11 | int ThreadsPerBlock = 256; 12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock; 13 | tanhKernel<<>>(A, B,N); 14 | } 15 | -------------------------------------------------------------------------------- /day11/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | 5 | void CudaLeakyReLU(float *A,float*B,float slope ,int N); 6 | torch::Tensor LeakyReLU(torch::Tensor A, float slope){ 7 | torch::Tensor B = torch::empty_like(A); 8 | int N = A.numel(); 9 | CudaLeakyReLU(A.data_ptr(),B.data_ptr(),slope,N); 10 | return B; 11 | } 12 | 13 | void CudaReLU(float *A,float*B, int N); 14 | torch::Tensor ReLU(torch::Tensor A){ 15 | torch::Tensor B = 
torch::empty_like(A); 16 | int N = A.numel(); 17 | CudaReLU(A.data_ptr(),B.data_ptr(),N); 18 | return B; 19 | } 20 | 21 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N); 22 | torch::Tensor ReLUBackward(torch::Tensor A, torch::Tensor Go){ 23 | torch::Tensor Gi = torch::empty_like(A); 24 | int N = A.numel(); 25 | CudaReLUBackward(A.data_ptr(),Gi.data_ptr(),Go.data_ptr(),N); 26 | return Go; 27 | } 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) ; 39 | torch::Tensor Softmax(torch::Tensor input) { 40 | int BatchSize = input.size(0); 41 | int Dim = input.size(1); 42 | torch::Tensor output = torch::empty_like(input); 43 | CudaSoftmax(input.data_ptr(), output.data_ptr(), BatchSize, Dim); 44 | return output; 45 | } 46 | 47 | void CudaTanH(float *A,float*B, int N); 48 | torch::Tensor TanH(torch::Tensor A){ 49 | torch::Tensor B = torch::empty_like(A); 50 | int N = A.numel(); 51 | CudaTanH(A.data_ptr(),B.data_ptr(),N); 52 | return B; 53 | } 54 | 55 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 56 | m.def("LeakyReLU", &LeakyReLU, "LeakyReLU (CUDA)"); 57 | m.def("ReLU", &ReLU, "ReLU (CUDA)"); 58 | m.def("ReLUBackward", &ReLUBackward, "ReLU (CUDA)"); 59 | m.def("Softmax", &Softmax, "Softmax (CUDA)"); 60 | m.def("TanH", &TanH, "TanH (CUDA)"); 61 | } -------------------------------------------------------------------------------- /day11/testbackward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | 4 | sources = ["binding.cpp", "ReLU.cu", "SoftMax.cu", "LeakyReLU.cu", "TanH.cu"] 5 | functions = load("functions", sources=sources, verbose=True) 6 | 7 | class CustomReLU(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return functions.ReLU(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad_output): 15 | input, = ctx.saved_tensors 16 | return functions.ReLUBackward(input, grad_output) 17 | 18 | x = torch.tensor([-1.0, -1.0, -1.0, -2.0], device='cuda', requires_grad=True) 19 | 20 | relu = CustomReLU.apply 21 | 22 | y_custom = relu(x) 23 | y_custom.sum().backward() 24 | grad_custom = x.grad.clone() 25 | 26 | x.grad.zero_() 27 | y_pytorch = torch.nn.functional.relu(x) 28 | y_pytorch.sum().backward() 29 | grad_pytorch = x.grad.clone() 30 | 31 | # Compare the gradients 32 | print("Custom ReLU Gradient:", grad_custom) 33 | print("PyTorch ReLU Gradient:", grad_pytorch) 34 | 35 | if torch.allclose(grad_custom, grad_pytorch, atol=1e-6): 36 | print("Gradients match!") 37 | else: 38 | print("Gradients do not match!") 39 | -------------------------------------------------------------------------------- /day12/tileMatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define TILE_WIDTH 32 4 | 5 | __global__ void tileKernel(const float *dM,const float *dN,float *dP,const int Width){ 6 | __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; 7 | __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; 8 | 9 | int bx = blockIdx.x; 10 | int by = blockIdx.y; 11 | int tx = threadIdx.x; 12 | int ty = threadIdx.y; 13 | 14 | int row = by * TILE_WIDTH + ty; 15 | int col = bx * TILE_WIDTH + tx; 16 | 17 | float Pvalue = 0; 18 | for(int i = 0 ; i < TILE_WIDTH/Width ; ++i){ 19 | Mds[ty][tx] = dM[row*Width + i*TILE_WIDTH + tx]; 20 | Nds[ty][tx] = dN[(i*TILE_WIDTH + ty)*Width + col]; 21 | __syncthreads(); 22 | 23 | for(int k = 0 ;k 2 | 3 | 
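// Reference math for the kernels in this file: for a row x of length w,
// rms(x) = sqrt((1/w) * sum_i x_i^2) and out_i = x_i / rms(x).
// The helper below is an added CPU reference sketch (not part of the original
// kernels) for sanity-checking the GPU output; it assumes row-major (h, w)
// input and output.
#include <cmath>
static inline void rms_reference_cpu(const float *in, float *out, int w, int h)
{
    for (int r = 0; r < h; ++r)
    {
        float sum = 0.0f;
        for (int c = 0; c < w; ++c)
        {
            sum += in[r * w + c] * in[r * w + c];
        }
        float rms = std::sqrt(sum / (float)w);
        for (int c = 0; c < w; ++c)
        {
            out[r * w + c] = in[r * w + c] / rms;
        }
    }
}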
// Define the CEILING macro 4 | #define CEILING(x, y) (((x) + (y) - 1) / (y)) 5 | 6 | #define blockdimy 128 7 | 8 | __global__ void RMSKernel1_V1(float *input, float *output, const int w, const int h) 9 | { 10 | int col = blockIdx.x * blockDim.x + threadIdx.x; 11 | int row = blockIdx.y * blockDim.y + threadIdx.y; 12 | 13 | if (row < h && col < w) 14 | { 15 | float sum = 0; 16 | for (int i = 0; i < w; ++i) 17 | { 18 | sum += input[row * w + i] * input[row * w + i]; 19 | } 20 | sum = sqrt((float)1 / w * sum); 21 | 22 | output[row + w * col] = input[row * w + col] / sum; 23 | } 24 | } 25 | 26 | 27 | void RMSV1(float *input, float *output, int w, int h) 28 | { 29 | 30 | dim3 block_size = dim3(32, 32); 31 | dim3 grid_size = dim3(CEILING(w, 32), CEILING(32, h)); 32 | RMSKernel1_V1<<>>(input, output, w, h); 33 | } 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /day13/RMSBetter.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define CEILING(x, y) (((x) + (y) - 1) / (y)) 4 | 5 | #define blockdimy 128 6 | 7 | __device__ float warpReduceSum(float val) 8 | { 9 | for (int offset = 16; offset > 0; offset /= 2) 10 | { 11 | val += __shfl_down_sync(0xffffffff, val, offset, 32); 12 | } 13 | return val; 14 | } 15 | 16 | __global__ void RMSKernel_V2(float *input, float *output, const int w, const int h) 17 | { 18 | int row = blockIdx.x * blockDim.x + threadIdx.x; 19 | int col = blockIdx.y * blockDim.y + threadIdx.y; 20 | 21 | __shared__ float shared_data[32]; 22 | 23 | float sum = 0.0f; 24 | 25 | if (row < h && col < w) 26 | { 27 | float4 val = reinterpret_cast(&input[row * w + col * 4])[0]; 28 | sum += val.x * val.x + val.y * val.y + val.z * val.z + val.w * val.w; 29 | } 30 | __syncthreads(); 31 | 32 | sum = warpReduceSum(sum); 33 | 34 | __syncthreads(); 35 | 36 | if (threadIdx.x % 32 == 0) 37 | { 38 | shared_data[threadIdx.x / 32] = sum; 39 | } 40 | 41 | __syncthreads(); 42 | 43 | if (threadIdx.x == 0) 44 | { 45 | float final_sum = 0.0f; 46 | for (int i = 0; i < blockDim.x / 32; ++i) 47 | { 48 | final_sum += shared_data[i]; 49 | } 50 | output[row] = input[row] / sqrt(final_sum / float(w)); 51 | } 52 | } 53 | 54 | void RMSV2(float *input, float *output, int w, int h) 55 | { 56 | dim3 block_size = dim3(1, 32, 1); 57 | dim3 grid_size = dim3(h, 1, 1); 58 | RMSKernel_V2<<>>(input, output, w, h); 59 | } 60 | -------------------------------------------------------------------------------- /day13/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/ATen.h" 3 | 4 | void RMSV1(float *input, float *output, int w, int h); 5 | 6 | torch::Tensor RMS_V1(torch::Tensor input) 7 | { 8 | auto out = torch::empty_like(input); 9 | int h = input.size(0); 10 | int w = input.size(1); 11 | RMSV1(input.data_ptr(), out.data_ptr(), w, h); 12 | return out; 13 | } 14 | 15 | void RMSV2(float *input, float *output, int w, int h); 16 | torch::Tensor RMS_V2(torch::Tensor input) 17 | { 18 | auto out = torch::empty_like(input); 19 | int h = input.size(0); 20 | int w = input.size(1); 21 | RMSV1(input.data_ptr(), out.data_ptr(), w, h); 22 | return out; 23 | } 24 | 25 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 26 | { 27 | m.def("RMSV1", &RMS_V1, "RMSV1 (CUDA)"); 28 | m.def("RMSV2", &RMS_V2, "RMSV2 (CUDA)"); 29 | } -------------------------------------------------------------------------------- /day13/test.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | from liger_kernel.ops import rms_norm 5 | 6 | def rms_norm(tensor): 7 | return tensor / torch.sqrt(torch.mean(tensor ** 2)) 8 | 9 | sources = ["binding.cpp", "RMS.cu", "RMSBetter.cu"] 10 | RMS = load("RMS", sources=sources, verbose=True) 11 | print("Custom CUDA extension loaded.") 12 | 13 | tensor_sizes = [(1024, 1024), (2048, 2048), (4096, 4096), (8192, 8192)] 14 | 15 | for tensor_size in tensor_sizes: 16 | print("=" * 50) 17 | print("Input Size: ", tensor_size) 18 | print("=" * 50) 19 | input_tensor = torch.randn(tensor_size, device='cuda') 20 | 21 | # PyTorch RMS time and result 22 | pytorch_time = 0 23 | result_pytorch = None 24 | for _ in range(5): 25 | start_time = time.time() 26 | result_pytorch = rms_norm(input_tensor) 27 | pytorch_time += time.time() - start_time 28 | print(f"PyTorch RMS time: {pytorch_time / 6:.6f} seconds") 29 | 30 | # Custom kernel time and result 31 | custom_time = 0 32 | result_custom = None 33 | for _ in range(5): 34 | start_time = time.time() 35 | result_custom = RMS.RMSV2(input_tensor) 36 | custom_time += time.time() - start_time 37 | print(f"Custom kernel time: {custom_time / 6:.6f} seconds") 38 | 39 | # Liger kernel time and result 40 | liger_time = 0 41 | result_liger = None 42 | for _ in range(5): 43 | start_time = time.time() 44 | result_liger = rms_norm(input_tensor) 45 | liger_time += time.time() - start_time 46 | print(f"Liger kernel time: {liger_time / 6:.6f} seconds") 47 | 48 | # Checking if the results are the same 49 | pytorch_custom_diff = torch.max(torch.abs(result_pytorch - result_custom)) 50 | pytorch_liger_diff = torch.max(torch.abs(result_pytorch - result_liger)) 51 | 52 | print(f"Max difference between PyTorch and Custom kernel: {pytorch_custom_diff.item():.6f}") 53 | print(f"Max difference between PyTorch and Liger kernel: {pytorch_liger_diff.item():.6f}") 54 | 55 | # Check if they are numerically close (within tolerance) 56 | are_pytorch_custom_close = torch.allclose(result_pytorch, result_custom, atol=1) # You can adjust the tolerance 57 | are_pytorch_liger_close = torch.allclose(result_pytorch, result_liger, atol=1) # You can adjust the tolerance 58 | 59 | if are_pytorch_custom_close: 60 | print("PyTorch and Custom kernel results are the same!") 61 | else: 62 | print("PyTorch and Custom kernel results are different.") 63 | 64 | if are_pytorch_liger_close: 65 | print("PyTorch and Liger kernel results are the same!") 66 | else: 67 | print("PyTorch and Liger kernel results are different.") 68 | 69 | print("=" * 50 + "\n") 70 | -------------------------------------------------------------------------------- /day14/FA2/helper.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "helper.cuh" 4 | 5 | __device__ float warpReduceMax(float val) { 6 | for (int offset = 16; offset > 0; offset /= 2) { 7 | val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset)); 8 | } 9 | return val; 10 | } -------------------------------------------------------------------------------- /day14/FA2/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_CUH 2 | #define HELPER_CUH 3 | 4 | __device__ float warpReduceMax(float val); 5 | 6 | #endif -------------------------------------------------------------------------------- /day14/FA2/kernels.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef KERNELS_CUH 2 | #define KERNELS_CUH 3 | 4 | __global__ void computeDKernel(const float* dO, const float* O, float* D, int N, int d); 5 | 6 | __global__ void computeSiKernel(const float* Qi, const float* Kj, float* Si, int Br, int Bc, int d, float scale); 7 | 8 | __global__ void findRowMaxSiKernel(float* Si, float* maxSi, int Br, int Bc); 9 | 10 | __global__ void computeSoftmaxKernel(float* Si, float* softmaxSi, int Br, int Bc); 11 | 12 | __global__ void computeAttentionKernel(const float* Q, const float* K, const float* V, float* attention, int N, int d); 13 | 14 | __global__ void computeQKernel(const float* Q, const float* dO, float* dQ, int N, int d); 15 | 16 | __global__ void computeKKernel(const float* K, const float* dO, float* dK, int N, int d); 17 | 18 | __global__ void computeVKernel(const float* V, const float* dO, float* dV, int N, int d); 19 | 20 | __global__ void computeGradientsKernel(const float* dO, float* dQ, float* dK, float* dV, int N, int d); 21 | 22 | #endif -------------------------------------------------------------------------------- /day14/FlashAttention2/kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void baackwardKernel(float *Q, float *K, float *V, float *O, 5 | float *dQ, float *dK, float *dV, float *dO, 6 | float *L, int Bc, int Br, 7 | int batch_size, int N, int nr_heads, int d) 8 | { 9 | int Tr = ceil(N / Br); 10 | int Tc = ceil(N / Bc); 11 | 12 | // Q1 - > size of Br* d size in shared memory 13 | // O1 - > size of Br* d size in shared memory 14 | 15 | // K1 - > size of Bc *d size in shared memory 16 | // V1 - > size of Bc *d size in shared memory 17 | 18 | // L - > size of Br each 19 | 20 | int row = blockIdx.y * blockDim.y + threadIdx.y; 21 | int col = blockIdx.x * blockDim.x + threadIdx.x; 22 | 23 | 24 | } -------------------------------------------------------------------------------- /day14/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day14/cat.jpg -------------------------------------------------------------------------------- /day15/SMM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void spmv_csr_kernel(int num_rows, const float *values, const int *column_indices, const int *row_offsets, const float *x, float *y) { 6 | int row = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (row < num_rows) { 8 | float dot = 0; 9 | for (int i = row_offsets[row]; i < row_offsets[row + 1]; i++) { 10 | dot += values[i] * x[column_indices[i]]; 11 | } 12 | y[row] = dot; 13 | } 14 | } 15 | 16 | void spmv_csr(int num_rows, int nnz, float *h_values, int *h_column_indices, int *h_row_offsets, float *h_x, float *h_y) { 17 | float *d_values; 18 | float*d_x; 19 | float *d_y; 20 | int *d_column_indices; 21 | int *d_row_offsets; 22 | 23 | cudaMalloc(&d_values, nnz * sizeof(float)); 24 | cudaMalloc(&d_column_indices, nnz * sizeof(int)); 25 | cudaMalloc(&d_row_offsets, (num_rows + 1) * sizeof(int)); 26 | cudaMalloc(&d_x, num_rows * sizeof(float)); 27 | cudaMalloc(&d_y, num_rows * sizeof(float)); 28 | 29 | cudaMemcpy(d_values, h_values, nnz * sizeof(float), cudaMemcpyHostToDevice); 30 | cudaMemcpy(d_column_indices, h_column_indices, nnz * sizeof(int), cudaMemcpyHostToDevice); 31 | 
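// CSR layout note: values/column_indices hold the nnz nonzeros in row order,
// and row_offsets has num_rows + 1 prefix-sum entries (hence the +1 sizes
// above and below); e.g. in main() further down, row_offsets {0, 1, 3, 4}
// means row 0 holds 1 nonzero, row 1 holds 2, and row 2 holds 1.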
cudaMemcpy(d_row_offsets, h_row_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice); 32 | cudaMemcpy(d_x, h_x, num_rows * sizeof(float), cudaMemcpyHostToDevice); 33 | 34 | int blockSize = 256; 35 | int gridSize = (num_rows + blockSize - 1) / blockSize; 36 | spmv_csr_kernel<<>>(num_rows, d_values, d_column_indices, d_row_offsets, d_x, d_y); 37 | 38 | cudaMemcpy(h_y, d_y, num_rows * sizeof(float), cudaMemcpyDeviceToHost); 39 | 40 | cudaFree(d_values); 41 | cudaFree(d_column_indices); 42 | cudaFree(d_row_offsets); 43 | cudaFree(d_x); 44 | cudaFree(d_y); 45 | } 46 | 47 | int main() { 48 | int num_rows = 3; 49 | int nnz = 4; 50 | float values[] = {1, 2, 3, 4}; 51 | int column_indices[] = {0, 2, 1, 2}; 52 | int row_offsets[] = {0, 1, 3, 4}; 53 | float x[] = {1, 2, 3}; 54 | float y[3] = {0}; 55 | 56 | spmv_csr(num_rows, nnz, values, column_indices, row_offsets, x, y); 57 | 58 | std::cout << "Rezultat SpMV: "; 59 | for (int i = 0; i < num_rows; i++) { 60 | std::cout << y[i] << " "; 61 | } 62 | std::cout << std::endl; 63 | return 0; 64 | } -------------------------------------------------------------------------------- /day16/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | # Set the seed for reproducibility 5 | torch.manual_seed(42) 6 | 7 | # Define the dimensions 8 | seq_len = 4 9 | dim = 4 10 | 11 | # Initialize the tensors 12 | Q = torch.full((seq_len, dim), 2.0, requires_grad=True) 13 | K = torch.full((seq_len, dim), 2.0, requires_grad=True) 14 | V = torch.full((seq_len, dim), 2.0, requires_grad=True) 15 | # Forward pass 16 | scores = torch.matmul(Q, K.transpose(-2, -1)) / (dim ** 0.5) 17 | P = F.softmax(scores, dim=-1) 18 | O = torch.matmul(P, V) 19 | 20 | # Create a dummy gradient for the output 21 | dO = torch.ones_like(O) 22 | 23 | # Backward pass 24 | O.backward(dO) 25 | 26 | 27 | print("PyTorch O:") 28 | print(O) 29 | # Print the gradients 30 | print("PyTorch dQ:") 31 | print(Q.grad) 32 | print("PyTorch dK:") 33 | print(K.grad) 34 | print("PyTorch dV:") 35 | print(V.grad) -------------------------------------------------------------------------------- /day17/cublas1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | #define n 6 7 | 8 | int main(){ 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x; 15 | x = (float*)malloc(sizeof(float)*n); 16 | for(j = 0 ; j 2 | #include 3 | #include 4 | 5 | #define n 10 6 | 7 | int main() 8 | { 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x, *y; 15 | x = (float *)malloc(sizeof(float) * n); 16 | y = (float *)malloc(sizeof(float) * n); 17 | 18 | for (j = 0; j < n; ++j) 19 | { 20 | x[j] = (float)j; 21 | y[j] = (float)j + 1; 22 | } 23 | 24 | printf("\nx:\n"); 25 | for (j = 0; j < n; ++j) 26 | { 27 | printf("%f ", x[j]); 28 | } 29 | 30 | printf("\ny:\n"); 31 | for (j = 0; j < n; ++j) 32 | { 33 | printf("%f ", y[j]); 34 | } 35 | 36 | float *d_x, *d_y; 37 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float)); 38 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float)); 39 | 40 | stat = cublasCreate(&handle); 41 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1); 42 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1); 43 | float a = 3.0; 44 | 45 | stat = cublasSaxpy(handle, n, &a, d_x, 1, d_y, 1); 46 | stat = cublasGetVector(n, 
sizeof(float), d_y, 1, y, 1); 47 | 48 | printf("\nNew y:\n"); 49 | for (j = 0; j < n; ++j) 50 | { 51 | printf("%f ", y[j]); 52 | } 53 | cudaFree(d_y); 54 | cudaFree(d_x); 55 | cublasDestroy(handle); 56 | free(x); 57 | free(y); 58 | return 0; 59 | } -------------------------------------------------------------------------------- /day17/cublas3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define n 10 6 | 7 | int main() 8 | { 9 | cudaError_t cudaStat; 10 | cublasStatus_t stat; 11 | cublasHandle_t handle; 12 | 13 | int j; 14 | float *x, *y; 15 | x = (float *)malloc(sizeof(float) * n); 16 | y = (float *)malloc(sizeof(float) * n); 17 | for (j = 0; j < n; ++j) 18 | { 19 | x[j] = (float)1; 20 | y[j] = (float)1; 21 | } 22 | 23 | printf("\nx:\n"); 24 | for (j = 0; j < n; ++j) 25 | { 26 | printf("%f ", x[j]); 27 | } 28 | 29 | printf("\ny:\n"); 30 | for (j = 0; j < n; ++j) 31 | { 32 | printf("%f ", y[j]); 33 | } 34 | 35 | float *d_x, *d_y; 36 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float)); 37 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float)); 38 | 39 | stat = cublasCreate(&handle); 40 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1); 41 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1); 42 | float a = 3.0; 43 | 44 | float result; 45 | stat = cublasSdot(handle, n, d_x, 1, d_y, 1, &result); 46 | printf("\ndot product x . y : \n "); 47 | printf (" %7.0f \n " , result ); 48 | 49 | 50 | cudaFree(d_y); 51 | cudaFree(d_x); 52 | cublasDestroy(handle); 53 | free(x); 54 | free(y); 55 | return 0; 56 | } -------------------------------------------------------------------------------- /day18/atomic1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #define N 32 3 | #include 4 | __device__ int lane_id() { 5 | return threadIdx.x & 31; 6 | } 7 | 8 | __device__ int atomicAggInc(int *ptr) { 9 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr); 10 | int leader = __ffs(mask) - 1; 11 | int res; 12 | if (lane_id() == leader) 13 | res = atomicAdd(ptr, __popc(mask)); 14 | res = __shfl_sync(mask, res, leader); 15 | return res + __popc(mask & ((1 << lane_id()) - 1)); 16 | } 17 | 18 | __global__ void test_atomicAggInc(int *d_ptr, int *d_results) { 19 | int old_val = atomicAggInc(d_ptr); 20 | d_results[threadIdx.x] = old_val; 21 | } 22 | 23 | int main() { 24 | int *d_ptr, *d_results; 25 | int h_ptr = 0; 26 | int h_results[N]; 27 | 28 | cudaMalloc(&d_ptr, sizeof(int)); 29 | cudaMalloc(&d_results, N * sizeof(int)); 30 | 31 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice); 32 | 33 | test_atomicAggInc<<<1, N>>>(d_ptr, d_results); 34 | 35 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); 36 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost); 37 | 38 | printf("Final value of ptr: %d\n", h_ptr); 39 | printf("Old values returned by each thread:\n"); 40 | for (int i = 0; i < N; i++) { 41 | printf("Thread %2d -> %d\n", i, h_results[i]); 42 | } 43 | 44 | cudaFree(d_ptr); 45 | cudaFree(d_results); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /day18/atomic2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 320 5 | 6 | __device__ int lane_id(){ 7 | return threadIdx.x & 31; 8 | } 9 | 10 | // incremenets threads by pointers 11 | __device__ int 
atomicIncrement(int * ptr){ 12 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr); 13 | int leader = __ffs(mask) -1; 14 | int res; 15 | if(lane_id() == leader){ 16 | res = atomicAdd(ptr,__popc(mask)); // add on ptr number of active threads 17 | } 18 | __shfl_sync(mask,res,leader); 19 | return *ptr; 20 | } 21 | 22 | __global__ void testatomicIncrement(int *d_ptr, int *d_results){ 23 | int val = atomicIncrement(d_ptr); 24 | d_results[threadIdx.x] = val; 25 | } 26 | 27 | 28 | 29 | int main() { 30 | int *d_ptr, *d_results; 31 | int h_ptr = 100; 32 | int h_results[N]; 33 | 34 | cudaMalloc(&d_ptr, sizeof(int)); 35 | cudaMalloc(&d_results, N * sizeof(int)); 36 | 37 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice); 38 | 39 | testatomicIncrement<<<1, N>>>(d_ptr, d_results); 40 | 41 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); 42 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost); 43 | 44 | printf("Final value of ptr: %d\n", h_ptr); 45 | printf("Old values returned by each thread:\n"); 46 | for (int i = 0; i < N; i++) { 47 | printf("Thread %2d -> %d\n", i, h_results[i]); 48 | } 49 | 50 | cudaFree(d_ptr); 51 | cudaFree(d_results); 52 | 53 | return 0; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /day18/wrap.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | __device__ int lane_id() { 7 | return threadIdx.x & 31; 8 | } 9 | 10 | __device__ float reduceMax(float val) { 11 | // threads of 32 we perform reduction on them 12 | for (int offset = 16; offset > 0; offset /= 2) { 13 | float temp = __shfl_xor_sync(0xFFFFFFFF, val, offset); 14 | val = fmaxf(val, temp); 15 | } 16 | return val; 17 | } 18 | 19 | __device__ float atomicMaxFloat(float *addr, float value) { 20 | // give the adres of input 21 | // save old adress 22 | int *addr_as_int = (int*)addr; 23 | int old = *addr_as_int; 24 | int assumed; 25 | do { 26 | assumed = old; 27 | float old_val = __int_as_float(assumed); 28 | if (old_val >= value) { 29 | return old_val; 30 | } 31 | old = atomicCAS(addr_as_int, assumed, __float_as_int(fmaxf(old_val, value))); 32 | } while (assumed != old); 33 | return __int_as_float(old); 34 | } 35 | 36 | __global__ void MaxValue(float *data, float *max_value, int N) { 37 | int tx = threadIdx.x; 38 | int bx = blockIdx.x; 39 | 40 | extern __shared__ float reduction[]; 41 | 42 | float block_max = -INFINITY; 43 | 44 | for (int i = bx * blockDim.x + tx; i < N; i += gridDim.x * blockDim.x) { 45 | block_max = fmaxf(block_max, data[i]); 46 | } 47 | 48 | block_max = reduceMax(block_max); 49 | 50 | reduction[tx] = block_max; 51 | __syncthreads(); 52 | 53 | if (tx == 0) { 54 | float final_max = -INFINITY; 55 | for (int i = 0; i < blockDim.x; ++i) { 56 | final_max = fmaxf(final_max, reduction[i]); 57 | } 58 | atomicMaxFloat(max_value, final_max); 59 | } 60 | } 61 | 62 | int main() { 63 | int N = 1024; 64 | float *host_data = (float*)malloc(N * sizeof(float)); 65 | float host_result = -INFINITY; 66 | 67 | for (int i = 0; i < N; ++i) { 68 | host_data[i] = rand()%10000; 69 | if (host_data[i] > host_result) { 70 | host_result = host_data[i]; 71 | } 72 | } 73 | 74 | float *device_data, *device_result; 75 | cudaMalloc(&device_data, N * sizeof(float)); 76 | cudaMalloc(&device_result, sizeof(float)); 77 | 78 | cudaMemcpy(device_data, host_data, N * sizeof(float), cudaMemcpyHostToDevice); 79 | cudaMemcpy(device_result, 
&host_result, sizeof(float), cudaMemcpyHostToDevice); 80 | 81 | int threadsPerBlock = 256; 82 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 83 | 84 | MaxValue<<>>(device_data, device_result, N); 85 | 86 | cudaMemcpy(&host_result, device_result, sizeof(float), cudaMemcpyDeviceToHost); 87 | 88 | std::cout << "Max value: " << host_result << std::endl; 89 | 90 | free(host_data); 91 | cudaFree(device_data); 92 | cudaFree(device_result); 93 | 94 | return 0; 95 | } -------------------------------------------------------------------------------- /day20/rope.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define BLOCK_SIZE 256 7 | #define theta 10000.0f 8 | #define STRINGFY(str) #str 9 | #define TORCH_BINDING_COMMON_EXTENSION(func) \ 10 | m.def(STRINGFY(func), &func, STRINGFY(func)); 11 | 12 | __global__ void rope_kernel(float* x, float* out, int N){ 13 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | float4 x_v = reinterpret_cast(&(x[idx * 4]))[0]; 15 | 16 | int token_pos = idx / N; 17 | int token_idx = idx % N; 18 | 19 | float exp_f_v = 1.0f / powf(theta, token_idx * 2 / (N * 4)); 20 | float exp_s_v = 1.0f / powf(theta, ((token_idx * 2) + 1) / (N * 4)); 21 | 22 | float sin_f_v = sinf(token_pos / exp_f_v); 23 | float cos_f_v = cosf(token_pos / exp_f_v); 24 | 25 | float sin_s_v = sinf(token_pos / exp_s_v); 26 | float cos_s_v = cosf(token_pos / exp_s_v); 27 | float4 out_v; 28 | 29 | out_v.x = x_v.x * cos_f_v - x_v.y * sin_f_v; 30 | out_v.y = x_v.x * sin_f_v + x_v.y * cos_f_v; 31 | out_v.z = x_v.z * cos_s_v - x_v.w * sin_s_v; 32 | out_v.w = x_v.z * sin_s_v + x_v.w * cos_s_v; 33 | 34 | reinterpret_cast(&(out[idx * 4]))[0] = out_v; 35 | } 36 | 37 | void rope(torch::Tensor x, torch::Tensor out) { 38 | int seq_len = x.size(0); 39 | int hidden_size = x.size(1); 40 | 41 | int N = (int)(hidden_size/4); 42 | 43 | dim3 grid((seq_len * N + BLOCK_SIZE - 1) / BLOCK_SIZE); 44 | dim3 block(BLOCK_SIZE); 45 | 46 | rope_kernel<<>>(x.data_ptr(), out.data_ptr(), N); 47 | } 48 | 49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 50 | TORCH_BINDING_COMMON_EXTENSION(rope) 51 | } -------------------------------------------------------------------------------- /day20/test_rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from torch.utils.cpp_extension import load 4 | print(torch.__version__) # Verifică versiunea PyTorch 5 | print(torch.cuda.is_available()) # Dacă e False, PyTorch nu vede CUDA 6 | print(torch.version.cuda) # Verifică versiunea CUDA detectată de 7 | lib = load( 8 | name="rope", 9 | sources=["rope.cu"], 10 | extra_cuda_cflags=[ "-O3", 11 | "--use_fast_math", 12 | ], 13 | extra_cflags=["-std=c++17"], 14 | ) 15 | 16 | def benchmark(func, x, out=None, iters=20): 17 | torch.cuda.synchronize() 18 | start = time.time() 19 | for _ in range(iters): 20 | if out is not None: 21 | func(x, out) 22 | else: 23 | _ = func(x) 24 | torch.cuda.synchronize() 25 | return (time.time() - start) * 1000 / iters 26 | 27 | def naive_rope(x, theta=10000.0): 28 | dim = x.shape[-1] 29 | seq_len = x.shape[-2] 30 | x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) 31 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim)).cuda() 32 | freqs = torch.outer(torch.arange(seq_len, device='cuda'), freqs) 33 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) 34 | return torch.view_as_real(x_ * 
freqs_cis).flatten(1).type_as(x) 35 | 36 | sizes = [(4096, 512), (4096, 1024), (8192, 512), (8192, 1024)] 37 | for M, N in sizes: 38 | print(f"Testing M={M}, N={N}") 39 | x = torch.randn((M, N), device='cuda', dtype=torch.float32).contiguous() 40 | out = torch.zeros_like(x) 41 | 42 | t_naive = benchmark(naive_rope, x) 43 | naive_out = naive_rope(x) 44 | 45 | t_cuda = benchmark(lib.rope, x, out) 46 | 47 | # Compute the maximum absolute difference 48 | max_diff = torch.max(torch.abs(naive_out - out)).item() 49 | 50 | print(f"Naive: {t_naive:.4f}ms, CUDA f32: {t_cuda:.4f}ms") 51 | print(f"Max difference: {max_diff:.6f}") 52 | print("-" * 60) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /day23/kernel.ptx: -------------------------------------------------------------------------------- 1 | .version 6.0 2 | .target sm_50 3 | .address_size 64 4 | 5 | .visible .entry vectorAdd( 6 | .param .u64 param_A, 7 | .param .u64 param_B, 8 | .param .u64 param_C, 9 | .param .u32 param_N 10 | ) 11 | { 12 | .reg .pred %p<2>; 13 | .reg .s32 %r<6>; 14 | .reg .f32 %f<4>; 15 | .reg .u64 %rd<10>; 16 | 17 | ld.param.u64 %rd1, [param_A]; 18 | ld.param.u64 %rd2, [param_B]; 19 | ld.param.u64 %rd3, [param_C]; 20 | ld.param.u32 %r1, [param_N]; 21 | 22 | mov.u32 %r2, %tid.x; 23 | mov.u32 %r3, %ctaid.x; 24 | mov.u32 %r4, %ntid.x; 25 | mad.lo.s32 %r5, %r3, %r4, %r2; 26 | 27 | setp.ge.s32 %p1, %r5, %r1; 28 | @%p1 bra EXIT; 29 | 30 | cvt.u64.s32 %rd4, %r5; 31 | mul.wide.s32 %rd5, %r5, 4; 32 | add.u64 %rd6, %rd1, %rd5; 33 | add.u64 %rd7, %rd2, %rd5; 34 | add.u64 %rd8, %rd3, %rd5; 35 | 36 | ld.global.f32 %f1, [%rd6]; 37 | ld.global.f32 %f2, [%rd7]; 38 | 39 | add.f32 %f3, %f1, %f2; 40 | 41 | st.global.f32 [%rd8], %f3; 42 | 43 | EXIT: 44 | ret; 45 | } 46 | -------------------------------------------------------------------------------- /day24/GeGLU.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | 3 | __global__ void GLUKernel(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) { 4 | int row = blockIdx.x * blockDim.x + threadIdx.x; 5 | int col = blockIdx.y * blockDim.y + threadIdx.y; 6 | 7 | if (row < M && col < K) { 8 | float sum1 = b[col]; 9 | float sum2 = c[col]; 10 | 11 | for (int i = 0; i < N; i++) { 12 | sum1 += x[row * N + i] * W[i * K + col]; 13 | sum2 += x[row * N + i] * V[i * K + col]; 14 | } 15 | 16 | float gate = 1.0f / (1.0f + expf(-sum1)); 17 | out[row * K + col] = gate * sum2; 18 | } 19 | } 20 | 21 | extern "C" void launchGLU(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) { 22 | dim3 blockSize(16, 16); 23 | dim3 gridSize((M + 15) / 16, (K + 15) / 16); 24 | 25 | GLUKernel<<>>(x, W, V, b, c, out, M, N, K); 26 | cudaDeviceSynchronize(); 27 | } 28 | -------------------------------------------------------------------------------- /day26/gradientdescent.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day26/gradientdescent.out -------------------------------------------------------------------------------- /day27/kmeans.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day27/kmeans.out -------------------------------------------------------------------------------- /day28/sample.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void update_x_kernel( 6 | float *x, const float *noise, const float *predicted_noise, 7 | float sqrt_alpha, float sqrt_alpha_hat, float beta, float alpha, 8 | int numel) 9 | { 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | if (idx < numel) 12 | { 13 | x[idx] = (1.0f / sqrt_alpha) *(x[idx] - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise[idx]) +sqrt(beta) * noise[idx]; 14 | } 15 | } 16 | 17 | torch::Tensor update_x(torch::Tensor x, torch::Tensor noise, torch::Tensor predicted_noise, 18 | torch::Tensor sqrt_alpha, torch::Tensor sqrt_alpha_hat, 19 | torch::Tensor beta, torch::Tensor alpha) 20 | { 21 | int numel = x.numel(); 22 | float sqrt_alpha_val = sqrt_alpha.item(); 23 | float sqrt_alpha_hat_val = sqrt_alpha_hat.item(); 24 | float beta_val = beta.item(); 25 | float alpha_val = alpha.item(); 26 | 27 | const int threads = 1024; 28 | const int blocks = (numel + threads - 1) / threads; 29 | 30 | update_x_kernel<<>>( 31 | x.data_ptr(), noise.data_ptr(), predicted_noise.data_ptr(), 32 | sqrt_alpha_val, sqrt_alpha_hat_val, beta_val, alpha_val, numel); 33 | 34 | cudaError_t err = cudaGetLastError(); 35 | if (err != cudaSuccess) { 36 | printf("CUDA error: %s\n", cudaGetErrorString(err)); 37 | throw std::runtime_error(cudaGetErrorString(err)); 38 | } 39 | 40 | cudaDeviceSynchronize(); 41 | 42 | return x; 43 | } 44 | 45 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 46 | { 47 | m.def("update_x", &update_x, "CUDA kernel for updating x"); 48 | } 49 | -------------------------------------------------------------------------------- /day28/test_sample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | import time 4 | 5 | lib = load( 6 | name="update_x", 7 | sources=["sample.cu"], 8 | extra_cuda_cflags=[ "-O3", 9 | "--use_fast_math", 10 | ], 11 | extra_cflags=["-std=c++17"], 12 | ) 13 | 14 | 15 | print("Loaded ") 16 | 17 | size = 10**6 18 | device = "cuda" 19 | 20 | x = torch.randn(size, device=device) 21 | noise = torch.randn(size, device=device) 22 | predicted_noise = torch.randn(size, device=device) 23 | alpha = torch.tensor(0.9, device=device) 24 | beta = torch.tensor(0.1, device=device) 25 | alpha_hat = torch.tensor(0.81, device=device) 26 | 27 | sqrt_alpha = torch.sqrt(alpha) 28 | sqrt_alpha_hat = torch.sqrt(1 - alpha_hat) 29 | 30 | torch.cuda.synchronize() 31 | start = time.time() 32 | x_cuda = lib.update_x(x.clone(), noise, predicted_noise, sqrt_alpha, sqrt_alpha_hat, beta, alpha) 33 | torch.cuda.synchronize() 34 | time_cuda = time.time() - start 35 | 36 | torch.cuda.synchronize() 37 | start = time.time() 38 | x_torch = 1 / sqrt_alpha * (x - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise) + torch.sqrt(beta) * noise 39 | torch.cuda.synchronize() 40 | time_torch = time.time() - start 41 | 42 | print(f"CUDA Kernel Time: {time_cuda:.6f}s") 43 | print(f"PyTorch Time: {time_torch:.6f}s") 44 | print(f"Speedup: {time_torch / time_cuda:.2f}x") 45 | -------------------------------------------------------------------------------- /day29/pi.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __device__ float randomFloat(unsigned int *seed) { 6 | *seed = (*seed * 1664525u + 1013904223u); 7 | return (float)(*seed & 0x00FFFFFF) / (float)0x01000000; 8 | } 9 | 10 | __global__ void 
monteCarloPi(int iterations, unsigned long long *d_count) { 11 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 12 | unsigned int seed = tid; 13 | unsigned int local_count = 0; 14 | 15 | for (int i = 0; i < iterations; i++) { 16 | float x = randomFloat(&seed); 17 | float y = randomFloat(&seed); 18 | if (x * x + y * y <= 1.0f) 19 | local_count++; 20 | } 21 | 22 | atomicAdd(d_count, (unsigned long long)local_count); 23 | } 24 | 25 | int main() { 26 | int iterations = 10000; 27 | int threadsPerBlock = 256; 28 | int blocks = 256; 29 | 30 | unsigned long long totalPoints = (unsigned long long)iterations * threadsPerBlock * blocks; 31 | 32 | unsigned long long host_count = 0; 33 | unsigned long long *d_count; 34 | cudaMalloc((void**)&d_count, sizeof(unsigned long long)); 35 | cudaMemset(d_count, 0, sizeof(unsigned long long)); 36 | 37 | monteCarloPi<<>>(iterations, d_count); 38 | cudaDeviceSynchronize(); 39 | 40 | cudaMemcpy(&host_count, d_count, sizeof(unsigned long long), cudaMemcpyDeviceToHost); 41 | 42 | float pi = 4.0f * (float)host_count / (float)totalPoints; 43 | printf("Estimated Pi = %f\n", pi); 44 | 45 | cudaFree(d_count); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /day30/kernelHisto.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define BLOCK_SIZE 16 // 16x16 thread block 5 | #define HIST_SIZE 256 // Grayscale histogram bins 6 | 7 | __global__ void histogram_equalization(unsigned char *d_img, unsigned char *d_out, int width, int height) { 8 | __shared__ unsigned int hist_shared[HIST_SIZE]; // Shared memory for histogram 9 | __shared__ float cdf_shared[HIST_SIZE]; // Shared memory for CDF 10 | 11 | int tx = threadIdx.x, ty = threadIdx.y; 12 | int x = blockIdx.x * blockDim.x + tx; 13 | int y = blockIdx.y * blockDim.y + ty; 14 | 15 | int index = y * width + x; 16 | 17 | // Initialize shared histogram 18 | if (tx < HIST_SIZE / BLOCK_SIZE && ty == 0) { 19 | hist_shared[tx * BLOCK_SIZE] = 0; 20 | } 21 | __syncthreads(); 22 | 23 | // First pass: compute local histogram using atomic operations 24 | if (x < width && y < height) { 25 | atomicAdd(&hist_shared[d_img[index]], 1); 26 | } 27 | __syncthreads(); 28 | 29 | // Merge local histograms into global memory 30 | __shared__ unsigned int hist_global[HIST_SIZE]; 31 | if (tx == 0 && ty == 0) { 32 | for (int i = 0; i < HIST_SIZE; i++) { 33 | atomicAdd(&hist_global[i], hist_shared[i]); 34 | } 35 | } 36 | __syncthreads(); 37 | 38 | // Compute CDF (Cumulative Distribution Function) 39 | if (tx == 0 && ty == 0) { 40 | float sum = 0; 41 | for (int i = 0; i < HIST_SIZE; i++) { 42 | sum += hist_global[i]; 43 | cdf_shared[i] = sum; 44 | } 45 | 46 | // Normalize the CDF 47 | float min_cdf = cdf_shared[0]; 48 | for (int i = 0; i < HIST_SIZE; i++) { 49 | cdf_shared[i] = ((cdf_shared[i] - min_cdf) / (width * height - min_cdf)) * 255.0f; 50 | } 51 | } 52 | __syncthreads(); 53 | 54 | // Apply equalization 55 | if (x < width && y < height) { 56 | d_out[index] = (unsigned char)cdf_shared[d_img[index]]; 57 | } 58 | } -------------------------------------------------------------------------------- /day32/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_DIR := $(CURDIR) 2 | 3 | COLOR_RESET := \033[0m 4 | COLOR_GREEN := \033[32m 5 | COLOR_YELLOW := \033[33m 6 | COLOR_BLUE := \033[34m 7 | COLOR_RED := \033[31m 8 | 9 | HIP_GPU_TARGET := gfx90a 10 | 11 | all: build 12 | 13 | 
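# Example invocation (same pattern as the help target at the bottom of this file):
#   make dir=matmul_kernels program=kernel_1
#   make run dir=matmul_kernels program=kernel_1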
build: $(PROJECT_DIR)/$(dir)/$(program).out 14 | 15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cpp 16 | @echo "$(COLOR_YELLOW)Building HIP program $(program) in directory $(dir)...$(COLOR_RESET)" 17 | @hipcc --offload-arch=$(HIP_GPU_TARGET) -O3 -o $@ $< -lrocblas 18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)" 19 | 20 | run: $(PROJECT_DIR)/$(dir)/$(program).out 21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)" 22 | @./$(dir)/$(program).out 23 | 24 | # Target: Separate rocprof command for kernel profiling 25 | rocprof: $(PROJECT_DIR)/$(dir)/$(program).out 26 | @echo "$(COLOR_BLUE)Running rocprof for kernel trace on $(program).out in directory $(dir)...$(COLOR_RESET)" 27 | @mkdir -p $(PROJECT_DIR)/$(dir)/output 28 | @echo "$(COLOR_GREEN)Kernel profiling completed for $(program).out in $(dir)$(COLOR_RESET)" 29 | 30 | # Target: Generate ISA assembly files and kernel resource usage analysis. 31 | isa: 32 | @echo "$(COLOR_BLUE)Generating ISA and kernel resource usage for $(program) in directory $(dir)...$(COLOR_RESET)" 33 | @mkdir -p $(PROJECT_DIR)/$(dir)/isa_output 34 | @hipcc -c --save-temps=obj -O3 -Rpass-analysis=kernel-resource-usage --offload-arch=$(HIP_GPU_TARGET) -o $(PROJECT_DIR)/$(dir)/isa_output/$(program).o $(PROJECT_DIR)/$(dir)/$(program).cpp 35 | @echo "$(COLOR_GREEN)ISA and resource analysis files saved in $(dir)/isa_output$(COLOR_RESET)" 36 | 37 | clean: 38 | @echo "$(COLOR_RED)Cleaning up .out and ISA files in directory $(dir)...$(COLOR_RESET)" 39 | @rm -f $(PROJECT_DIR)/$(dir)/*.out 40 | @rm -rf $(PROJECT_DIR)/$(dir)/isa_output 41 | @rm -rf $(PROJECT_DIR)/$(dir)/output 42 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)" 43 | 44 | cleanall: 45 | @echo "$(COLOR_RED)Cleaning up all .out and ISA files in all directories...$(COLOR_RESET)" 46 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \; 47 | @find $(PROJECT_DIR) -type d -name "isa_output" -exec rm -rf {} \; 48 | @find $(PROJECT_DIR) -type d -name "output" -exec rm -rf {} \; 49 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)" 50 | 51 | help: 52 | @echo "$(COLOR_BLUE)Usage instructions for HIP Makefile:$(COLOR_RESET)" 53 | @echo "" 54 | @echo "$(COLOR_YELLOW)make dir= program=$(COLOR_RESET) # Build the HIP program .cpp in directory " 55 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory " 56 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory " 57 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories" 58 | @echo "$(COLOR_YELLOW)make isa dir= program=$(COLOR_RESET) # Generate ISA assembly files and kernel resource usage analysis" 59 | @echo "" 60 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)" 61 | @echo "$(COLOR_GREEN)make dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 62 | @echo "$(COLOR_GREEN)make run dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 63 | @echo "$(COLOR_GREEN)make isa dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)" 64 | -------------------------------------------------------------------------------- /day32/matmul_kernels/kernel_1/kernel_1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define HIP_CHECK(status) \ 6 | { \ 7 | hipError_t err = status; \ 8 | if (err != hipSuccess) { \ 9 | std::cerr << "HIP error: " 
<< hipGetErrorString(err) \ 10 | << " at line " << __LINE__ << std::endl; \ 11 | exit(err); \ 12 | } \ 13 | } 14 | 15 | __global__ void kernel(float *A, float *B, float *C, int N, int M, int K, float alpha, float beta) { 16 | int row = blockDim.y * blockIdx.y + threadIdx.y; 17 | int col = blockDim.x * blockIdx.x + threadIdx.x; 18 | 19 | if (row < M && col < N) { 20 | float sum = 0.0f; 21 | for (int k = 0; k < K; ++k) { 22 | sum += A[row * K + k] * B[k * N + col]; 23 | } 24 | C[row * N + col] = alpha * sum + beta * C[row * N + col]; 25 | } 26 | } 27 | 28 | // int main() { 29 | // float *A, *B, *C; 30 | // float *d_A, *d_B, *d_C; 31 | 32 | // float alpha, beta; 33 | 34 | // // For simplicity, we use a square matrix. 35 | // int SIZE = 100; 36 | // size_t mem_size = SIZE * SIZE * sizeof(float); 37 | 38 | // alpha = 1.0f; 39 | // beta = 0.0f; 40 | 41 | // A = (float*)malloc(mem_size); 42 | // B = (float*)malloc(mem_size); 43 | // C = (float*)malloc(mem_size); 44 | 45 | // for (int i = 0; i < SIZE * SIZE; ++i) { 46 | // A[i] = i%3; 47 | // B[i] = i%3; 48 | // C[i] = 0.0f; 49 | // } 50 | 51 | // HIP_CHECK(hipMalloc(&d_A, mem_size)); 52 | // HIP_CHECK(hipMalloc(&d_B, mem_size)); 53 | // HIP_CHECK(hipMalloc(&d_C, mem_size)); 54 | 55 | // HIP_CHECK(hipMemcpy(d_A, A, mem_size, hipMemcpyHostToDevice)); 56 | // HIP_CHECK(hipMemcpy(d_B, B, mem_size, hipMemcpyHostToDevice)); 57 | // HIP_CHECK(hipMemcpy(d_C, C, mem_size, hipMemcpyHostToDevice)); 58 | 59 | // dim3 threadsPerBlock(16, 16); 60 | // dim3 blocksPerGrid((SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x, 61 | // (SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y); 62 | 63 | // hipLaunchKernelGGL(kernel, blocksPerGrid, threadsPerBlock, 0, 0, 64 | // d_A, d_B, d_C, SIZE, SIZE, SIZE, alpha, beta); 65 | 66 | // HIP_CHECK(hipDeviceSynchronize()); 67 | // HIP_CHECK(hipMemcpy(C, d_C, mem_size, hipMemcpyDeviceToHost)); 68 | 69 | // std::cout << "Result matrix C (first 10 elements):" << std::endl; 70 | // for (int i = 0; i < 10; ++i) { 71 | // std::cout << C[i] << " "; 72 | // } 73 | // std::cout << std::endl; 74 | 75 | // HIP_CHECK(hipFree(d_A)); 76 | // HIP_CHECK(hipFree(d_B)); 77 | // HIP_CHECK(hipFree(d_C)); 78 | // free(A); 79 | // free(B); 80 | // free(C); 81 | 82 | // return 0; 83 | // } 84 | -------------------------------------------------------------------------------- /day32/matmul_kernels/kernel_2/kernel_2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Macro to check HIP errors. 
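// Note on the tiled kernel below: the shared-memory loads of As/Bs are not
// bounds-checked, so the kernel assumes N is a multiple of TILESIZE (32) and
// that blockDim matches TILESIZE; the main() further down uses N = 1024 with
// TILESIZE x TILESIZE blocks, which satisfies both assumptions.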
5 | #define CHECK_HIP_ERROR(error) \ 6 | { \ 7 | if ((error) != hipSuccess) \ 8 | { \ 9 | std::cerr << "HIP error: " << hipGetErrorString(error) \ 10 | << " at line " << __LINE__ << std::endl; \ 11 | exit(EXIT_FAILURE); \ 12 | } \ 13 | } 14 | 15 | #define TILESIZE 32 16 | 17 | __global__ void kernel(const float *A, const float *B, float *C, int N) 18 | { 19 | __shared__ float As[TILESIZE][TILESIZE]; 20 | __shared__ float Bs[TILESIZE][TILESIZE]; 21 | 22 | int col = blockIdx.x * blockDim.x + threadIdx.x; 23 | int row = blockIdx.y * blockDim.y + threadIdx.y; 24 | 25 | float sum = 0.0f; 26 | 27 | for (int t = 0; t < N; t += TILESIZE) 28 | { 29 | Bs[threadIdx.y][threadIdx.x] = B[(t + threadIdx.y) * N + col]; 30 | As[threadIdx.y][threadIdx.x] = A[row * N + t + threadIdx.x]; 31 | 32 | __syncthreads(); 33 | 34 | for (int k = 0; k < TILESIZE; k++) 35 | { 36 | sum += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 37 | } 38 | 39 | __syncthreads(); 40 | } 41 | if (row < N && col < N) 42 | { 43 | C[row * N + col] = sum; 44 | } 45 | } 46 | 47 | int main(){ 48 | float *A, *B, *C; 49 | float *d_A, *d_B, *d_C; 50 | 51 | int N = 1024; // Size of the matrix 52 | 53 | size_t size = N*N* sizeof(float); 54 | 55 | // Allocate host memory 56 | A = (float *)malloc(size); 57 | B = (float *)malloc(size); 58 | C = (float *)malloc(size); 59 | 60 | for(int i = 0 ; i < N * N ; i++){ 61 | A[i] = i; 62 | B[i] = i; 63 | } 64 | 65 | CHECK_HIP_ERROR(hipMalloc((void**)&d_A,size)); 66 | CHECK_HIP_ERROR(hipMalloc((void**)&d_B,size)); 67 | CHECK_HIP_ERROR(hipMalloc((void**)&d_C,size)); 68 | 69 | CHECK_HIP_ERROR(hipMemcpy(d_A, A, size, hipMemcpyHostToDevice)); 70 | CHECK_HIP_ERROR(hipMemcpy(d_B, B, size, hipMemcpyHostToDevice)); 71 | 72 | dim3 Threads(TILESIZE, TILESIZE); 73 | dim3 Blocks((N+Threads.x-1)/Threads.x, (N+Threads.y-1)/Threads.y); 74 | hipLaunchKernelGGL(kernel, Blocks, Threads, 0, 0, d_A, d_B, d_C, N); 75 | 76 | CHECK_HIP_ERROR(hipMemcpy(C, d_C, size, hipMemcpyDeviceToHost)); 77 | 78 | // Check the result 79 | for(int i = 0 ; i < 10 ; i++){ 80 | for(int j = 0 ; j < 10 ; j++){ 81 | std::cout << C[i*N+j] << " "; 82 | } 83 | std::cout << std::endl; 84 | } 85 | 86 | CHECK_HIP_ERROR(hipFree(d_A)); 87 | CHECK_HIP_ERROR(hipFree(d_B)); 88 | CHECK_HIP_ERROR(hipFree(d_C)); 89 | 90 | free(A); 91 | free(B); 92 | free(C); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /day33/load_in_pytorch/kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Your HIP kernel remains the same. 5 | extern "C" __global__ void kernel_addition(const float *A, const float *B, float *C, size_t N) { 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx < N) { 8 | C[idx] = A[idx] + B[idx]; 9 | } 10 | } 11 | 12 | // Host wrapper function that launches the kernel. 13 | // This function will be callable from Python. 14 | extern "C" void launch_kernel_addition(const float *A, const float *B, float *C, size_t N, 15 | int grid_x, int grid_y, int grid_z, 16 | int block_x, int block_y, int block_z) { 17 | // Create dim3 objects for grid and block dimensions. 18 | dim3 grid(grid_x, grid_y, grid_z); 19 | dim3 block(block_x, block_y, block_z); 20 | 21 | // Launch the kernel with the provided configuration. 22 | hipLaunchKernelGGL(kernel_addition, grid, block, 0, 0, A, B, C, N); 23 | 24 | // Wait for the kernel to finish. 
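// hipDeviceSynchronize() below blocks the host, so by the time this wrapper
// returns to the Python caller the output buffer C already holds the result.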
25 | hipDeviceSynchronize(); 26 | } 27 | -------------------------------------------------------------------------------- /day33/load_in_pytorch/kernel.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day33/load_in_pytorch/kernel.so -------------------------------------------------------------------------------- /day33/load_in_pytorch/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ctypes 3 | import time 4 | 5 | # Load the shared object 6 | lib = ctypes.CDLL('./kernel.so') 7 | 8 | # Specify the argument types for the host wrapper function. 9 | lib.launch_kernel_addition.argtypes = [ 10 | ctypes.c_void_p, # pointer to A 11 | ctypes.c_void_p, # pointer to B 12 | ctypes.c_void_p, # pointer to C 13 | ctypes.c_size_t, # N 14 | ctypes.c_int, # grid_x 15 | ctypes.c_int, # grid_y 16 | ctypes.c_int, # grid_z 17 | ctypes.c_int, # block_x 18 | ctypes.c_int, # block_y 19 | ctypes.c_int # block_z 20 | ] 21 | lib.launch_kernel_addition.restype = None 22 | 23 | N = 1000 24 | 25 | # Create input tensors on the ROCm device. 26 | A = torch.randn(N, device='cuda', dtype=torch.float32) 27 | B = torch.randn(N, device='cuda', dtype=torch.float32) 28 | C = torch.empty(N, device='cuda', dtype=torch.float32) 29 | 30 | # Get pointers to the tensor data. 31 | a_ptr = A.data_ptr() 32 | b_ptr = B.data_ptr() 33 | c_ptr = C.data_ptr() 34 | 35 | # Define block and grid sizes. 36 | block_size = 256 37 | grid_size = (N + block_size - 1) // block_size 38 | 39 | def measure_amd_kernel_time(): 40 | start_amd = time.time() 41 | lib.launch_kernel_addition( 42 | ctypes.c_void_p(a_ptr), 43 | ctypes.c_void_p(b_ptr), 44 | ctypes.c_void_p(c_ptr), 45 | ctypes.c_size_t(N), 46 | ctypes.c_int(grid_size), # grid_x 47 | ctypes.c_int(1), # grid_y 48 | ctypes.c_int(1), # grid_z 49 | ctypes.c_int(block_size), # block_x 50 | ctypes.c_int(1), # block_y 51 | ctypes.c_int(1) # block_z 52 | ) 53 | torch.cuda.synchronize() # Ensure the kernel has finished executing 54 | end_amd = time.time() 55 | return end_amd - start_amd 56 | 57 | def measure_pytorch_time(): 58 | start_pytorch = time.time() 59 | c_pytorch = A + B 60 | end_pytorch = time.time() 61 | return end_pytorch - start_pytorch 62 | 63 | # Run the measurements 5 times and get the lowest time 64 | amd_times = [measure_amd_kernel_time() for _ in range(5)] 65 | pytorch_times = [measure_pytorch_time() for _ in range(5)] 66 | 67 | min_amd_time = min(amd_times) 68 | min_pytorch_time = min(pytorch_times) 69 | 70 | # Verify the result. 
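# C was last written by the HIP kernel inside measure_amd_kernel_time(), so this
# allclose check compares the custom kernel's output against the PyTorch A + B.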
71 | if torch.allclose(C, A + B):
72 | print("Success!")
73 | else:
74 | print("Error in computation.")
75 | 
76 | print(f"Lowest AMD kernel execution time: {min_amd_time} seconds")
77 | print(f"Lowest Pytorch computation time: {min_pytorch_time} seconds")
78 | 
-------------------------------------------------------------------------------- /day34/tensor_lib/test1.cpp: --------------------------------------------------------------------------------
1 | #include <torch/torch.h>
2 | #include <hip/hip_runtime.h>
3 | #include <hiprand/hiprand_kernel.h>
4 | 
5 | __global__ void kernel_noise_image(float *X, float *e, const float *alpha_hat, int N)
6 | {
7 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
8 | __shared__ float sqrt_alphas[2];
9 | 
10 | if (threadIdx.x == 0) {
11 | sqrt_alphas[0] = sqrtf(*alpha_hat);
12 | sqrt_alphas[1] = sqrtf(1.0f - *alpha_hat);
13 | }
14 | 
15 | __syncthreads();
16 | 
17 | if (idx < N) {
18 | hiprandState_t state; hiprand_init(1234ULL, idx, 0, &state); e[idx] = hiprand_normal(&state); // per-thread RNG state for the Gaussian noise
19 | 
20 | X[idx] = sqrt_alphas[0] * X[idx] + sqrt_alphas[1] * e[idx];
21 | }
22 | }
23 | 
24 | torch::Tensor noiseImage(torch::Tensor X, int t, torch::Tensor alpha_hat)
25 | {
26 | torch::Tensor alpha_at_t = alpha_hat.index({t});
27 | 
28 | float *d_X, *d_e, *d_alpha_hat;
29 | int N = X.numel();
30 | 
31 | hipMalloc(&d_X, N * sizeof(float));
32 | hipMalloc(&d_e, N * sizeof(float));
33 | hipMalloc(&d_alpha_hat, sizeof(float));
34 | 
35 | hipMemcpy(d_X, X.data_ptr<float>(), N * sizeof(float), hipMemcpyHostToDevice);
36 | hipMemcpy(d_alpha_hat, alpha_at_t.data_ptr<float>(), sizeof(float), hipMemcpyHostToDevice);
37 | 
38 | int blockSize = 256;
39 | int numBlocks = (N + blockSize - 1) / blockSize;
40 | 
41 | kernel_noise_image<<<numBlocks, blockSize>>>(d_X, d_e, d_alpha_hat, N);
42 | 
43 | hipDeviceSynchronize();
44 | 
45 | hipMemcpy(X.data_ptr<float>(), d_X, N * sizeof(float), hipMemcpyDeviceToHost);
46 | 
47 | hipFree(d_X);
48 | hipFree(d_e);
49 | hipFree(d_alpha_hat);
50 | 
51 | return X;
52 | }
53 | 
54 | int main()
55 | {
56 | torch::Tensor X = torch::rand({1, 3, 64, 64}, torch::kFloat32);
57 | torch::Tensor alpha_hat = torch::rand({1000}, torch::kFloat32);
58 | 
59 | int t = 500;
60 | 
61 | X = noiseImage(X, t, alpha_hat);
62 | 
63 | std::cout << "Noisy image tensor shape: " << X.sizes() << std::endl;
64 | 
65 | return 0;
66 | }
67 | 
-------------------------------------------------------------------------------- /day34/tensor_lib/test1.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day34/tensor_lib/test1.out
-------------------------------------------------------------------------------- /day36/random.cpp: --------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 | #include <cstdlib>
4 | #include <cmath>
5 | 
6 | #define BLOCK_SIZE 256
7 | 
8 | __global__ void reductionKernelOptimized(const float *g_in, float *g_out, int n) {
9 | extern __shared__ float sdata[];
10 | 
11 | unsigned int tid = threadIdx.x;
12 | unsigned int idx = blockIdx.x * (BLOCK_SIZE * 2) + tid;
13 | 
14 | float mySum = 0.0f;
15 | if (idx < n)
16 | mySum = g_in[idx];
17 | if (idx + BLOCK_SIZE < n)
18 | mySum += g_in[idx + BLOCK_SIZE];
19 | 
20 | sdata[tid] = mySum;
21 | __syncthreads();
22 | 
23 | for (unsigned int s = BLOCK_SIZE / 2; s > 32; s >>= 1) {
24 | if (tid < s)
25 | sdata[tid] += sdata[tid + s];
26 | __syncthreads();
27 | }
28 | 
29 | if (tid < 32) {
30 | volatile float *vsmem = sdata;
31 | vsmem[tid] += vsmem[tid + 32];
32 | vsmem[tid] += vsmem[tid + 16];
33 | vsmem[tid] += vsmem[tid + 8];
34 | vsmem[tid] += vsmem[tid +
4]; 35 | vsmem[tid] += vsmem[tid + 2]; 36 | vsmem[tid] += vsmem[tid + 1]; 37 | } 38 | 39 | if (tid == 0) 40 | g_out[blockIdx.x] = sdata[0]; 41 | } 42 | 43 | int main() { 44 | int n = 1 << 20; 45 | size_t size = n * sizeof(float); 46 | 47 | float *h_array = (float*)malloc(size); 48 | for (int i = 0; i < n; i++) { 49 | h_array[i] = 1.0f; 50 | } 51 | 52 | float *d_in, *d_out; 53 | hipMalloc(&d_in, size); 54 | int numBlocks = (n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2); 55 | hipMalloc(&d_out, numBlocks * sizeof(float)); 56 | 57 | hipMemcpy(d_in, h_array, size, hipMemcpyHostToDevice); 58 | 59 | size_t sharedMemSize = BLOCK_SIZE * sizeof(float); 60 | hipLaunchKernelGGL(reductionKernelOptimized, dim3(numBlocks), dim3(BLOCK_SIZE), sharedMemSize, 0, d_in, d_out, n); 61 | hipDeviceSynchronize(); 62 | 63 | float *h_partialSums = (float*)malloc(numBlocks * sizeof(float)); 64 | hipMemcpy(h_partialSums, d_out, numBlocks * sizeof(float), hipMemcpyDeviceToHost); 65 | 66 | float sum = 0.0f; 67 | for (int i = 0; i < numBlocks; i++) { 68 | sum += h_partialSums[i]; 69 | } 70 | printf("Reduction result: %f (expected %f)\n", sum, (float)n); 71 | 72 | free(h_array); 73 | free(h_partialSums); 74 | hipFree(d_in); 75 | hipFree(d_out); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /day37/MultiStreams/MHA.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define HEADS 8 6 | #define SEQ_LEN 128 7 | #define DIM 768 // head dimension 8 | 9 | __global__ void addition(const float* query, const float* key, const float* value, 10 | float* output, int seq_len, int dim, int head_id) 11 | { 12 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | int total = seq_len * dim; // work items per head 14 | if (idx < total) { 15 | int seq = idx / dim; // which seq | on row 16 | int d = idx % dim; // pos in seq | on col 17 | 18 | int offset = head_id * (seq_len * dim) + seq * dim; 19 | output[offset + d] = query[offset + d] + key[offset + d] + value[offset + d]; 20 | } 21 | } 22 | 23 | int main(){ 24 | size_t total_elements = HEADS * SEQ_LEN * DIM; 25 | size_t size = total_elements * sizeof(float); 26 | 27 | // Create one HIP stream per head. 
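// Giving every head its own stream lets its host-to-device copies, kernel launch, and
// device-to-host copy overlap with the other heads' work instead of serializing on one stream.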
28 | hipStream_t streams[HEADS]; 29 | for (int i = 0; i < HEADS; i++){ 30 | hipStreamCreate(&streams[i]); 31 | } 32 | 33 | float *key = (float*)malloc(size); 34 | float *value = (float*)malloc(size); 35 | float *query = (float*)malloc(size); 36 | float *output= (float*)malloc(size); 37 | 38 | for (size_t i = 0; i < total_elements; i++){ 39 | key[i] = 3.0f; 40 | value[i] = 5.0f; 41 | query[i] = 6.0f; 42 | } 43 | 44 | float *d_key, *d_value, *d_query, *d_output; 45 | hipMalloc(&d_key, size); 46 | hipMalloc(&d_value, size); 47 | hipMalloc(&d_query, size); 48 | hipMalloc(&d_output, size); 49 | 50 | size_t headSize = SEQ_LEN * DIM * sizeof(float); 51 | 52 | // [HEADS][SEQ_LEN][DIM] 53 | for (int head = 0; head < HEADS; head++){ 54 | int offset = head * SEQ_LEN * DIM; 55 | hipMemcpyAsync(d_key + offset, key + offset, headSize, hipMemcpyHostToDevice, streams[head]); 56 | hipMemcpyAsync(d_value + offset, value + offset, headSize, hipMemcpyHostToDevice, streams[head]); 57 | hipMemcpyAsync(d_query + offset, query + offset, headSize, hipMemcpyHostToDevice, streams[head]); 58 | } 59 | 60 | int threadsPerBlock = 256; // threads per block 16x16 layout 61 | int totalWork = SEQ_LEN * DIM; // elements in a head 62 | int blocks = (totalWork + threadsPerBlock - 1) / threadsPerBlock; 63 | 64 | for (int head = 0; head < HEADS; head++){ 65 | hipLaunchKernelGGL(addition, dim3(blocks), dim3(threadsPerBlock), 0, streams[head], 66 | d_query, d_key, d_value, d_output, SEQ_LEN, DIM, head); 67 | } 68 | 69 | for (int head = 0; head < HEADS; head++){ 70 | int offset = head * SEQ_LEN * DIM; 71 | hipMemcpyAsync(output + offset, d_output + offset, headSize, 72 | hipMemcpyDeviceToHost, streams[head]); 73 | } 74 | 75 | hipDeviceSynchronize(); 76 | 77 | 78 | for (int i = 0; i < HEADS; i++){ 79 | hipStreamDestroy(streams[i]); 80 | } 81 | 82 | hipFree(d_key); 83 | hipFree(d_value); 84 | hipFree(d_query); 85 | hipFree(d_output); 86 | free(key); 87 | free(value); 88 | free(query); 89 | free(output); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /day37/MultiStreams/MHA.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/MHA.out -------------------------------------------------------------------------------- /day37/MultiStreams/results.copy_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "CopyDeviceToHost",8,894880,111860,52.82395164336985 3 | "CopyHostToDevice",24,799200,33300,47.17604835663015 4 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/results.db -------------------------------------------------------------------------------- /day37/MultiStreams/results.hip_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "hipStreamCreate",8,287069631,35883703,96.91385734531264 3 | "hipMemcpyAsync",32,6984316,218259,2.357884399407559 4 | "hipStreamDestroy",8,1219926,152490,0.41184340511392464 5 | "hipLaunchKernel",8,587473,73434,0.19832914515510996 6 | 
"hipMalloc",4,186051,46512,0.06281026665949475 7 | "hipFree",4,147901,36975,0.049930939630563304 8 | "hipDeviceSynchronize",1,12841,12841,0.004335083574797083 9 | "__hipPushCallConfiguration",8,1710,213,0.0005772909362902432 10 | "__hipPopCallConfiguration",8,1280,160,0.0004321242096207668 11 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.hsa_stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "hsa_queue_create",4,47570153,11892538,77.44727724772702 3 | "hsa_amd_memory_pool_allocate",24,6784269,282677,11.045227501499935 4 | "hsa_signal_wait_scacquire",74,1694809,22902,2.7592583632207988 5 | "hsa_amd_memory_async_copy",32,1068866,33402,1.7401827873597333 6 | "hsa_amd_memory_pool_free",20,837405,41870,1.363349350666012 7 | "hsa_agent_get_info",65,764114,11755,1.2440268755677468 8 | "hsa_executable_load_agent_code_object",2,604473,302236,0.9841210310962273 9 | "hsa_amd_agents_allow_access",20,537142,26857,0.8745018204040375 10 | "hsa_signal_create",547,500153,914,0.8142813426999574 11 | "hsa_executable_freeze",2,332962,166481,0.5420836112710775 12 | "hsa_signal_load_relaxed",1496,123321,82,0.20077454191637648 13 | "hsa_signal_destroy",546,102760,188,0.1672999077799146 14 | "hsa_code_object_reader_create_from_memory",2,97010,48505,0.15793853691834872 15 | "hsa_amd_signal_async_handler",32,82190,2568,0.13381062106297373 16 | "hsa_isa_get_info_alt",2,74801,37400,0.12178085248973718 17 | "hsa_executable_iterate_symbols",16,55820,3488,0.09087856025958381 18 | "hsa_executable_create_alt",2,24260,12130,0.039496844713319657 19 | "hsa_iterate_agents",1,24220,24220,0.039431722133413116 20 | "hsa_amd_pointer_info",128,21370,166,0.03479173831507177 21 | "hsa_executable_symbol_get_info",260,15970,61,0.026000190027688167 22 | "hsa_signal_store_screlease",48,15030,313,0.024469809399884357 23 | "hsa_amd_agent_iterate_memory_pools",4,13870,3467,0.022581254582594544 24 | "hsa_amd_profiling_get_async_copy_time",32,13490,421,0.021962590073482367 25 | "hsa_queue_load_read_index_relaxed",48,12480,260,0.0203182449308421 26 | "hsa_amd_profiling_set_profiler_enabled",4,9170,2292,0.014929351443575484 27 | "hsa_amd_profiling_get_dispatch_time",16,7110,444,0.011575538578388408 28 | "hsa_executable_get_symbol_by_name",16,6530,408,0.010631261169743502 29 | "hsa_queue_add_write_index_screlease",48,5490,114,0.008938074092173327 30 | "hsa_signal_silent_store_relaxed",80,5350,66,0.008710145062500419 31 | "hsa_amd_profiling_async_copy_enable",8,4790,598,0.007798428943808787 32 | "hsa_amd_memory_pool_get_info",27,4010,148,0.006528538635631156 33 | "hsa_queue_load_read_index_scacquire",48,3000,62,0.00488419349299089 34 | "hsa_amd_memory_copy_engine_status",2,2180,1090,0.0035491806049067127 35 | "hsa_agent_iterate_isas",1,1860,1860,0.0030281999656543513 36 | "hsa_amd_agent_memory_pool_get_info",9,1470,163,0.0023932548115655357 37 | "hsa_system_get_info",4,370,92,0.000602383864135543 38 | "hsa_system_get_major_extension_table",1,360,360,0.0005861032191589067 39 | -------------------------------------------------------------------------------- /day37/MultiStreams/results.stats.csv: -------------------------------------------------------------------------------- 1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage" 2 | "addition(float const*, float const*, float const*, float*, int, int, int)",8,42080,5260,100.0 3 | 
-------------------------------------------------------------------------------- /day38/myreduction.cpp: --------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 | #include <cfloat> // for FLT_MAX
4 | #include <vector>
5 | #ifndef WARP_SIZE
6 | #define WARP_SIZE 64
7 | #endif
8 | 
9 | #define HIPCHECK(error) \
10 | { \
11 | if ((error) != hipSuccess) \
12 | { \
13 | std::cerr << "HIP error: " << hipGetErrorString(error) \
14 | << " at line " << __LINE__ << std::endl; \
15 | exit(EXIT_FAILURE); \
16 | } \
17 | }
18 | 
19 | 
20 | template <typename scalar_t>
21 | __global__ void reduce_max_1d(const scalar_t *__restrict__ input,
22 | scalar_t *__restrict__ output,
23 | int n)
24 | {
25 | extern __shared__ float sdata[]; // one slot per warp
26 | const uint32_t tid = threadIdx.x;
27 | const uint32_t i = blockIdx.x * (blockDim.x * 2) + tid;
28 | const uint32_t lane = tid % WARP_SIZE;
29 | const uint32_t warp_id = tid / WARP_SIZE;
30 | float max_val = -FLT_MAX;
31 | if (i < n)
32 | max_val = input[i];
33 | if (i + blockDim.x < n)
34 | max_val = fmaxf(max_val, input[i + blockDim.x]);
35 | 
36 | for (uint32_t offset = WARP_SIZE / 2; offset > 0; offset /= 2) // warp-level reduction with shuffles
37 | {
38 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
39 | }
40 | 
41 | 
42 | if (lane == 0)
43 | {
44 | sdata[warp_id] = max_val;
45 | }
46 | __syncthreads();
47 | 
48 | 
49 | const uint32_t numWarps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
50 | if (tid < numWarps)
51 | {
52 | max_val = sdata[lane];
53 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
54 | {
55 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
56 | }
57 | if (lane == 0)
58 | sdata[tid] = max_val;
59 | }
60 | __syncthreads();
61 | 
62 | if (tid == 0)
63 | output[blockIdx.x] = sdata[0]; // per-block maximum
64 | }
65 | 
66 | 
67 | int main()
68 | {
69 | const int n = 102400;
70 | std::vector<float> h_input(n, 1.0f);
71 | h_input[500] = 133.0f;
72 | 
73 | const int threadsPerBlock = 256;
74 | const int blocks = (n + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2);
75 | const size_t sharedMemSize = ((threadsPerBlock + WARP_SIZE - 1) / WARP_SIZE) * sizeof(float);
76 | 
77 | float *d_input;
78 | float *d_output;
79 | HIPCHECK(hipMalloc(&d_input, n * sizeof(float)));
80 | HIPCHECK(hipMalloc(&d_output, blocks * sizeof(float))); // one partial maximum per block
81 | 
82 | HIPCHECK(hipMemcpy(d_input, h_input.data(), n * sizeof(float), hipMemcpyHostToDevice));
83 | 
84 | hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_max_1d<float>), dim3(blocks), dim3(threadsPerBlock), sharedMemSize, 0, d_input, d_output, n);
85 | 
86 | std::vector<float> h_partial(blocks);
87 | HIPCHECK(hipMemcpy(h_partial.data(), d_output, blocks * sizeof(float), hipMemcpyDeviceToHost));
88 | 
89 | float h_output = -FLT_MAX;
90 | for (int b = 0; b < blocks; b++)
91 | if (h_partial[b] > h_output) h_output = h_partial[b];
92 | 
93 | HIPCHECK(hipFree(d_input));
94 | HIPCHECK(hipFree(d_output));
95 | 
96 | std::cout << "Maximum value: " << h_output << std::endl;
97 | 
98 | return 0;
99 | }
-------------------------------------------------------------------------------- /day42/mat_mul.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | import triton
4 | import triton.language as tl
5 | 
6 | 
7 | DEVICE = torch.device("cuda:0")
8 | 
9 | 
10 | @triton.jit
11 | def matmul_kernel(
12 | a_ptr, b_ptr, c_ptr,
13 | M, N, K,
14 | stride_am, stride_ak,
15 | stride_bk, stride_bn,
16 | stride_cm, stride_cn,
17 | BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, #
18 | GROUP_SIZE_M: tl.constexpr, #
19 | ACTIVATION: tl.constexpr #
20 | ):
21 | pid = tl.program_id(axis=0)
22 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
23 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
24 |
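# Grouped ("swizzled") program ordering: consecutive program IDs first walk down a
# GROUP_SIZE_M-tall column of C tiles before moving to the next column, so programs that run
# concurrently reuse the same tiles of A and B from L2 cache.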
num_pid_in_group = GROUP_SIZE_M * num_pid_n 25 | 26 | group_id = pid // num_pid_in_group 27 | first_pid_m = group_id * GROUP_SIZE_M 28 | 29 | group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 30 | 31 | pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) 32 | pid_n = (pid % num_pid_in_group) // group_size_m 33 | 34 | offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M 35 | offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N 36 | 37 | offs_k = tl.arange(0, BLOCK_SIZE_K) 38 | 39 | a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) 40 | b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) 41 | 42 | accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) 43 | for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): 44 | 45 | a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) 46 | b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) 47 | 48 | accumulator = tl.dot(a, b, accumulator) 49 | 50 | a_ptrs += BLOCK_SIZE_K * stride_ak 51 | b_ptrs += BLOCK_SIZE_K * stride_bk 52 | if ACTIVATION == "leaky_relu": 53 | accumulator = leaky_relu(accumulator) 54 | c = accumulator.to(tl.float16) 55 | 56 | offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) 57 | offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) 58 | 59 | c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] 60 | c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) 61 | 62 | tl.store(c_ptrs, c, mask=c_mask) 63 | 64 | 65 | @triton.jit 66 | def leaky_relu(x): 67 | return tl.where(x >= 0, x, 0.01 * x) 68 | 69 | 70 | def matmul(a, b, activation=""): 71 | assert a.shape[1] == b.shape[0], "Incompatible dimensions" 72 | assert a.is_contiguous(), "Matrix A must be contiguous" 73 | M, K = a.shape 74 | K, N = b.shape 75 | 76 | c = torch.empty((M, N), device=a.device, dtype=torch.float16) 77 | 78 | grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) 79 | 80 | matmul_kernel[grid]( 81 | a, b, c, 82 | M, N, K, 83 | a.stride(0), a.stride(1), 84 | b.stride(0), b.stride(1), 85 | c.stride(0), c.stride(1), 86 | ACTIVATION=activation 87 | ) 88 | return c 89 | 90 | -------------------------------------------------------------------------------- /day42/mat_mul_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def get_1d_offest(size,n_prev_chunks): 8 | return n_prev_chunks * size + tl.arange(0,size) 9 | 10 | @triton.jit 11 | def get_2d_offest(offs_0,offs_1,stride_0,stride_1): 12 | return tl.expand_dims(offs_0,1)*stride_0 + tl.expand_dims(offs_1,0)*stride_1 13 | 14 | @triton.jit 15 | def get_1d_mask(offs,max): 16 | return offs 2 | 3 | namespace cg = cooperative_groups; 4 | 5 | template 6 | __device__ T reduce_sum(cg::thread_block_tile<32>& group, T val) { 7 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 8 | T temp = group.shfl_down(val, offset); 9 | val += temp; 10 | } 11 | return val; 12 | } 13 | 14 | template 15 | __device__ T reduce_max(cg::thread_block_tile<32>& group, T val) { 16 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 17 | T temp = group.shfl_down(val, offset); 18 | val = max(val, temp); 19 | } 20 | return val; 21 | } 22 | 23 | template 24 | __device__ T reduce_sum(cg::thread_block_tile<64>& group, T val) { 25 | for (int offset = group.size()/2; offset > 0; 
offset /= 2) { 26 | T temp = group.shfl_down(val, offset); 27 | val += temp; 28 | } 29 | return val; 30 | } 31 | 32 | template 33 | __device__ T reduce_max(cg::thread_block_tile<64>& group, T val) { 34 | for (int offset = group.size()/2; offset > 0; offset /= 2) { 35 | T temp = group.shfl_down(val, offset); 36 | val = max(val, temp); 37 | } 38 | return val; 39 | } 40 | 41 | __device__ cg::thread_block this_thread_block() { 42 | return cg::this_thread_block(); 43 | } 44 | 45 | template 46 | __device__ cg::thread_block_tile tiled_partition(cg::thread_block& block) { 47 | return cg::tiled_partition(block); 48 | } -------------------------------------------------------------------------------- /day48/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def gelu_kernel( 7 | x_ptr, 8 | output_ptr, 9 | n_elements, 10 | BLOCK_SIZE: tl.constexpr, 11 | ): 12 | pid = tl.program_id(axis=0) 13 | block_start = pid * BLOCK_SIZE 14 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 15 | 16 | mask = offsets < n_elements 17 | 18 | x = tl.load(x_ptr + offsets, mask=mask) 19 | 20 | coeff1 = 0.7978845608028654 21 | coeff2 = 0.044715 22 | x_cubed = x * x * x 23 | inner = coeff1 * (x + coeff2 * x_cubed) 24 | tanh = tl.math.tanh(inner) 25 | output = 0.5 * x * (1.0 + tanh) 26 | 27 | tl.store(output_ptr + offsets, output, mask=mask) 28 | 29 | def fused_gelu(x: torch.Tensor): 30 | output = torch.empty_like(x) 31 | 32 | n_elements = x.numel() 33 | 34 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 35 | 36 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024) 37 | 38 | return output 39 | 40 | if __name__ == "__main__": 41 | torch.manual_seed(0) 42 | 43 | x = torch.randn(1000000, device='cuda') 44 | 45 | triton_output = fused_gelu(x) 46 | 47 | torch_output = torch.nn.functional.gelu(x) 48 | 49 | print(f"Maximum absolute error: {torch.max(torch.abs(triton_output - torch_output)):.2e}") 50 | print(f"Results match: {torch.allclose(triton_output, torch_output, atol=1e-5)}") -------------------------------------------------------------------------------- /day49/kernel.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | import time 5 | 6 | @triton.jit 7 | def fused_bias_skip_gelu_scale_kernel( 8 | x_ptr, 9 | bias_ptr, 10 | skip_ptr, 11 | gamma_ptr, 12 | y_ptr, 13 | n_elements: tl.constexpr 14 | ): 15 | BLOCK_SIZE = 1024 16 | pid = tl.program_id(0) 17 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < n_elements 19 | x = tl.load(x_ptr + offsets, mask=mask) 20 | bias = tl.load(bias_ptr + offsets, mask=mask) 21 | skip = tl.load(skip_ptr + offsets, mask=mask) 22 | gamma = tl.load(gamma_ptr + offsets, mask=mask) 23 | temp = x + bias + skip 24 | gelu = 0.5 * temp * (1.0 + tl.tanh(0.7978845608028654 * (temp + 0.044715 * temp * temp * temp))) 25 | out = gelu * gamma 26 | tl.store(y_ptr + offsets, out, mask=mask) 27 | 28 | def test_fused_kernel(): 29 | n_elements = 2048 30 | BLOCK_SIZE = 1024 31 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32) 32 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32) 33 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32) 34 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32) 35 | y = torch.empty_like(x) 36 | grid = ((n_elements + BLOCK_SIZE - 1) // 
BLOCK_SIZE,) 37 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 38 | torch.cuda.synchronize() 39 | temp = x + bias + skip 40 | gelu = 0.5 * temp * (1.0 + torch.tanh(0.7978845608028654 * (temp + 0.044715 * temp ** 3))) 41 | ref = gelu * gamma 42 | if torch.allclose(y, ref, atol=1e-6): 43 | print("Test passed! Kernel output matches reference.") 44 | else: 45 | print("Test failed! Maximum absolute error:", (y - ref).abs().max().item()) 46 | 47 | def benchmark_kernel(): 48 | n_elements = 2048 49 | BLOCK_SIZE = 1024 50 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32) 51 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32) 52 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32) 53 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32) 54 | y = torch.empty_like(x) 55 | grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE,) 56 | for _ in range(10): 57 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 58 | torch.cuda.synchronize() 59 | n_iter = 100 60 | start = time.time() 61 | for _ in range(n_iter): 62 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements) 63 | torch.cuda.synchronize() 64 | end = time.time() 65 | avg_time_ms = (end - start) / n_iter * 1000 66 | print(f"Average kernel time over {n_iter} iterations: {avg_time_ms:.3f} ms") 67 | 68 | if __name__ == "__main__": 69 | test_fused_kernel() 70 | benchmark_kernel() 71 | -------------------------------------------------------------------------------- /day50/tritonnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def gelu_kernel( 7 | x_ptr, 8 | output_ptr, 9 | n_elements, 10 | BLOCK_SIZE: tl.constexpr, 11 | ): 12 | pid = tl.program_id(axis=0) 13 | block_start = pid * BLOCK_SIZE 14 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 15 | mask = offsets < n_elements 16 | x = tl.load(x_ptr + offsets, mask=mask) 17 | sqrt_2_over_pi = tl.sqrt(2.0 / tl.math.pi) 18 | cdf = 0.5 * (1.0 + tl.tanh(sqrt_2_over_pi * (x + 0.044715 * (x ** 3)))) 19 | output = x * cdf 20 | tl.store(output_ptr + offsets, output, mask=mask) 21 | 22 | @triton.jit 23 | def fused_add_multiply_kernel( 24 | a_ptr, b_ptr, c_ptr, output_ptr, 25 | n_elements, 26 | BLOCK_SIZE: tl.constexpr, 27 | ): 28 | pid = tl.program_id(axis=0) 29 | block_start = pid * BLOCK_SIZE 30 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 31 | mask = offsets < n_elements 32 | a = tl.load(a_ptr + offsets, mask=mask) 33 | b = tl.load(b_ptr + offsets, mask=mask) 34 | c = tl.load(c_ptr + offsets, mask=mask) 35 | output = (a + b) * c 36 | tl.store(output_ptr + offsets, output, mask=mask) 37 | 38 | class GELUTriton(torch.autograd.Function): 39 | @staticmethod 40 | def forward(ctx, x): 41 | x = x.contiguous() 42 | output = torch.empty_like(x) 43 | n_elements = output.numel() 44 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 45 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024) 46 | ctx.save_for_backward(x) 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, grad_output): 51 | x, = ctx.saved_tensors 52 | with torch.enable_grad(): 53 | x = x.detach().requires_grad_() 54 | with torch.cuda.amp.autocast(): 55 | output = GELUTriton.apply(x) 56 | grad_input = torch.autograd.grad( 57 | output, x, grad_output, create_graph=True 58 | )[0] 59 | return grad_input 60 | 61 | def fused_add_multiply(a, b, c): 62 
| assert a.shape == b.shape == c.shape 63 | a, b, c = a.contiguous(), b.contiguous(), c.contiguous() 64 | output = torch.empty_like(a) 65 | n_elements = output.numel() 66 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 67 | fused_add_multiply_kernel[grid](a, b, c, output, n_elements, BLOCK_SIZE=1024) 68 | return output 69 | 70 | class TritonNN(torch.nn.Module): 71 | def __init__(self, in_features, hidden_size, out_features): 72 | super().__init__() 73 | self.fc1 = torch.nn.Linear(in_features, hidden_size) 74 | self.fc2 = torch.nn.Linear(hidden_size, out_features) 75 | 76 | def forward(self, x): 77 | x = self.fc1(x) 78 | x = GELUTriton.apply(x) 79 | 80 | residual = x 81 | a = x 82 | b = torch.ones_like(x) * 0.5 83 | c = torch.ones_like(x) * 1.5 84 | x = fused_add_multiply(a, b, c) 85 | x += residual 86 | 87 | return self.fc2(x) 88 | 89 | if __name__ == "__main__": 90 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 91 | model = TritonNN(784, 256, 10).to(device) 92 | x = torch.randn(32, 784).to(device) 93 | output = model(x) 94 | print("Output shape:", output.shape) 95 | print("Output values:", output[0, :5]) -------------------------------------------------------------------------------- /day52/functionsused.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import numpy as np 3 | 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def init_matrix(matrix, seed: tl.constexpr): 8 | idx = tl.arange(0, matrix.shape[0]) 9 | matrix[idx] = tl.random(seed + idx) 10 | 11 | @triton.jit 12 | def add_matrices(a, b, result): 13 | idx = tl.arange(0, a.shape[0]) 14 | result[idx] = a[idx] + b[idx] 15 | 16 | @triton.jit 17 | def multiply_matrices(a, b, result): 18 | idx = tl.arange(0, a.shape[0]) 19 | result[idx] = a[idx] * b[idx] 20 | 21 | @triton.jit 22 | def transpose_matrix(matrix, result): 23 | idx = tl.arange(0, matrix.shape[0]) 24 | idy = tl.arange(0, matrix.shape[1]) 25 | result[idy, idx] = matrix[idx, idy] 26 | 27 | @triton.jit 28 | def matmul_kernel(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): 29 | pid = tl.program_id(0) 30 | row = pid // N 31 | col = pid % N 32 | 33 | acc = 0.0 34 | for k in range(K): 35 | acc += a[row, k] * b[k, col] 36 | 37 | c[row, col] = acc 38 | 39 | if __name__ == "__main__": 40 | 41 | M, N, K = 128, 128, 128 42 | a = np.random.rand(M, K).astype(np.float32) 43 | b = np.random.rand(K, N).astype(np.float32) 44 | c = np.zeros((M, N), dtype=np.float32) 45 | 46 | a_dev = triton.testing.to_device(a) 47 | b_dev = triton.testing.to_device(b) 48 | c_dev = triton.testing.to_device(c) 49 | 50 | grid = (M * N,) 51 | matmul_kernel[grid](a_dev, b_dev, c_dev, M, N, K) 52 | 53 | c = c_dev.cpu() 54 | print(c) -------------------------------------------------------------------------------- /day54/softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def softmax_kernel( 7 | output_ptr, input_ptr, 8 | input_row_stride, output_row_stride, 9 | n_cols, 10 | BLOCK_SIZE: tl.constexpr 11 | ): 12 | row_idx = tl.program_id(0) 13 | row_start = row_idx * input_row_stride 14 | 15 | col_offsets = tl.arange(0, BLOCK_SIZE) 16 | input_ptrs = input_ptr + row_start + col_offsets 17 | row_mask = col_offsets < n_cols 18 | 19 | row = tl.load(input_ptrs, mask=row_mask, other=-float('inf')) 20 | row_minus_max = row - tl.max(row, axis=0) 21 | numerator = tl.exp(row_minus_max) 22 | 
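# Subtracting the per-row maximum above keeps tl.exp from overflowing; the shift cancels out
# once the exponentials are normalized by their sum below.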
denominator = tl.sum(numerator, axis=0) 23 | softmax_output = numerator / denominator 24 | 25 | output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets 26 | tl.store(output_ptrs, softmax_output, mask=row_mask) 27 | 28 | def triton_softmax(x): 29 | n_rows, n_cols = x.shape 30 | BLOCK_SIZE = triton.next_power_of_2(n_cols) 31 | 32 | y = torch.empty_like(x) 33 | assert x.is_cuda and y.is_cuda 34 | 35 | num_warps = 4 36 | if BLOCK_SIZE >= 2048: 37 | num_warps = 8 38 | if BLOCK_SIZE >= 4096: 39 | num_warps = 16 40 | 41 | softmax_kernel[(n_rows,)]( 42 | y, x, 43 | x.stride(0), y.stride(0), 44 | n_cols, 45 | BLOCK_SIZE=BLOCK_SIZE, 46 | num_warps=num_warps 47 | ) 48 | return y 49 | 50 | x = torch.randn(10000, 1000, device='cuda') 51 | triton_result = triton_softmax(x) 52 | torch_result = torch.softmax(x, axis=1) 53 | 54 | print(f"Max error: {torch.max(torch.abs(triton_result - torch_result)):.2e}") -------------------------------------------------------------------------------- /day57/main.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | @triton.jit 5 | def fused_linear_xentropy_forward( 6 | input_ptr, weight_ptr, bias_ptr, target_ptr, loss_ptr, 7 | batch_size, in_features, out_features, 8 | stride_input_batch, stride_input_feature, 9 | stride_weight_out, stride_weight_in, 10 | stride_bias_out, 11 | BLOCK_SIZE_IN: tl.constexpr, 12 | BLOCK_SIZE_OUT: tl.constexpr, 13 | ): 14 | pid = tl.program_id(0) 15 | if pid >= batch_size: 16 | return 17 | 18 | input_row = input_ptr + pid * stride_input_batch 19 | target = tl.load(target_ptr + pid) 20 | 21 | logits = tl.zeros((BLOCK_SIZE_OUT,), dtype=tl.float32) 22 | 23 | for i in range(0, in_features, BLOCK_SIZE_IN): 24 | input_offsets = i + tl.arange(0, BLOCK_SIZE_IN) 25 | input_mask = input_offsets < in_features 26 | current_input = tl.load(input_row + input_offsets, mask=input_mask, other=0.0) 27 | 28 | weight_offsets = (i + tl.arange(0, BLOCK_SIZE_IN))[None, :] * stride_weight_in + \ 29 | tl.arange(0, BLOCK_SIZE_OUT)[:, None] * stride_weight_out 30 | weight_mask = (input_mask[None, :]) & (tl.arange(0, BLOCK_SIZE_OUT)[:, None] < out_features) 31 | current_weight = tl.load(weight_ptr + weight_offsets, mask=weight_mask, other=0.0) 32 | 33 | logits += tl.sum(current_input[None, :] * current_weight, axis=1) 34 | 35 | bias_offsets = tl.arange(0, BLOCK_SIZE_OUT) * stride_bias_out 36 | bias_mask = tl.arange(0, BLOCK_SIZE_OUT) < out_features 37 | bias = tl.load(bias_ptr + bias_offsets, mask=bias_mask, other=0.0) 38 | logits += bias 39 | 40 | max_logit = tl.max(logits, axis=0) 41 | exp_logits = tl.exp(logits - max_logit) 42 | sum_exp = tl.sum(exp_logits, axis=0) 43 | log_sum_exp = tl.log(sum_exp) 44 | log_probs = logits - max_logit - log_sum_exp 45 | 46 | target_mask = tl.arange(0, BLOCK_SIZE_OUT) == target 47 | contribution = -tl.sum(log_probs * target_mask, axis=0) 48 | tl.atomic_add(loss_ptr, contribution / batch_size) 49 | 50 | def fused_linear_cross_entropy( 51 | input: torch.Tensor, 52 | weight: torch.Tensor, 53 | bias: torch.Tensor, 54 | target: torch.Tensor 55 | ) -> torch.Tensor: 56 | assert input.is_cuda and weight.is_cuda and bias.is_cuda and target.is_cuda 57 | batch_size, in_features = input.shape 58 | out_features, _ = weight.shape 59 | 60 | loss = torch.zeros(1, device=input.device, dtype=torch.float32) 61 | 62 | BLOCK_SIZE_IN = 128 63 | BLOCK_SIZE_OUT = triton.next_power_of_2(out_features) 64 | if BLOCK_SIZE_OUT > 4096: 65 | raise 
ValueError("Too many output features for this kernel implementation") 66 | 67 | grid = (batch_size,) 68 | fused_linear_xentropy_forward[grid]( 69 | input_ptr=input, 70 | weight_ptr=weight, 71 | bias_ptr=bias, 72 | target_ptr=target, 73 | loss_ptr=loss, 74 | batch_size=batch_size, 75 | in_features=in_features, 76 | out_features=out_features, 77 | stride_input_batch=input.stride(0), 78 | stride_input_feature=input.stride(1), 79 | stride_weight_out=weight.stride(0), 80 | stride_weight_in=weight.stride(1), 81 | stride_bias_out=bias.stride(0), 82 | BLOCK_SIZE_IN=BLOCK_SIZE_IN, 83 | BLOCK_SIZE_OUT=BLOCK_SIZE_OUT, 84 | ) 85 | return loss -------------------------------------------------------------------------------- /day58/layer_norm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void layer_norm_kernel( 4 | float* output, 5 | const float* input, 6 | const float* gamma, 7 | const float* beta, 8 | int batch_size, 9 | int hidden_size, 10 | float epsilon) 11 | { 12 | extern __shared__ float shared[]; 13 | int batch_idx = blockIdx.x; 14 | int tid = threadIdx.x; 15 | 16 | if (batch_idx >= batch_size) return; 17 | 18 | float* sum = shared; 19 | float* sum_sq = &shared[blockDim.x]; 20 | 21 | float thread_sum = 0.0f; 22 | for (int i = tid; i < hidden_size; i += blockDim.x) { 23 | float val = input[batch_idx * hidden_size + i]; 24 | thread_sum += val; 25 | } 26 | sum[tid] = thread_sum; 27 | __syncthreads(); 28 | 29 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 30 | if (tid < stride) { 31 | sum[tid] += sum[tid + stride]; 32 | } 33 | __syncthreads(); 34 | } 35 | float mean = sum[0] / hidden_size; 36 | 37 | float thread_sum_sq = 0.0f; 38 | for (int i = tid; i < hidden_size; i += blockDim.x) { 39 | float val = input[batch_idx * hidden_size + i]; 40 | float diff = val - mean; 41 | thread_sum_sq += diff * diff; 42 | } 43 | sum_sq[tid] = thread_sum_sq; 44 | __syncthreads(); 45 | 46 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { 47 | if (tid < stride) { 48 | sum_sq[tid] += sum_sq[tid + stride]; 49 | } 50 | __syncthreads(); 51 | } 52 | float variance = sum_sq[0] / hidden_size + epsilon; 53 | float inv_std = rsqrtf(variance); 54 | 55 | for (int i = tid; i < hidden_size; i += blockDim.x) { 56 | float val = input[batch_idx * hidden_size + i]; 57 | float normalized = (val - mean) * inv_std; 58 | output[batch_idx * hidden_size + i] = normalized * gamma[i] + beta[i]; 59 | } 60 | } 61 | 62 | void layer_norm_hip( 63 | float* output, 64 | const float* input, 65 | const float* gamma, 66 | const float* beta, 67 | int batch_size, 68 | int hidden_size, 69 | float epsilon, 70 | hipStream_t stream) 71 | { 72 | dim3 blocks(batch_size); 73 | dim3 threads(256); 74 | size_t shared_mem = 2 * threads.x * sizeof(float); 75 | 76 | hipLaunchKernelGGL( 77 | layer_norm_kernel, 78 | blocks, threads, shared_mem, stream, 79 | output, input, gamma, beta, 80 | batch_size, hidden_size, epsilon 81 | ); 82 | } -------------------------------------------------------------------------------- /day60/fused.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def _fused_skip_act_norm_dropout_kernel( 7 | input_ptr, skip_ptr, output_ptr, 8 | weight_ptr, bias_ptr, 9 | M, N, 10 | stride_input, stride_skip, stride_output, 11 | dropout_p, seed, 12 | eps, 13 | is_training, 14 | BLOCK_SIZE: tl.constexpr, 15 | ): 16 | pid = 
tl.program_id(0) 17 | 18 | offsets = pid * stride_input + tl.arange(0, BLOCK_SIZE) 19 | mask = tl.arange(0, BLOCK_SIZE) < N 20 | 21 | input = tl.load(input_ptr + offsets, mask=mask, other=0.0) 22 | skip = tl.load(skip_ptr + offsets, mask=mask, other=0.0) 23 | 24 | summed = input + skip 25 | 26 | mean = tl.sum(summed, axis=0) / N 27 | centered = summed - mean 28 | var = tl.sum(centered * centered, axis=0) / N 29 | inv_std = 1.0 / tl.sqrt(var + eps) 30 | 31 | normalized = centered * inv_std 32 | 33 | if weight_ptr is not None: 34 | weight = tl.load(weight_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=1.0) 35 | normalized *= weight 36 | if bias_ptr is not None: 37 | bias = tl.load(bias_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=0.0) 38 | normalized += bias 39 | 40 | gelu = normalized * 0.5 * (1.0 + tl.erf(normalized / tl.sqrt(2.0))) 41 | 42 | if is_training: 43 | dropout_mask = tl.rand(seed, offsets) > dropout_p 44 | gelu = tl.where(dropout_mask, gelu / (1 - dropout_p), 0.0) 45 | 46 | tl.store(output_ptr + offsets, gelu, mask=mask) 47 | 48 | class FusedSkipNormActDropout(torch.autograd.Function): 49 | @staticmethod 50 | def forward(ctx, input, skip, weight, bias, p, training, eps): 51 | assert input.shape == skip.shape 52 | M, N = input.shape 53 | output = torch.empty_like(input) 54 | 55 | BLOCK_SIZE = triton.next_power_of_2(N) 56 | 57 | seed = torch.randint(0, 2**31, (1,)).item() 58 | 59 | grid = (M,) 60 | _fused_skip_act_norm_dropout_kernel[grid]( 61 | input, skip, output, 62 | weight if weight is not None else None, 63 | bias if bias is not None else None, 64 | M, N, 65 | input.stride(0), skip.stride(0), output.stride(0), 66 | dropout_p=p, 67 | seed=seed, 68 | eps=eps, 69 | is_training=training, 70 | BLOCK_SIZE=BLOCK_SIZE, 71 | ) 72 | 73 | ctx.training = training 74 | ctx.p = p 75 | ctx.eps = eps 76 | ctx.save_for_backward(input, skip, weight, bias, output) 77 | 78 | return output 79 | 80 | @staticmethod 81 | def backward(ctx, grad_output): 82 | raise NotImplementedError("Backward not implemented for this fused operation") 83 | 84 | def fused_skip_norm_act_dropout( 85 | input: torch.Tensor, 86 | skip: torch.Tensor, 87 | weight: torch.Tensor = None, 88 | bias: torch.Tensor = None, 89 | p: float = 0.5, 90 | training: bool = False, 91 | eps: float = 1e-5 92 | ) -> torch.Tensor: 93 | return FusedSkipNormActDropout.apply(input, skip, weight, bias, p, training, eps) -------------------------------------------------------------------------------- /day64/main.py: -------------------------------------------------------------------------------- 1 | import torch, time 2 | import torch.nn.functional as F 3 | import triton 4 | import triton.language as tl 5 | 6 | @triton.jit 7 | def geglu_kernel(input_ptr, output_ptr, numel: tl.constexpr, D: tl.constexpr, BLOCK_SIZE: tl.constexpr): 8 | pid = tl.program_id(0) 9 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 10 | mask = offsets < numel 11 | 12 | row = offsets // D 13 | col = offsets % D 14 | base_offset = row * (2 * D) 15 | x = tl.load(input_ptr + base_offset + col, mask=mask) 16 | gate = tl.load(input_ptr + base_offset + D + col, mask=mask) 17 | 18 | t = 0.7978845608 * (gate + 0.044715 * gate * gate * gate) 19 | exp_2t = tl.exp(2 * t) 20 | tanh_t = (exp_2t - 1.0) / (exp_2t + 1.0) 21 | gelu_gate = 0.5 * gate * (1.0 + tanh_t) 22 | out = x * gelu_gate 23 | 24 | tl.store(output_ptr + offsets, out, mask=mask) 25 | 26 | def fused_geglu(input_tensor): 27 | N, twoD = input_tensor.shape 28 | D = twoD // 2 29 | output = torch.empty((N, D), 
device=input_tensor.device, dtype=input_tensor.dtype) 30 | numel = N * D 31 | BLOCK_SIZE = 256 32 | grid = lambda meta: ((numel + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],) 33 | geglu_kernel[grid](input_tensor, output, numel, D, BLOCK_SIZE) 34 | return output 35 | 36 | def torch_geglu(input_tensor): 37 | x, gate = input_tensor.chunk(2, dim=-1) 38 | return x * F.gelu(gate) 39 | 40 | input_tensor = torch.randn(8192, 8192, device='cuda') 41 | 42 | _ = fused_geglu(input_tensor) 43 | _ = torch_geglu(input_tensor) 44 | 45 | torch.cuda.synchronize() 46 | start = time.time() 47 | for _ in range(100): 48 | _ = fused_geglu(input_tensor) 49 | torch.cuda.synchronize() 50 | fused_time = time.time() - start 51 | 52 | torch.cuda.synchronize() 53 | start = time.time() 54 | for _ in range(100): 55 | _ = torch_geglu(input_tensor) 56 | torch.cuda.synchronize() 57 | torch_time = time.time() - start 58 | 59 | print("Fused Triton kernel time: {:.6f} sec".format(fused_time)) 60 | print("Torch baseline time: {:.6f} sec".format(torch_time)) 61 | -------------------------------------------------------------------------------- /day67/lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def lora_kernel( 7 | y_ptr, x_ptr, w_ptr, a_ptr, b_ptr, 8 | M, N, K, R, 9 | stride_ym, stride_yn, 10 | stride_xm, stride_xk, 11 | stride_wk, stride_wn, 12 | stride_ak, stride_ar, 13 | stride_br, stride_bn, 14 | BLOCK_SIZE_M: tl.constexpr, 15 | BLOCK_SIZE_N: tl.constexpr, 16 | BLOCK_SIZE_K: tl.constexpr, 17 | BLOCK_SIZE_R: tl.constexpr, 18 | ): 19 | pid = tl.program_id(0) 20 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) 21 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) 22 | pid_m = pid // num_pid_n 23 | pid_n = pid % num_pid_n 24 | 25 | offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) 26 | offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) 27 | offs_k = tl.arange(0, BLOCK_SIZE_K) 28 | offs_r = tl.arange(0, BLOCK_SIZE_R) 29 | 30 | y_ptrs = y_ptr + offs_m[:, None] * stride_ym + offs_n[None, :] * stride_yn 31 | mask_m = (offs_m < M)[:, None] 32 | mask_n = (offs_n < N)[None, :] 33 | mask_y = mask_m & mask_n 34 | 35 | acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) 36 | 37 | for k in range(0, K, BLOCK_SIZE_K): 38 | x_ptrs = x_ptr + offs_m[:, None] * stride_xm + (k + offs_k)[None, :] * stride_xk 39 | mask_x = (offs_m < M)[:, None] & ((k + offs_k) < K)[None, :] 40 | x = tl.load(x_ptrs, mask=mask_x, other=0.0) 41 | 42 | w_ptrs = w_ptr + (k + offs_k)[:, None] * stride_wk + offs_n[None, :] * stride_wn 43 | mask_w = ((k + offs_k) < K)[:, None] & (offs_n < N)[None, :] 44 | w = tl.load(w_ptrs, mask=mask_w, other=0.0) 45 | 46 | ab = tl.zeros((BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=tl.float32) 47 | for r in range(0, R, BLOCK_SIZE_R): 48 | a_ptrs = a_ptr + (k + offs_k)[:, None] * stride_ak + (r + offs_r)[None, :] * stride_ar 49 | mask_a = ((k + offs_k) < K)[:, None] & ((r + offs_r) < R)[None, :] 50 | a = tl.load(a_ptrs, mask=mask_a, other=0.0) 51 | 52 | b_ptrs = b_ptr + (r + offs_r)[:, None] * stride_br + offs_n[None, :] * stride_bn 53 | mask_b = ((r + offs_r) < R)[:, None] & (offs_n < N)[None, :] 54 | b = tl.load(b_ptrs, mask=mask_b, other=0.0) 55 | 56 | ab += tl.dot(a.to(tl.float32), b.to(tl.float32)) 57 | 58 | w_eff = w.to(tl.float32) + ab 59 | acc += tl.dot(x.to(tl.float32), w_eff) 60 | 61 | tl.store(y_ptrs, acc.to(tl.float16), mask=mask_y) 62 | 63 | def lora_matmul(x, W, A, B): 64 | M, K = x.shape 65 
| _, N = W.shape 66 | R = A.shape[1] 67 | y = torch.empty((M, N), device=x.device, dtype=x.dtype) 68 | 69 | BLOCK_SIZE_M = 64 70 | BLOCK_SIZE_N = 64 71 | BLOCK_SIZE_K = 32 72 | BLOCK_SIZE_R = 32 73 | 74 | grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N, meta['BLOCK_SIZE_N']),) 75 | 76 | lora_kernel[grid]( 77 | y, x, W, A, B, 78 | M, N, K, R, 79 | y.stride(0), y.stride(1), 80 | x.stride(0), x.stride(1), 81 | W.stride(0), W.stride(1), 82 | A.stride(0), A.stride(1), 83 | B.stride(0), B.stride(1), 84 | BLOCK_SIZE_M=BLOCK_SIZE_M, 85 | BLOCK_SIZE_N=BLOCK_SIZE_N, 86 | BLOCK_SIZE_K=BLOCK_SIZE_K, 87 | BLOCK_SIZE_R=BLOCK_SIZE_R, 88 | ) 89 | return y 90 | -------------------------------------------------------------------------------- /day68/adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def adam_fp8_kernel( 7 | param_ptr, grad_ptr, m_ptr, v_ptr, lr_ptr, 8 | beta1, beta2, eps, step, 9 | BLOCK_SIZE: tl.constexpr 10 | ): 11 | pid = tl.program_id(axis=0) 12 | offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 13 | mask = offset < tl.numel(param_ptr) 14 | 15 | param = tl.load(param_ptr + offset, mask=mask, other=0.0).to(tl.float16) 16 | grad = tl.load(grad_ptr + offset, mask=mask, other=0.0).to(tl.float16) 17 | m = tl.load(m_ptr + offset, mask=mask, other=0.0).to(tl.float16) 18 | v = tl.load(v_ptr + offset, mask=mask, other=0.0).to(tl.float16) 19 | lr = tl.load(lr_ptr + offset, mask=mask, other=0.0).to(tl.float16) 20 | 21 | m_new = beta1 * m + (1 - beta1) * grad 22 | v_new = beta2 * v + (1 - beta2) * grad * grad 23 | m_hat = m_new / (1 - beta1 ** step) 24 | v_hat = v_new / (1 - beta2 ** step) 25 | update = m_hat / (tl.sqrt(v_hat) + eps) 26 | param_new = param - lr * update 27 | 28 | param_new_fp8 = param_new.to(tl.float8_e4m3) 29 | m_new_fp8 = m_new.to(tl.float8_e4m3) 30 | v_new_fp8 = v_new.to(tl.float8_e4m3) 31 | 32 | tl.store(param_ptr + offset, param_new_fp8, mask=mask) 33 | tl.store(m_ptr + offset, m_new_fp8, mask=mask) 34 | tl.store(v_ptr + offset, v_new_fp8, mask=mask) 35 | 36 | def adam_fp8(param, grad, m, v, lr, beta1=0.9, beta2=0.999, eps=1e-8, step=1): 37 | BLOCK_SIZE = 1024 38 | n = param.numel() 39 | grid = lambda meta: (triton.cdiv(n, meta['BLOCK_SIZE']),) 40 | adam_fp8_kernel[grid]( 41 | param, grad, m, v, lr, 42 | beta1, beta2, eps, step, 43 | BLOCK_SIZE=BLOCK_SIZE 44 | ) 45 | -------------------------------------------------------------------------------- /day69/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def reduce_kernel(K, V, A_ptr, b_ptr, N: tl.constexpr, D: tl.constexpr): 7 | acc_A = tl.zeros([D, D], dtype=tl.float32) 8 | acc_b = tl.zeros([D], dtype=tl.float32) 9 | for j in range(N): 10 | k = tl.load(K + j * D) 11 | v = tl.load(V + j * D) 12 | k_phi = tl.relu(k) + 1.0 13 | for i in range(D): 14 | for jj in range(D): 15 | acc_A[i, jj] += k_phi[i] * v[jj] 16 | acc_b += k_phi 17 | tl.store(A_ptr, acc_A) 18 | tl.store(b_ptr, acc_b) 19 | 20 | @triton.jit 21 | def attention_kernel(Q, A_ptr, b_ptr, Out, N: tl.constexpr, D: tl.constexpr): 22 | pid = tl.program_id(0) 23 | q = tl.load(Q + pid * D) 24 | q_phi = tl.relu(q) + 1.0 25 | out_vec = tl.zeros([D], dtype=tl.float32) 26 | for i in range(D): 27 | a_row = tl.load(A_ptr + i * D) 28 | out_vec[i] = tl.dot(q_phi, a_row) 29 | denom = tl.dot(q_phi, 
tl.load(b_ptr)) 30 | tl.store(Out + pid * D, out_vec / denom) 31 | 32 | def linear_attention(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor: 33 | assert Q.is_cuda and K.is_cuda and V.is_cuda, "Input tensors must be on CUDA" 34 | N, D = Q.shape 35 | A = torch.empty((D, D), device='cuda', dtype=torch.float32) 36 | b = torch.empty((D,), device='cuda', dtype=torch.float32) 37 | reduce_kernel[(1,)](K, V, A, b, N, D) 38 | Out = torch.empty_like(Q) 39 | attention_kernel[(N,)](Q, A, b, Out, N, D) 40 | return Out 41 | 42 | if __name__ == "__main__": 43 | N = 1024 44 | D = 64 45 | Q = torch.randn((N, D), device='cuda', dtype=torch.float32) 46 | K = torch.randn((N, D), device='cuda', dtype=torch.float32) 47 | V = torch.randn((N, D), device='cuda', dtype=torch.float32) 48 | Out = linear_attention(Q, K, V) 49 | print("Output shape:", Out.shape) 50 | -------------------------------------------------------------------------------- /day72/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def sgd_kernel( 7 | param_ptr, 8 | grad_ptr, 9 | momentum_ptr, 10 | lr, 11 | weight_decay, 12 | momentum_factor, 13 | dampening, 14 | nesterov, 15 | n_elements, 16 | BLOCK_SIZE: tl.constexpr, 17 | ): 18 | pid = tl.program_id(axis=0) 19 | block_start = pid * BLOCK_SIZE 20 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 21 | mask = offsets < n_elements 22 | params = tl.load(param_ptr + offsets, mask=mask) 23 | grads = tl.load(grad_ptr + offsets, mask=mask) 24 | if weight_decay != 0.0: 25 | grads = grads + weight_decay * params 26 | if momentum_factor != 0.0: 27 | momentum_buf = tl.load(momentum_ptr + offsets, mask=mask) 28 | momentum_buf = momentum_factor * momentum_buf + (1.0 - dampening) * grads 29 | tl.store(momentum_ptr + offsets, momentum_buf, mask=mask) 30 | if nesterov: 31 | grads = grads + momentum_factor * momentum_buf 32 | else: 33 | grads = momentum_buf 34 | params = params - lr * grads 35 | tl.store(param_ptr + offsets, params, mask=mask) 36 | 37 | def sgd_update( 38 | params, 39 | grads, 40 | momentum_buffer=None, 41 | lr=0.01, 42 | weight_decay=0.0, 43 | momentum=0.0, 44 | dampening=0.0, 45 | nesterov=False, 46 | ): 47 | n_elements = params.numel() 48 | if momentum != 0.0 and momentum_buffer is None: 49 | momentum_buffer = torch.zeros_like(params) 50 | BLOCK_SIZE = 1024 51 | grid = (n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE 52 | sgd_kernel[grid, 1]( 53 | params.data_ptr(), 54 | grads.data_ptr(), 55 | momentum_buffer.data_ptr() if momentum != 0.0 else 0, 56 | lr, 57 | weight_decay, 58 | momentum, 59 | dampening, 60 | 1 if nesterov else 0, 61 | n_elements, 62 | BLOCK_SIZE, 63 | ) 64 | return params, momentum_buffer 65 | 66 | def example(): 67 | params = torch.randn(10000, device='cuda') 68 | grads = torch.randn(10000, device='cuda') 69 | momentum_buffer = torch.zeros_like(params) 70 | updated_params, updated_momentum = sgd_update( 71 | params, 72 | grads, 73 | momentum_buffer, 74 | lr=0.01, 75 | weight_decay=0.0001, 76 | momentum=0.9, 77 | nesterov=True 78 | ) 79 | print(f"Updated {params.shape} parameters using Triton SGD kernel") 80 | 81 | if __name__ == "__main__": 82 | example() 83 | -------------------------------------------------------------------------------- /day73/code.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import numpy as np 4 | 5 | @triton.jit 6 | 
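# Deterministic DDIM update (eta = 0): first recover x0_hat = (x_t - sqrt(1 - alpha_t) * eps) / sqrt(alpha_t),
# then step back with x_{t-1} = sqrt(alpha_t_prev) * x0_hat + sqrt(1 - alpha_t_prev) * eps.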
def ddim_step_kernel( 7 | x_ptr, 8 | eps_ptr, 9 | out_ptr, 10 | alpha_t: tl.constexpr, 11 | alpha_t_prev: tl.constexpr, 12 | n_elements: tl.constexpr, 13 | BLOCK_SIZE: tl.constexpr = 1024 14 | ): 15 | pid = tl.program_id(0) 16 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 17 | mask = offsets < n_elements 18 | 19 | x = tl.load(x_ptr + offsets, mask=mask) 20 | eps = tl.load(eps_ptr + offsets, mask=mask) 21 | 22 | sqrt_alpha_t = tl.sqrt(alpha_t) 23 | sqrt_one_minus_alpha_t = tl.sqrt(1 - alpha_t) 24 | sqrt_alpha_t_prev = tl.sqrt(alpha_t_prev) 25 | sqrt_one_minus_alpha_t_prev = tl.sqrt(1 - alpha_t_prev) 26 | 27 | x0 = (x - sqrt_one_minus_alpha_t * eps) / sqrt_alpha_t 28 | new_x = sqrt_alpha_t_prev * x0 + sqrt_one_minus_alpha_t_prev * eps 29 | 30 | tl.store(out_ptr + offsets, new_x, mask=mask) 31 | 32 | def ddim_sampling_step(x: np.ndarray, eps: np.ndarray, alpha_t: float, alpha_t_prev: float): 33 | x = np.ascontiguousarray(x.astype(np.float32)) 34 | eps = np.ascontiguousarray(eps.astype(np.float32)) 35 | out = np.empty_like(x) 36 | 37 | n_elements = x.size 38 | grid = (triton.cdiv(n_elements, 1024),) 39 | 40 | ddim_step_kernel[grid]( 41 | x_ptr=x, 42 | eps_ptr=eps, 43 | out_ptr=out, 44 | alpha_t=alpha_t, 45 | alpha_t_prev=alpha_t_prev, 46 | n_elements=n_elements, 47 | BLOCK_SIZE=1024 48 | ) 49 | return out 50 | 51 | if __name__ == '__main__': 52 | N = 4096 53 | x = np.random.randn(N).astype(np.float32) 54 | eps = np.random.randn(N).astype(np.float32) 55 | alpha_t = 0.9 56 | alpha_t_prev = 0.85 57 | 58 | x_prev = ddim_sampling_step(x, eps, alpha_t, alpha_t_prev) 59 | print("Updated sample:", x_prev) 60 | -------------------------------------------------------------------------------- /day74/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def relu_device_fn(x): 7 | return tl.maximum(0.0, x) 8 | 9 | @triton.jit 10 | def swish_device_fn(x): 11 | return x * tl.sigmoid(x) 12 | 13 | @triton.jit 14 | def gelu_device_fn(x): 15 | return 0.5 * x * (1.0 + tl.tanh(0.7978845608 * (x + 0.044715 * x * x * x))) 16 | 17 | def create_activation_kernel(device_fn): 18 | @triton.jit 19 | def kernel(x_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): 20 | pid = tl.program_id(axis=0) 21 | block_start = pid * BLOCK_SIZE 22 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 23 | mask = offsets < n_elements 24 | x = tl.load(x_ptr + offsets, mask=mask) 25 | output = device_fn(x) 26 | tl.store(output_ptr + offsets, output, mask=mask) 27 | return kernel 28 | 29 | def create_activation_function(kernel, name): 30 | @triton.autotune( 31 | configs=[ 32 | triton.Config({'BLOCK_SIZE': 128}), 33 | triton.Config({'BLOCK_SIZE': 256}), 34 | triton.Config({'BLOCK_SIZE': 512}), 35 | triton.Config({'BLOCK_SIZE': 1024}), 36 | ], 37 | key=['n_elements'], 38 | ) 39 | def activation_fn(x): 40 | n_elements = x.numel() 41 | output = torch.empty_like(x) 42 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 43 | kernel[grid]( 44 | x.data_ptr(), 45 | output.data_ptr(), 46 | n_elements, 47 | ) 48 | return output 49 | activation_fn.__name__ = name 50 | return activation_fn 51 | 52 | relu_kernel = create_activation_kernel(relu_device_fn) 53 | swish_kernel = create_activation_kernel(swish_device_fn) 54 | gelu_kernel = create_activation_kernel(gelu_device_fn) 55 | 56 | relu = create_activation_function(relu_kernel, "relu") 57 | swish = create_activation_function(swish_kernel, 
"swish") 58 | gelu = create_activation_function(gelu_kernel, "gelu") 59 | 60 | def example(): 61 | x = torch.randn(1024, 1024, device='cuda') 62 | y_relu = relu(x) 63 | y_swish = swish(x) 64 | y_gelu = gelu(x) 65 | print(f"Input shape: {x.shape}") 66 | print(f"ReLU output shape: {y_relu.shape}") 67 | print(f"Swish output shape: {y_swish.shape}") 68 | print(f"GELU output shape: {y_gelu.shape}") 69 | torch_relu = torch.nn.functional.relu(x) 70 | torch_gelu = torch.nn.functional.gelu(x) 71 | torch_swish = torch.nn.functional.silu(x) 72 | print(f"ReLU max error: {(y_relu - torch_relu).abs().max().item()}") 73 | print(f"Swish max error: {(y_swish - torch_swish).abs().max().item()}") 74 | print(f"GELU max error: {(y_gelu - torch_gelu).abs().max().item()}") 75 | 76 | if __name__ == "__main__": 77 | example() 78 | -------------------------------------------------------------------------------- /day76/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def moe_kernel( 7 | input_ptr, 8 | gate_weight_ptr, 9 | experts_ptr, 10 | output_ptr, 11 | num_tokens, 12 | hidden_size, 13 | num_experts, 14 | top_k, 15 | input_token_stride, 16 | input_hidden_stride, 17 | expert_stride, 18 | expert_hidden_stride, 19 | BLOCK_SIZE: tl.constexpr, 20 | ): 21 | token_idx = tl.program_id(0) 22 | if token_idx >= num_tokens: 23 | return 24 | 25 | input_offset = token_idx * input_token_stride 26 | input = tl.load(input_ptr + input_offset + tl.arange(0, BLOCK_SIZE) * input_hidden_stride, 27 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 28 | 29 | gate_logits = tl.zeros((num_experts,), dtype=tl.float32) 30 | for expert in range(num_experts): 31 | gate_w = tl.load(gate_weight_ptr + expert * hidden_size + tl.arange(0, BLOCK_SIZE), 32 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 33 | logit = tl.sum(input * gate_w) 34 | gate_logits = tl.store(gate_logits + expert, logit) 35 | 36 | max_logit = tl.max(gate_logits) 37 | exp_logits = tl.exp(gate_logits - max_logit) 38 | sum_exp = tl.sum(exp_logits) 39 | probs = exp_logits / sum_exp 40 | 41 | topk_values, topk_indices = tl.topk(probs, top_k) 42 | 43 | output = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) 44 | for i in range(top_k): 45 | expert_idx = topk_indices[i] 46 | weight = topk_values[i] 47 | 48 | expert_offset = expert_idx * expert_stride 49 | expert_output = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) 50 | for j in range(hidden_size): 51 | w = tl.load(experts_ptr + expert_offset + j * expert_hidden_stride + tl.arange(0, BLOCK_SIZE), 52 | mask=tl.arange(0, BLOCK_SIZE) < hidden_size, other=0.0) 53 | expert_output += input[j] * w 54 | 55 | output += weight * expert_output 56 | 57 | tl.store(output_ptr + token_idx * input_token_stride + tl.arange(0, BLOCK_SIZE) * input_hidden_stride, 58 | output, mask=tl.arange(0, BLOCK_SIZE) < hidden_size) 59 | 60 | def moe_layer(input: torch.Tensor, gate: torch.Tensor, experts: torch.Tensor, top_k: int): 61 | assert experts.shape[0] >= top_k, "Number of experts must be >= top_k" 62 | output = torch.empty_like(input) 63 | hidden_size = input.size(1) 64 | num_tokens = input.size(0) 65 | num_experts = gate.size(1) 66 | 67 | # Ensure block size is a power of two for optimal performance 68 | BLOCK_SIZE = triton.next_power_of_2(hidden_size) 69 | if BLOCK_SIZE > 4096: 70 | BLOCK_SIZE = 4096 71 | 72 | moe_kernel[(num_tokens,)]( 73 | input_ptr=input, 74 | gate_weight_ptr=gate, 75 | experts_ptr=experts, 76 
| output_ptr=output, 77 | num_tokens=num_tokens, 78 | hidden_size=hidden_size, 79 | num_experts=num_experts, 80 | top_k=top_k, 81 | input_token_stride=input.stride(0), 82 | input_hidden_stride=input.stride(1), 83 | expert_stride=experts.stride(0), 84 | expert_hidden_stride=experts.stride(2), 85 | BLOCK_SIZE=BLOCK_SIZE, 86 | ) 87 | return output -------------------------------------------------------------------------------- /day77/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | import time 5 | 6 | # A simplified RetNet kernel: a decaying cumulative sum. 7 | # Given an input sequence x and a decay factor alpha, 8 | # it computes: y[0] = x[0] and for i>0, y[i] = x[i] + alpha * y[i-1] 9 | # Note: This kernel assumes the sequence length (N) is known at compile-time. 10 | @triton.jit 11 | def retnet_kernel(x_ptr, y_ptr, N: tl.constexpr, alpha: tl.constexpr): 12 | # we use a single program (grid = (1,)) to process the full sequence sequentially. 13 | acc = tl.zeros([1], dtype=tl.float32) 14 | # Process each element in sequence. 15 | for i in range(N): 16 | # Load the i-th element from input. 17 | x_val = tl.load(x_ptr + i) 18 | # Compute the recurrent relation. 19 | acc = x_val + alpha * acc 20 | # Store the result. 21 | tl.store(y_ptr + i, acc) 22 | 23 | # A CPU reference implementation for testing correctness and timing. 24 | def retnet_cpu(x, alpha): 25 | y = torch.empty_like(x) 26 | acc = 0.0 27 | for i in range(x.shape[0]): 28 | acc = x[i].item() + alpha * acc 29 | y[i] = acc 30 | return y 31 | 32 | def main(): 33 | # Parameters 34 | N = 1024 # Sequence length (must match the kernel compile-time constant) 35 | alpha = 0.9 36 | # Create a random input tensor on the GPU. 37 | x = torch.randn(N, device='cuda', dtype=torch.float32) 38 | y = torch.empty_like(x) 39 | 40 | # Define a grid that launches one program instance (since the kernel is sequential). 41 | grid = lambda meta: (1,) 42 | 43 | # Warm-up: launch the kernel once to compile and warm up. 44 | retnet_kernel[grid](x, y, N, alpha) 45 | torch.cuda.synchronize() 46 | 47 | # Time the Triton kernel using CUDA events. 48 | start_event = torch.cuda.Event(enable_timing=True) 49 | end_event = torch.cuda.Event(enable_timing=True) 50 | start_event.record() 51 | retnet_kernel[grid](x, y, N, alpha) 52 | end_event.record() 53 | torch.cuda.synchronize() 54 | triton_time = start_event.elapsed_time(end_event) # milliseconds 55 | 56 | # Run the CPU version for comparison. 57 | x_cpu = x.cpu() 58 | start = time.time() 59 | y_cpu = retnet_cpu(x_cpu, alpha) 60 | cpu_time = (time.time() - start) * 1000 # convert to ms 61 | 62 | # Verify correctness. 
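The sequential recurrence above, y[i] = x[i] + alpha * y[i-1], can also be cross-checked against its closed form y[i] = sum over j <= i of alpha^(i-j) * x[j]. Below is a minimal sketch of that check, assuming a dense (N, N) decay matrix is acceptable for N = 1024; the helper name `retnet_closed_form` is an illustrative assumption, not part of the original file.

```python
import torch

def retnet_closed_form(x: torch.Tensor, alpha: float) -> torch.Tensor:
    # y[i] = sum_{j <= i} alpha**(i - j) * x[j]: one matmul with a
    # lower-triangular decay matrix instead of the sequential loop.
    n = x.shape[0]
    i = torch.arange(n, device=x.device).unsqueeze(1)   # (N, 1) row index
    j = torch.arange(n, device=x.device).unsqueeze(0)   # (1, N) column index
    exponent = (i - j).clamp(min=0).float()             # avoid negative powers
    decay = torch.where(j <= i, alpha ** exponent,
                        torch.zeros((), device=x.device))
    return decay @ x

# e.g. torch.allclose(retnet_closed_form(x, alpha), y, atol=1e-4)
```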
63 | y_ref = y_cpu.to(device='cuda') 64 | if torch.allclose(y, y_ref, atol=1e-5): 65 | print("Results match.") 66 | else: 67 | print("Results differ!") 68 | 69 | print("Triton kernel time (ms):", triton_time) 70 | print("CPU cumulative sum time (ms):", cpu_time) 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /day79/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def quantize_kernel(input_ptr, output_ptr, n_elements, scale, BLOCK_SIZE: tl.constexpr): 7 | 8 | pid = tl.program_id(0) 9 | block_start = pid * BLOCK_SIZE 10 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 11 | 12 | mask = offsets < n_elements 13 | 14 | x = tl.load(input_ptr + offsets, mask=mask) 15 | 16 | x_scaled = x * scale 17 | 18 | x_rounded = tl.round(x_scaled) 19 | 20 | x_clamped = tl.max(tl.min(x_rounded, 127), -128) 21 | 22 | tl.store(output_ptr + offsets, tl.cast(x_clamped, tl.int8), mask=mask) 23 | 24 | def quantize(input_tensor, scale): 25 | 26 | assert input_tensor.is_cuda, "Input tensor must be on a CUDA device" 27 | n_elements = input_tensor.numel() 28 | output_tensor = torch.empty_like(input_tensor, dtype=torch.int8) 29 | 30 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 31 | quantize_kernel[grid](input_tensor, output_tensor, n_elements, scale, BLOCK_SIZE=1024) 32 | 33 | return output_tensor 34 | 35 | if __name__ == '__main__': 36 | 37 | input_tensor = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32) 38 | scale = 127.0 39 | output_tensor = quantize(input_tensor, scale) 40 | print("Quantization complete. Output tensor:") 41 | print(output_tensor) 42 | -------------------------------------------------------------------------------- /day80/kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | @triton.jit 6 | def rwkv_kernel( 7 | output_ptr, 8 | k_ptr, 9 | v_ptr, 10 | w_ptr, 11 | n_time: tl.constexpr, 12 | n_channels: tl.constexpr, 13 | stride_time: tl.constexpr, 14 | stride_batch: tl.constexpr 15 | ): 16 | pid = tl.program_id(0) 17 | batch = pid // n_channels 18 | channel = pid % n_channels 19 | 20 | w = tl.load(w_ptr + channel) 21 | 22 | max_val = -1e30 23 | numerator = 0.0 24 | denominator = 0.0 25 | 26 | for t in range(n_time): 27 | offset = batch * stride_batch + t * stride_time + channel 28 | 29 | cur_k = tl.load(k_ptr + offset) 30 | cur_v = tl.load(v_ptr + offset) 31 | 32 | m = tl.maximum(max_val, cur_k) 33 | 34 | exp_max_diff = tl.exp(max_val - m) 35 | exp_k_diff = tl.exp(cur_k - m) 36 | 37 | numerator = numerator * exp_max_diff + cur_v * exp_k_diff 38 | denominator = denominator * exp_max_diff + exp_k_diff 39 | 40 | result = numerator / denominator 41 | tl.store(output_ptr + offset, result) 42 | 43 | max_val = m + w 44 | 45 | def rwkv_forward(k: torch.Tensor, v: torch.Tensor, w: torch.Tensor) -> torch.Tensor: 46 | assert k.is_cuda and v.is_cuda and w.is_cuda, "All tensors must be on CUDA." 
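For checking `rwkv_forward`, a plain-PyTorch reference that mirrors the streaming max-trick recurrence of `rwkv_kernel` above can be useful: it is vectorised over batch and channel, with an explicit Python loop over time. The function name `rwkv_forward_reference` is an illustrative assumption; it is a sketch for verification, not part of the original file.

```python
import torch

def rwkv_forward_reference(k: torch.Tensor, v: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Mirrors the per-(batch, channel) recurrence of rwkv_kernel, step by step.
    B, T, C = k.shape
    out = torch.empty_like(v)
    max_val = torch.full((B, C), -1e30, device=k.device, dtype=k.dtype)
    num = torch.zeros((B, C), device=k.device, dtype=k.dtype)
    den = torch.zeros((B, C), device=k.device, dtype=k.dtype)
    for t in range(T):
        cur_k, cur_v = k[:, t, :], v[:, t, :]
        m = torch.maximum(max_val, cur_k)          # running stabiliser
        exp_max = torch.exp(max_val - m)
        exp_k = torch.exp(cur_k - m)
        num = num * exp_max + cur_v * exp_k
        den = den * exp_max + exp_k
        out[:, t, :] = num / den
        max_val = m + w                            # w (C,) broadcasts over (B, C)
    return out

# e.g. torch.allclose(rwkv_forward(k, v, w), rwkv_forward_reference(k, v, w), atol=1e-5)
```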
47 | B, T, C = k.shape 48 | 49 | output = torch.empty_like(v) 50 | 51 | stride_time = k.stride(1) 52 | stride_batch = k.stride(0) 53 | 54 | grid = (B * C,) 55 | 56 | rwkv_kernel[grid]( 57 | output_ptr=output, 58 | k_ptr=k, 59 | v_ptr=v, 60 | w_ptr=w, 61 | n_time=T, 62 | n_channels=C, 63 | stride_time=stride_time, 64 | stride_batch=stride_batch, 65 | ) 66 | return output 67 | 68 | if __name__ == '__main__': 69 | B = 2 # batch size 70 | T = 128 # sequence length 71 | C = 256 # number of channels 72 | 73 | k_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32) 74 | v_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32) 75 | w_tensor = torch.randn(C, device='cuda', dtype=torch.float32) * 0.1 76 | 77 | output_tensor = rwkv_forward(k_tensor, v_tensor, w_tensor) 78 | print("Output shape:", output_tensor.shape) 79 | print("Output sample:", output_tensor[0, :5, :5]) 80 | -------------------------------------------------------------------------------- /day81/main.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | 4 | @triton.jit 5 | def fused_layernorm_ff_dropout_kernel( 6 | x_ptr, out_ptr, 7 | gamma_ptr, beta_ptr, 8 | weight1_ptr, bias1_ptr, 9 | weight2_ptr, bias2_ptr, 10 | seed, 11 | dropout_p: tl.constexpr, 12 | N: tl.constexpr, 13 | M: tl.constexpr, 14 | BLOCK: tl.constexpr 15 | ): 16 | row_idx = tl.program_id(0) 17 | row_offset = row_idx * N 18 | 19 | x = tl.load(x_ptr + row_offset + tl.arange(0, N)) 20 | mean = tl.sum(x, axis=0) / N 21 | diff = x - mean 22 | var = tl.sum(diff * diff, axis=0) / N 23 | norm = diff * tl.rsqrt(var + 1e-5) 24 | 25 | gamma = tl.load(gamma_ptr + tl.arange(0, N)) 26 | beta = tl.load(beta_ptr + tl.arange(0, N)) 27 | norm = norm * gamma + beta 28 | 29 | hidden = tl.zeros([M], dtype=x.dtype) 30 | for i in range(0, N, BLOCK): 31 | block_range = i + tl.arange(0, BLOCK) 32 | norm_block = norm[block_range] 33 | weight1_block = tl.load( 34 | weight1_ptr + i * M + tl.arange(0, BLOCK)[:, None] * M + tl.arange(0, M), 35 | mask=(i + tl.arange(0, BLOCK))[:, None] < N, other=0.0 36 | ) 37 | hidden += tl.dot(norm_block, weight1_block) 38 | 39 | bias1 = tl.load(bias1_ptr + tl.arange(0, M)) 40 | hidden += bias1 41 | 42 | SQRT_2_OVER_PI = 0.7978845608028654 43 | gelu_hidden = 0.5 * hidden * (1.0 + tl.tanh(SQRT_2_OVER_PI * (hidden + 0.044715 * hidden * hidden * hidden))) 44 | 45 | prng = tl.arange(0, M) + row_idx * M + seed 46 | rand_vals = ((1103515245 * prng + 12345) & 0x7fffffff) / 2147483647.0 47 | dropout_mask = rand_vals > dropout_p 48 | dropout_scale = 1.0 / (1.0 - dropout_p) 49 | dropped = gelu_hidden * dropout_mask * dropout_scale 50 | 51 | out = tl.zeros([N], dtype=x.dtype) 52 | for j in range(0, M, BLOCK): 53 | block_range = j + tl.arange(0, BLOCK) 54 | dropped_block = dropped[block_range] 55 | weight2_block = tl.load( 56 | weight2_ptr + j * N + tl.arange(0, BLOCK)[:, None] * N + tl.arange(0, N), 57 | mask=(j + tl.arange(0, BLOCK))[:, None] < M, other=0.0 58 | ) 59 | out += tl.dot(dropped_block, weight2_block) 60 | 61 | bias2 = tl.load(bias2_ptr + tl.arange(0, N)) 62 | out += bias2 63 | 64 | tl.store(out_ptr + row_offset + tl.arange(0, N), out) 65 | -------------------------------------------------------------------------------- /day82/rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | import math 5 | 6 | @triton.jit 7 | def rope_kernel(q_ptr, cos_ptr, 
sin_ptr, stride_q0, stride_q1, stride_cos0, stride_cos1, seq_len: tl.constexpr, head_half: tl.constexpr, BLOCK_SEQ: tl.constexpr, BLOCK_HD: tl.constexpr): 8 | 9 | pid_seq = tl.program_id(0) 10 | pid_hd = tl.program_id(1) 11 | 12 | seq_offset = pid_seq * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ) 13 | hd_offset = pid_hd * BLOCK_HD + tl.arange(0, BLOCK_HD) 14 | 15 | mask_seq = seq_offset < seq_len 16 | mask_hd = hd_offset < head_half 17 | 18 | q_ptrs = q_ptr + seq_offset[:, None] * stride_q0 + hd_offset[None, :] * (2 * stride_q1) 19 | 20 | q0 = tl.load(q_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 21 | q1 = tl.load(q_ptrs + stride_q1, mask=mask_seq[:, None] & mask_hd[None, :]) 22 | 23 | cos_ptrs = cos_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1 24 | sin_ptrs = sin_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1 25 | 26 | cos_val = tl.load(cos_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 27 | sin_val = tl.load(sin_ptrs, mask=mask_seq[:, None] & mask_hd[None, :]) 28 | 29 | out0 = q0 * cos_val - q1 * sin_val 30 | out1 = q0 * sin_val + q1 * cos_val 31 | 32 | tl.store(q_ptrs, out0, mask=mask_seq[:, None] & mask_hd[None, :]) 33 | tl.store(q_ptrs + stride_q1, out1, mask=mask_seq[:, None] & mask_hd[None, :]) 34 | 35 | def apply_rope(q, cos, sin, BLOCK_SEQ=64, BLOCK_HD=32): 36 | 37 | seq_len, head_dim = q.shape 38 | assert head_dim % 2 == 0 39 | head_half = head_dim // 2 40 | 41 | grid = ((seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ, (head_half + BLOCK_HD - 1) // BLOCK_HD) 42 | 43 | q_contig = q.contiguous() 44 | 45 | rope_kernel[grid](q_contig, cos, sin, q_contig.stride(0), q_contig.stride(1), cos.stride(0), cos.stride(1), seq_len, head_half, BLOCK_SEQ, BLOCK_HD) 46 | return q_contig 47 | 48 | if __name__ == "__main__": 49 | torch.manual_seed(0) 50 | device = 'cuda' 51 | 52 | seq_len = 128 53 | head_dim = 64 54 | 55 | q = torch.randn(seq_len, head_dim, device=device, dtype=torch.float32) 56 | 57 | positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1) 58 | dim_idx = torch.arange(head_dim // 2, device=device, dtype=torch.float32).unsqueeze(0) 59 | inv_freq = 1.0 / (10000 ** (dim_idx / (head_dim // 2))) 60 | theta = positions * inv_freq 61 | 62 | cos = torch.cos(theta) 63 | sin = torch.sin(theta) 64 | 65 | q_transformed = apply_rope(q, cos, sin) 66 | print("Transformed q:") 67 | print(q_transformed) 68 | -------------------------------------------------------------------------------- /day84/kernel.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import torch 4 | 5 | @triton.jit 6 | def fp8_gemm_kernel( 7 | a_ptr, b_ptr, c_ptr, 8 | M, N, K, 9 | stride_am, stride_ak, 10 | stride_bk, stride_bn, 11 | stride_cm, stride_cn, 12 | scale_a, scale_b, scale_c, 13 | BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr 14 | ): 15 | pid_m = tl.program_id(0) 16 | pid_n = tl.program_id(1) 17 | 18 | rm = tl.arange(0, BLOCK_M) 19 | rn = tl.arange(0, BLOCK_N) 20 | offm = pid_m * BLOCK_M + rm 21 | offn = pid_n * BLOCK_N + rn 22 | 23 | acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) 24 | 25 | for k in range(0, K, BLOCK_K): 26 | offk = k + tl.arange(0, BLOCK_K) 27 | 28 | a = tl.load( 29 | a_ptr + offm[:, None] * stride_am + offk[None, :] * stride_ak, 30 | mask=(offm[:, None] < M) & (offk[None, :] < K), 31 | other=0, 32 | ) 33 | b = tl.load( 34 | b_ptr + offk[:, None] * stride_bk + offn[None, :] * stride_bn, 35 | mask=(offk[:, None] < 
K) & (offn[None, :] < N), 36 | other=0, 37 | ) 38 | 39 | a_fp32 = tl.cast(a, tl.float32) * scale_a 40 | b_fp32 = tl.cast(b, tl.float32) * scale_b 41 | 42 | acc += tl.dot(a_fp32, b_fp32) 43 | 44 | c_fp8 = tl.round(acc / scale_c) 45 | c_fp8 = tl.max(tl.min(c_fp8, 127), -128) 46 | 47 | tl.store( 48 | c_ptr + offm[:, None] * stride_cm + offn[None, :] * stride_cn, 49 | c_fp8.to(tl.int8), 50 | mask=(offm[:, None] < M) & (offn[None, :] < N) 51 | ) 52 | 53 | def fp8_gemm(a: torch.Tensor, b: torch.Tensor, 54 | scale_a: float, scale_b: float, scale_c: float, 55 | BLOCK_M: int = 64, BLOCK_N: int = 64, BLOCK_K: int = 32) -> torch.Tensor: 56 | assert a.dtype == torch.int8 and b.dtype == torch.int8 57 | M, K = a.shape 58 | K2, N = b.shape 59 | assert K == K2 60 | 61 | c = torch.empty((M, N), device=a.device, dtype=torch.int8) 62 | 63 | grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N)) 64 | 65 | fp8_gemm_kernel[grid]( 66 | a, b, c, 67 | M, N, K, 68 | a.stride(0), a.stride(1), 69 | b.stride(0), b.stride(1), 70 | c.stride(0), c.stride(1), 71 | scale_a, scale_b, scale_c, 72 | BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K 73 | ) 74 | return c 75 | 76 | if __name__ == "__main__": 77 | torch.manual_seed(0) 78 | M, K, N = 128, 256, 64 79 | 80 | a_fp8 = torch.randint(-128, 127, (M, K), device='cuda', dtype=torch.int8) 81 | b_fp8 = torch.randint(-128, 127, (K, N), device='cuda', dtype=torch.int8) 82 | 83 | scale_a, scale_b, scale_c = 0.1, 0.1, 0.05 84 | 85 | c_fp8 = fp8_gemm(a_fp8, b_fp8, scale_a, scale_b, scale_c) 86 | print("GEMM result (FP8 stored as int8):", c_fp8) 87 | -------------------------------------------------------------------------------- /day85/TensorMatMul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | __global__ void tensorMatrixMultKernel( 5 | const float* A, 6 | const float* B, 7 | float* C, 8 | size_t B_dim, 9 | size_t I_dim, 10 | size_t J_dim, 11 | size_t L_dim, 12 | size_t K_dim 13 | ) { 14 | 15 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | 18 | int total_elements = B_dim * I_dim * J_dim * K_dim; 19 | if (idx < total_elements) { 20 | int k = idx % K_dim; 21 | int j = (idx / K_dim) % J_dim; 22 | int i = (idx / (K_dim * J_dim)) % I_dim; 23 | int b = idx / (K_dim * J_dim * I_dim); 24 | 25 | 26 | size_t c_idx = ((b * I_dim + i) * J_dim + j) * K_dim + k; 27 | 28 | 29 | float sum = 0.0f; 30 | 31 | 32 | size_t a_base = ((b * I_dim + i) * J_dim + j) * L_dim; 33 | 34 | for (int l = 0; l < L_dim; l++) { 35 | sum += A[a_base + l] * B[l * K_dim + k]; 36 | } 37 | 38 | C[c_idx] = sum; 39 | } 40 | } 41 | 42 | extern "C" void solution(const float* A, const float* B, float* C, size_t b, size_t i, size_t j, size_t l, size_t k) { 43 | 44 | size_t total_elements = b * i * j * k; 45 | 46 | 47 | int threadsPerBlock = 256; 48 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock; 49 | 50 | 51 | tensorMatrixMultKernel<<>>(A, B, C, b, i, j, l, k); 52 | } -------------------------------------------------------------------------------- /day86/hard_sigmoid.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | __global__ void hard_sigmoid_kernel(const float* input, float* output, size_t total_elements) { 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx >= total_elements) return; 8 | 9 | float x = input[idx]; 10 | if (x <= -3.0f) 11 | output[idx] = 0.0f; 12 | else if (x >= 3.0f) 13 | output[idx] = 1.0f; 14 | else 15 | 
output[idx] = (x + 3.0f) / 6.0f; 16 | } 17 | 18 | 19 | extern "C" void solution(const float* input, float* output, size_t n, size_t m) { 20 | 21 | size_t total_elements = n * m; 22 | 23 | const int threadsPerBlock = 256; 24 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock; 25 | 26 | hard_sigmoid_kernel<<>>(input, output, total_elements); 27 | 28 | 29 | } 30 | -------------------------------------------------------------------------------- /day87/SymMatMul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define BLOCK_SIZE 16 4 | 5 | __global__ void matrixMulKernel(const float* A, const float* B, float* C, size_t n) { 6 | size_t row = blockIdx.y * blockDim.y + threadIdx.y; 7 | size_t col = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (row < n && col < n) { 10 | float sum = 0.0f; 11 | for (size_t k = 0; k < n; k++) { 12 | sum += A[row * n + k] * B[k * n + col]; 13 | } 14 | C[row * n + col] = sum; 15 | } 16 | } 17 | 18 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) { 19 | dim3 block(BLOCK_SIZE, BLOCK_SIZE); 20 | dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE); 21 | 22 | matrixMulKernel<<>>(input_a, input_b, output_c, n); 23 | 24 | 25 | cudaDeviceSynchronize(); 26 | } 27 | -------------------------------------------------------------------------------- /day88/MSE.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) { 5 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 6 | if (idx < numElements) { 7 | float diff = predictions[idx] - targets[idx]; 8 | float sq_diff = diff * diff; 9 | 10 | atomicAdd(sum, sq_diff); 11 | } 12 | } 13 | 14 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t* shape, size_t ndim) { 15 | 16 | size_t* hostShape = new size_t[ndim]; 17 | cudaMemcpy(hostShape, shape, ndim * sizeof(size_t), cudaMemcpyDeviceToHost); 18 | 19 | size_t numElements = 1; 20 | for (size_t i = 0; i < ndim; i++) { 21 | numElements *= hostShape[i]; 22 | } 23 | delete[] hostShape; 24 | 25 | 26 | float init = 0.0f; 27 | cudaMemcpy(output, &init, sizeof(float), cudaMemcpyHostToDevice); 28 | 29 | 30 | int threadsPerBlock = 256; 31 | int blocks = (numElements + threadsPerBlock - 1) / threadsPerBlock; 32 | mseKernel<<>>(predictions, targets, numElements, output); 33 | cudaDeviceSynchronize(); 34 | 35 | float hostSum = 0.0f; 36 | cudaMemcpy(&hostSum, output, sizeof(float), cudaMemcpyDeviceToHost); 37 | 38 | float mse = hostSum / numElements; 39 | 40 | cudaMemcpy(output, &mse, sizeof(float), cudaMemcpyHostToDevice); 41 | } 42 | -------------------------------------------------------------------------------- /day89/LTMM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define BLOCK_SIZE 16 5 | 6 | 7 | __global__ 8 | void lowerTriangularMultiplyKernel(const float* A, const float* B, float* C, size_t n) { 9 | int row = blockIdx.y * blockDim.y + threadIdx.y; 10 | int col = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | if (row < n && col < n) { 13 | if (col > row) { 14 | C[row * n + col] = 0.0f; 15 | } else { 16 | float sum = 0.0f; 17 | for (int k = col; k <= row; k++) { 18 | sum += A[row * n + k] * B[k * n + col]; 19 | } 20 | C[row * n + col] = sum; 21 
| } 22 | } 23 | } 24 | 25 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) { 26 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); 27 | dim3 gridDim((n + blockDim.x - 1) / blockDim.x, 28 | (n + blockDim.y - 1) / blockDim.y); 29 | 30 | lowerTriangularMultiplyKernel<<<gridDim, blockDim>>>(input_a, input_b, output_c, n); 31 | 32 | cudaDeviceSynchronize(); 33 | } 34 | -------------------------------------------------------------------------------- /day90/FrobeniusNorm.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | 4 | __global__ void calculateSumOfSquares(const float* X, float* partialSums, size_t size) { 5 | extern __shared__ float sharedData[]; 6 | 7 | 8 | unsigned int tid = threadIdx.x; 9 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | 12 | sharedData[tid] = 0.0f; 13 | 14 | 15 | while (i < size) { 16 | sharedData[tid] += X[i] * X[i]; 17 | i += blockDim.x * gridDim.x; 18 | } 19 | 20 | 21 | __syncthreads(); 22 | 23 | 24 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { 25 | if (tid < s) { 26 | sharedData[tid] += sharedData[tid + s]; 27 | } 28 | __syncthreads(); 29 | } 30 | 31 | 32 | if (tid == 0) { 33 | partialSums[blockIdx.x] = sharedData[0]; 34 | } 35 | } 36 | 37 | 38 | __global__ void normalizeByFrobeniusNorm(const float* X, float* Y, size_t size, float frobeniusNorm) { 39 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 40 | 41 | if (i < size) { 42 | Y[i] = X[i] / frobeniusNorm; 43 | } 44 | } 45 | 46 | extern "C" void solution(const float* X, float* Y, size_t size) { 47 | 48 | int blockSize = 256; 49 | int gridSize = (size + blockSize - 1) / blockSize; 50 | int maxBlocks = 1024; 51 | 52 | if (gridSize > maxBlocks) { 53 | gridSize = maxBlocks; 54 | } 55 | 56 | float* d_partialSums; 57 | cudaMalloc(&d_partialSums, gridSize * sizeof(float)); 58 | 59 | calculateSumOfSquares<<<gridSize, blockSize, blockSize * sizeof(float)>>>(X, d_partialSums, size); 60 | 61 | float* h_partialSums = new float[gridSize]; 62 | cudaMemcpy(h_partialSums, d_partialSums, gridSize * sizeof(float), cudaMemcpyDeviceToHost); 63 | 64 | float sumOfSquares = 0.0f; 65 | for (int i = 0; i < gridSize; i++) { 66 | sumOfSquares += h_partialSums[i]; 67 | } 68 | 69 | float frobeniusNorm = sqrt(sumOfSquares); 70 | 71 | if (frobeniusNorm < 1e-10) { 72 | frobeniusNorm = 1.0f; 73 | } 74 | 75 | normalizeByFrobeniusNorm<<<(size + blockSize - 1) / blockSize, blockSize>>>(X, Y, size, frobeniusNorm); 76 | 77 | delete[] h_partialSums; 78 | cudaFree(d_partialSums); 79 | } -------------------------------------------------------------------------------- /day91/Hinge_Loss.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | 3 | 4 | __global__ void hingeKernel(const float* predictions, const float* targets, float* output, size_t n) { 5 | 6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | 8 | if (idx < n) { 9 | float prod = predictions[idx] * targets[idx]; 10 | output[idx] = fmaxf(0.0f, 1.0f - prod); 11 | } 12 | } 13 | 14 | 15 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t n) { 16 | // I found this to be the best configuration for the kernel (H100) 17 | const int blockSize = 256; 18 | const int gridSize = (n + blockSize - 1) / blockSize; 19 | 20 | hingeKernel<<<gridSize, blockSize>>>(predictions, targets, output, n); 21 | 22 | 23 | } -------------------------------------------------------------------------------- /day92/1D_Convolution.cu:
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ 4 | void conv1d(const float* A, 5 | const float* B, 6 | float* C, 7 | size_t N, 8 | size_t K) 9 | { 10 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; 11 | int radius = int(K/2); 12 | 13 | if (i < N) { 14 | float sum = 0.0f; 15 | for (int j = 0; j < int(K); ++j) { 16 | int idx = int(i) + j - radius; 17 | if (idx >= 0 && idx < int(N)) { 18 | sum += A[idx] * B[j]; 19 | } 20 | } 21 | C[i] = sum; 22 | } 23 | } 24 | 25 | extern "C" 26 | void solution(const float* A, 27 | const float* B, 28 | float* C, 29 | size_t N, 30 | size_t K) 31 | { 32 | int threads = 1024; 33 | int blocks = int((N + threads - 1) / threads); 34 | 35 | conv1d<<>>(A, B, C, N, K); 36 | } 37 | -------------------------------------------------------------------------------- /day93/RMS_Normalization.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define EPSILON 1e-5f 5 | 6 | __global__ void compute_rms(const float* X, float* rms, size_t B, size_t N) { 7 | extern __shared__ float sdata[]; 8 | size_t row = blockIdx.x; 9 | size_t tid = threadIdx.x; 10 | const float* row_ptr = X + row * N; 11 | 12 | float sum = 0.0f; 13 | for (size_t i = tid; i < N; i += blockDim.x) { 14 | float v = row_ptr[i]; 15 | sum += v * v; 16 | } 17 | sdata[tid] = sum; 18 | __syncthreads(); 19 | 20 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { 21 | if (tid < s) { 22 | sdata[tid] += sdata[tid + s]; 23 | } 24 | __syncthreads(); 25 | } 26 | 27 | if (tid == 0) { 28 | float mean_sq = sdata[0] / static_cast(N); 29 | rms[row] = sqrtf(mean_sq + EPSILON); 30 | } 31 | } 32 | 33 | __global__ void normalize_rms(const float* X, float* Y, const float* rms, size_t B, size_t N) { 34 | size_t row = blockIdx.x; 35 | size_t tid = threadIdx.x; 36 | float r = rms[row]; 37 | const float* row_in = X + row * N; 38 | float* row_out = Y + row * N; 39 | 40 | for (size_t i = tid; i < N; i += blockDim.x) { 41 | row_out[i] = row_in[i] / r; 42 | } 43 | } 44 | 45 | extern "C" void solution(const float* X, float* Y, size_t B, size_t N) { 46 | int threads = (N < 256) ? 
int(N) : 256; 47 | size_t shared_mem_size = threads * sizeof(float); 48 | 49 | float* d_rms = nullptr; 50 | cudaMalloc(&d_rms, B * sizeof(float)); 51 | 52 | compute_rms<<<B, threads, shared_mem_size>>>(X, d_rms, B, N); 53 | 54 | normalize_rms<<<B, threads>>>(X, Y, d_rms, B, N); 55 | 56 | cudaFree(d_rms); 57 | } 58 | -------------------------------------------------------------------------------- /day94/ELU.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day94/ELU.cu -------------------------------------------------------------------------------- /day95/2D_Max_Pooling.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <cfloat> // for FLT_MAX 3 | #include <cstddef> // for size_t 4 | 5 | 6 | __global__ 7 | void maxpool2d_kernel(const float* __restrict__ input, 8 | int H, int W, 9 | int kernel_size, int stride, int padding, int dilation, 10 | int H_out, int W_out, 11 | float* __restrict__ output) 12 | { 13 | 14 | int out_y = blockIdx.y * blockDim.y + threadIdx.y; 15 | int out_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (out_y >= H_out || out_x >= W_out) return; 17 | 18 | 19 | float max_val = -FLT_MAX; 20 | for (int m = 0; m < kernel_size; ++m) { 21 | int in_y = out_y * stride + m * dilation - padding; 22 | for (int n = 0; n < kernel_size; ++n) { 23 | int in_x = out_x * stride + n * dilation - padding; 24 | 25 | if (in_y >= 0 && in_y < H && in_x >= 0 && in_x < W) { 26 | float v = input[in_y * W + in_x]; 27 | if (v > max_val) max_val = v; 28 | } 29 | } 30 | } 31 | output[out_y * W_out + out_x] = max_val; 32 | } 33 | 34 | 35 | extern "C" 36 | void solution(const float* input, 37 | int kernel_size, 38 | int stride, 39 | int padding, 40 | int dilation, 41 | float* output, 42 | size_t H, 43 | size_t W) 44 | { 45 | 46 | int H_out = (int)(( (int)H + 2*padding 47 | - dilation*(kernel_size-1) 48 | - 1 ) / stride) + 1; 49 | int W_out = (int)(( (int)W + 2*padding 50 | - dilation*(kernel_size-1) 51 | - 1 ) / stride) + 1; 52 | 53 | 54 | const int Bx = 16, By = 16; 55 | dim3 block(Bx, By); 56 | dim3 grid( (W_out + Bx - 1) / Bx, 57 | (H_out + By - 1) / By ); 58 | 59 | maxpool2d_kernel<<<grid, block>>>( 60 | input, 61 | (int)H, (int)W, 62 | kernel_size, stride, padding, dilation, 63 | H_out, W_out, 64 | output 65 | ); 66 | 67 | 68 | } 69 | -------------------------------------------------------------------------------- /day96/Product_Over_Dimension.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <vector> 3 | 4 | 5 | __global__ 6 | void prod_reduce_kernel(const float* __restrict__ input, 7 | float* __restrict__ output, 8 | size_t M, 9 | size_t S_d, 10 | size_t N) 11 | { 12 | 13 | size_t out_idx = blockIdx.x; 14 | 15 | size_t m = out_idx / N; 16 | size_t n = out_idx - m * N; 17 | 18 | 19 | const float* base = input + (m * S_d) * N + n; 20 | 21 | 22 | double prod = 1.0; 23 | for (size_t k = threadIdx.x; k < S_d; k += blockDim.x) { 24 | prod *= static_cast<double>( base[k * N] ); 25 | } 26 | 27 | 28 | constexpr unsigned FULL_MASK = 0xffffffffu; 29 | for (int offset = warpSize/2; offset > 0; offset >>= 1) { 30 | prod *= __shfl_down_sync(FULL_MASK, prod, offset); 31 | } 32 | 33 | 34 | __shared__ double warp_prod[1024/32]; 35 | int lane = threadIdx.x & (warpSize - 1); 36 | int wid = threadIdx.x >> 5; 37 | if (lane == 0) warp_prod[wid] = prod; 38 | __syncthreads(); 39 | 40 | 41 | if (wid == 0) { 42 | double block_prod = (lane < ((blockDim.x+31)/32)) 43 | ?
warp_prod[lane] 44 | : 1.0; 45 | for (int offset = ((blockDim.x+31)/32)/2; offset > 0; offset >>= 1) { 46 | block_prod *= __shfl_down_sync(FULL_MASK, block_prod, offset); 47 | } 48 | if (lane == 0) { 49 | 50 | output[out_idx] = static_cast<float>(block_prod); 51 | } 52 | } 53 | } 54 | 55 | 56 | extern "C" 57 | void solution(const float* input, 58 | int dim, 59 | float* output, 60 | size_t* shape, 61 | size_t ndim) 62 | { 63 | 64 | std::vector<size_t> hshape(ndim); 65 | cudaMemcpy(hshape.data(), shape, ndim*sizeof(size_t), 66 | cudaMemcpyDeviceToHost); 67 | 68 | 69 | size_t M = 1, N = 1; 70 | for (int i = 0; i < dim; ++i) M *= hshape[i]; 71 | for (int i = dim+1; i < (int)ndim; ++i) N *= hshape[i]; 72 | size_t S_d = hshape[dim]; 73 | 74 | size_t total_outputs = M * N; 75 | if (total_outputs == 0 || S_d == 0) return; 76 | 77 | 78 | int blk = 1; 79 | while (blk < (int)S_d && blk < 1024) blk <<= 1; 80 | blk = std::max(blk, 32); 81 | blk = std::min(blk, 1024); 82 | 83 | 84 | dim3 grid( total_outputs ); 85 | dim3 block( blk ); 86 | 87 | prod_reduce_kernel<<<grid, block>>>(input, output, M, S_d, N); 88 | 89 | 90 | } 91 | -------------------------------------------------------------------------------- /day97/elu_optim.cu: -------------------------------------------------------------------------------- 1 | // To optimize the code I used: float4 vectorized loads and stores for the FP32 main loop, a scalar tail loop, branchless selects, and expm1f for the negative branch 2 | 3 | #include <cuda_runtime.h> 4 | #include <math.h> 5 | 6 | #define EXPM1f(x) expm1f(x) 7 | 8 | 9 | 10 | __global__ __launch_bounds__(1024, 4) 11 | void elu_fp16(const float* __restrict__ input, 12 | float* __restrict__ output, 13 | size_t total, 14 | float alpha) { 15 | size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 16 | size_t stride = blockDim.x * gridDim.x; 17 | 18 | 19 | size_t vec8 = (total / 8) * 8; 20 | for (size_t base = tid * 8; base < vec8; base += stride * 8) { 21 | 22 | float4 f0 = __ldg((const float4*)(input + base)); 23 | float4 f1 = __ldg((const float4*)(input + base + 4)); 24 | 25 | 26 | f0.x = f0.x > 0.f ? f0.x : alpha * EXPM1f(f0.x); 27 | f0.y = f0.y > 0.f ? f0.y : alpha * EXPM1f(f0.y); 28 | f0.z = f0.z > 0.f ? f0.z : alpha * EXPM1f(f0.z); 29 | f0.w = f0.w > 0.f ? f0.w : alpha * EXPM1f(f0.w); 30 | 31 | f1.x = f1.x > 0.f ? f1.x : alpha * EXPM1f(f1.x); 32 | f1.y = f1.y > 0.f ? f1.y : alpha * EXPM1f(f1.y); 33 | f1.z = f1.z > 0.f ? f1.z : alpha * EXPM1f(f1.z); 34 | f1.w = f1.w > 0.f ? f1.w : alpha * EXPM1f(f1.w); 35 | 36 | 37 | reinterpret_cast<float4*>(output + base)[0] = f0; 38 | reinterpret_cast<float4*>(output + base)[1] = f1; 39 | } 40 | 41 | for (size_t i = vec8 + tid; i < total; i += stride) { 42 | float x = __ldg(&input[i]); 43 | output[i] = x > 0.f ?
x : alpha * EXPM1f(x); 44 | } 45 | } 46 | 47 | extern "C" void solution(const float* input, float* output, size_t n, size_t m, float alpha) { 48 | size_t total = n * m; 49 | const int threads = 1024; 50 | int blocks = (total / 8 + threads - 1) / threads; 51 | blocks = max(blocks, 320); 52 | blocks = min(blocks, 65535); 53 | 54 | elu_fp16<<<blocks, threads>>>(input, output, total, alpha); 55 | } -------------------------------------------------------------------------------- /notes/offsetcudatriton.md: -------------------------------------------------------------------------------- 1 | I want to talk about how offsets are calculated in CUDA and Triton. 2 | 3 | I will start with CUDA because it is easier to explain, in my opinion: 4 | ```c 5 | __global__ void vectorAdd(const float* A, const float* B, float* C, int N){ 6 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (idx < N){ 8 | C[idx] = A[idx] + B[idx]; 9 | } 10 | } 11 | ``` 12 | 13 | Let's analyze the arguments: 14 | `const float* A` -> a constant pointer of type float to the `A` array 15 | 16 | `const float* B` -> a constant pointer of type float to the `B` array 17 | 18 | `float *C` -> a pointer of type float to the `C` array. Note: `C` is not a pointer to const because we want to modify its elements 19 | 20 | Now let's dive deeper: 21 | `int idx = blockIdx.x * blockDim.x + threadIdx.x` : the block's position in the grid multiplied by the number of threads per block, plus the thread's position within the block 22 | 23 | 24 | Now let's look at the Triton function: 25 | ```python 26 | def __kernelfunction__(input_pointer, output_pointer, N, 27 | BLOCKSIZE: tl.constexpr): 28 | pid = tl.program_id(0) 29 | 30 | offset = pid * BLOCKSIZE + tl.arange(0, BLOCKSIZE) 31 | mask = offset < N 32 | 33 | input_data = tl.load(input_pointer + offset, mask=mask) 34 | output_data = tl.sqrt(input_data) 35 | tl.store(output_pointer + offset, output_data, mask=mask) 36 | ``` 37 | So our `idx` is exactly the `offset` in Triton. 38 | The offset is calculated from the program_id multiplied by the block size, plus the array [0, 1, 2, ..., BLOCKSIZE-1]. 39 | The result can be thought of as an array in which each position is associated with one thread. 40 | -------------------------------------------------------------------------------- /nvidiadocs/addition.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <cuda_runtime.h> 3 | 4 | __global__ void addition(float* A, float* B, float* C){ 5 | int idx = threadIdx.x; 6 | C[idx] = A[idx] + B[idx]; 7 | } 8 | 9 | int main(){ 10 | int N = 10; 11 | addition<<<1,N>>>(A,B,C); // simple addition kernel that will launch N threads 12 | } 13 | 14 | ////////////////////////////////////// 15 | // (x, y) is (x + y Dx); 16 | // (x, y, z) is (x + y Dx + z Dx Dy) 17 | // int i 18 | __global__ void MatAdd(float A[N][N], float B[N][N], 19 | float C[N][N]) 20 | { 21 | int i = threadIdx.x; 22 | int j = threadIdx.y; 23 | C[i][j] = A[i][j] + B[i][j]; 24 | } 25 | 26 | int main() 27 | { 28 | // Kernel invocation with one block of N * N * 1 threads 29 | int numBlocks = 1; // number of blocks 30 | dim3 threadsPerBlock(N, N); // Threads 31 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C); 32 | } 33 | 34 | 35 | ///////////////// 36 | // Kernel definition 37 | __global__ void MatAdd(float A[N][N], float B[N][N], 38 | float C[N][N]) 39 | { 40 | int i = blockIdx.x * blockDim.x + threadIdx.x; 41 | int j = blockIdx.y * blockDim.y + threadIdx.y; 42 | if (i < N && j < N) 43 | C[i][j] = A[i][j] + B[i][j]; 44 | } 45 | 46 | int main() 47 | { 48 | ...
49 | // Kernel invocation 50 | dim3 threadsPerBlock(16, 16); // threadsPerBlock -> how many threads per block 51 | dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y); // number of blocks in the grid 52 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C); 53 | ... 54 | } 55 | //// --------------------------------------------------------------------------------
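To connect the 2D indexing of the MatAdd examples above with the Triton offset pattern from the notes, here is a minimal sketch of the same matrix addition written as a Triton kernel; the kernel and wrapper names and the 16x16 block sizes are illustrative assumptions. The row/column offsets play the role of `blockIdx * blockDim + threadIdx`, and the mask replaces the `if (i < N && j < N)` bounds check.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def matadd_kernel(a_ptr, b_ptr, c_ptr, N,
                  BLOCK_I: tl.constexpr, BLOCK_J: tl.constexpr):
    # Same role as blockIdx.x / blockIdx.y in the CUDA version.
    pid_i = tl.program_id(0)
    pid_j = tl.program_id(1)
    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)       # row indices
    offs_j = pid_j * BLOCK_J + tl.arange(0, BLOCK_J)       # column indices
    mask = (offs_i[:, None] < N) & (offs_j[None, :] < N)   # bounds check
    idx = offs_i[:, None] * N + offs_j[None, :]            # row-major offset
    a = tl.load(a_ptr + idx, mask=mask)
    b = tl.load(b_ptr + idx, mask=mask)
    tl.store(c_ptr + idx, a + b, mask=mask)

def matadd_triton(a: torch.Tensor, b: torch.Tensor,
                  BLOCK_I: int = 16, BLOCK_J: int = 16) -> torch.Tensor:
    N = a.shape[0]
    c = torch.empty_like(a)
    grid = (triton.cdiv(N, BLOCK_I), triton.cdiv(N, BLOCK_J))
    matadd_kernel[grid](a, b, c, N, BLOCK_I=BLOCK_I, BLOCK_J=BLOCK_J)
    return c
```

For square, contiguous float32 tensors on the GPU, `matadd_triton(a, b)` should match `a + b`.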