├── .gitignore
├── Makefile
├── ReadMe.md
├── day01
├── addition.cu
└── printAdd.cu
├── day02
├── function.cu
└── function.py
├── day03
├── addMatrix.cu
├── addMatrix.py
└── anotherMatrix.cu
├── day04
└── layerNorm.cu
├── day05
└── vectorSumTricks.cu
├── day06
├── AdditionKernel
│ ├── additionKernel.cpython-312-x86_64-linux-gnu.so
│ ├── additionKernel.cu
│ ├── additionKernel.egg-info
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ └── top_level.txt
│ ├── additionKernelBinding.cpp
│ ├── additionkernel.cpython-312-x86_64-linux-gnu.so
│ ├── additionkernel.egg-info
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ └── top_level.txt
│ ├── build
│ │ ├── lib.linux-x86_64-cpython-312
│ │ │ ├── additionKernel.cpython-312-x86_64-linux-gnu.so
│ │ │ └── additionkernel.cpython-312-x86_64-linux-gnu.so
│ │ └── temp.linux-x86_64-cpython-312
│ │ │ ├── additionKernel.o
│ │ │ └── additionKernelBinding.o
│ ├── pythontest.py
│ └── setup.py
├── ImportingToPython
│ ├── build
│ │ ├── lib.linux-x86_64-cpython-312
│ │ │ └── example_kernels.cpython-312-x86_64-linux-gnu.so
│ │ └── temp.linux-x86_64-cpython-312
│ │ │ ├── rollcall.o
│ │ │ └── rollcallbinding.o
│ ├── example_kernels.cpython-312-x86_64-linux-gnu.so
│ ├── example_kernels.egg-info
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ └── top_level.txt
│ ├── pythontest.py
│ ├── rollcall.cu
│ ├── rollcallbinding.cpp
│ └── setup.py
├── SMBlocks.cu
├── SoftMax.cu
├── TransposeMatrix.cu
└── note
├── day07
├── conv1d.cu
├── globalMemoryCoalescing.cu
├── matmul.cu
├── naive.cu
└── pythontest.py
├── day08
├── idk.cu
├── pmpbook
│ ├── chapter3ex.cu
│ ├── chapter3matvecmul.cu
│ ├── color2gray.cu
│ ├── deviceinfo.cu
│ ├── imageblur.cu
│ └── vecaddition.cu
└── selfAttention
│ └── selfAttention.cu
├── day09
├── bind.cpp
├── flashAttention.cu
├── flashAttentionFromTut.cu
└── test.py
├── day10
├── FlashAttention.cpp
├── FlashAttention.cu
├── linking
│ ├── simpleKernel.cpp
│ ├── simpleKernel.cu
│ └── test.py
├── ppmbook
│ └── matrixmul.cu
├── setup.py
└── test.py
├── day100
└── delta.cu
├── day11
├── FlashTestPytorch
│ ├── FlashAttention.cu
│ ├── binding.cpp
│ └── test.py
├── LeakyReLU.cu
├── ReLU.cu
├── SoftMax.cu
├── TanH.cu
├── binding.cpp
├── test.py
└── testbackward.py
├── day12
├── NN
│ └── kernels.cu
├── softMax.cu
└── tileMatrix.cu
├── day13
├── RMS.cu
├── RMSBetter.cu
├── binding.cpp
└── test.py
├── day14
├── FA2
│ ├── flash.cu
│ ├── helper.cu
│ ├── helper.cuh
│ ├── kernels.cu
│ └── kernels.cuh
├── FlashAttention2
│ └── kernel.cu
├── cat.jpg
└── conv.cu
├── day15
├── Attention.cu
├── SMM.cu
└── dotproduct.cu
├── day16
├── attentionbwkd.cu
└── test.py
├── day17
├── cublas1.cu
├── cublas2.cu
└── cublas3.cu
├── day18
├── atomic1.cu
├── atomic2.cu
└── wrap.cu
├── day19
└── cublasMM.cu
├── day20
├── rope.cu
└── test_rope.py
├── day21
└── conv.cu
├── day22
├── persistent2.cu
└── persistentKernel.cu
├── day23
├── kernel.ptx
└── main.cu
├── day24
└── GeGLU.cu
├── day25
└── nbody.cu
├── day26
├── gradientdescent.cu
└── gradientdescent.out
├── day27
├── kmeans.cu
└── kmeans.out
├── day28
├── sample.cu
└── test_sample.py
├── day29
└── pi.cu
├── day30
└── kernelHisto.cu
├── day31
└── kernel.cu
├── day32
├── Makefile
└── matmul_kernels
│ ├── kernel_1
│ └── kernel_1.cpp
│ ├── kernel_2
│ └── kernel_2.cpp
│ ├── kernel_3
│ └── kernel_3.cpp
│ └── kernel_rocblas
│ └── kernel_rocblas.cpp
├── day33
└── load_in_pytorch
│ ├── kernel.cpp
│ ├── kernel.so
│ └── test.py
├── day34
└── tensor_lib
│ ├── test1.cpp
│ └── test1.out
├── day35
└── layernorm.cpp
├── day36
└── random.cpp
├── day37
└── MultiStreams
│ ├── MHA.cpp
│ ├── MHA.out
│ ├── notes.md
│ ├── results.copy_stats.csv
│ ├── results.db
│ ├── results.hip_stats.csv
│ ├── results.hsa_stats.csv
│ ├── results.json
│ ├── results.stats.csv
│ └── results.sysinfo.txt
├── day38
└── myreduction.cpp
├── day39
└── advancedcudamm.cu
├── day40
└── flaship.cpp
├── day41
└── MLA.cu
├── day42
├── mat_mul.py
└── mat_mul_2.py
├── day43
└── rope.py
├── day44
├── average_duration_per_block_size.png
├── benchmark_results.csv
├── duration_vs_total_elements.png
└── tritonkernel.py
├── day45
└── cross_entropy
│ └── cross_entropy.py
├── day46
└── flash_attention.py
├── day47
├── hip_cooperative_groups.h
└── kernel.cpp
├── day48
└── kernel.py
├── day49
└── kernel.py
├── day50
└── tritonnn.py
├── day51
└── main.py
├── day52
└── functionsused.py
├── day53
└── layer_norm.py
├── day54
└── softmax.py
├── day55
└── ddpm.py
├── day56
└── main.py
├── day57
└── main.py
├── day58
└── layer_norm.cpp
├── day59
└── test.py
├── day60
└── fused.py
├── day61
└── backprop.py
├── day62
└── main.py
├── day63
└── lstm.py
├── day64
└── main.py
├── day65
└── quant.cpp
├── day66
└── kernel.cpp
├── day67
└── lora.py
├── day68
└── adam.py
├── day69
└── main.py
├── day70
└── gla.py
├── day71
└── main.py
├── day72
└── main.py
├── day73
└── code.py
├── day74
└── kernel.py
├── day75
└── kernel.py
├── day76
└── kernel.py
├── day77
└── main.py
├── day78
└── rmsnorm.py
├── day79
└── main.py
├── day80
└── kernel.py
├── day81
└── main.py
├── day82
└── rope.py
├── day83
└── lin.py
├── day84
└── kernel.py
├── day85
└── TensorMatMul.cu
├── day86
└── hard_sigmoid.cu
├── day87
└── SymMatMul.cu
├── day88
└── MSE.cu
├── day89
└── LTMM.cu
├── day90
└── FrobeniusNorm.cu
├── day91
└── Hinge_Loss.cu
├── day92
└── 1D_Convolution.cu
├── day93
└── RMS_Normalization.cu
├── day94
└── ELU.cu
├── day95
└── 2D_Max_Pooling.cu
├── day96
└── Product_Over_Dimension.cu
├── day97
└── elu_optim.cu
├── day98
└── kernel.cpp
├── day99
└── kernel.cpp
├── notes
└── offsetcudatriton.md
└── nvidiadocs
└── addition.cu
/.gitignore:
--------------------------------------------------------------------------------
1 | /.vscode
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PROJECT_DIR := $(CURDIR)
2 |
3 | COLOR_RESET := \033[0m
4 | COLOR_GREEN := \033[32m
5 | COLOR_YELLOW := \033[33m
6 | COLOR_BLUE := \033[34m
7 | COLOR_RED := \033[31m
8 |
9 | CUDA_ARCH := sm_89 # Specify CUDA architecture (e.g., sm_89 for RTX 4070)
10 |
11 | all: build
12 |
13 | build: $(PROJECT_DIR)/$(dir)/$(program).out
14 |
15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cu
16 | @echo "$(COLOR_YELLOW)Building program $(program) in directory $(dir)...$(COLOR_RESET)"
17 | @nvcc -arch=$(CUDA_ARCH) -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -o $@ $< -lcuda
18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)"
19 |
20 | run: $(PROJECT_DIR)/$(dir)/$(program).out
21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)"
22 | @./$(dir)/$(program).out
23 |
24 | clean:
25 | @echo "$(COLOR_RED)Cleaning up .out files in directory $(dir)...$(COLOR_RESET)"
26 | @rm -f $(PROJECT_DIR)/$(dir)/*.out
27 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)"
28 |
29 | cleanall:
30 | @echo "$(COLOR_RED)Cleaning up all .out files in all directories...$(COLOR_RESET)"
31 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \;
32 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)"
33 |
34 | help:
35 | @echo "$(COLOR_BLUE)Usage instructions for Makefile:$(COLOR_RESET)"
36 | @echo ""
37 | @echo "$(COLOR_YELLOW)make dir=
program=$(COLOR_RESET) # Build the program .cu in directory "
38 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory "
39 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory "
40 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories"
41 | @echo ""
42 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)"
43 | @echo "$(COLOR_GREEN)make dir=day1 program=addition$(COLOR_RESET) # Build addition.cu in day1"
44 | @echo "$(COLOR_GREEN)make run dir=day1 program=addition$(COLOR_RESET) # Run addition.out in day1"
45 | @echo "$(COLOR_GREEN)make clean dir=day1$(COLOR_RESET) # Clean up .out files in day1"
46 | @echo "$(COLOR_GREEN)make cleanall$(COLOR_RESET) # Clean all .out files in all directories"
47 |
--------------------------------------------------------------------------------
/day01/addition.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void vectorAdd(const float* A , const float *B, float *C, int N){
5 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
6 | // blockIdx.x  -> index of this block within the grid
7 | // blockDim.x  -> number of threads per block (the window each block works on)
8 | // threadIdx.x -> index of this thread within its block (e.g. blockIdx.x=2, blockDim.x=256, threadIdx.x=5 -> idx = 2*256+5 = 517)
9 | if (idx < N) C[idx] = A[idx] + B[idx];
40 | vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A,d_B,d_C,N);
41 |
42 | cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost);
43 | for(int i =N-10;i<N;i++) std::cout << h_C[i] << " ";
--------------------------------------------------------------------------------
/day01/printAdd.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 |
4 | // Kernel to print threadIdx.x
5 | __global__ void printThreadIdx(int N) {
6 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if (idx < N) { // Ensure the thread is within bounds
9 | printf("Block: %d, Thread: %d, Global Index: %d\n", blockIdx.x, threadIdx.x, idx);
10 | }
11 | }
12 |
13 | int main() {
14 | const int N = 1024; // Number of elements
15 | const int threadsPerBlock = 256;
16 | const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
17 |
18 | // Launch the kernel
19 | printThreadIdx<<<blocksPerGrid, threadsPerBlock>>>(N);
20 |
21 | // Wait for the device to finish
22 | cudaDeviceSynchronize();
23 |
24 | return 0;
25 | }
26 |
--------------------------------------------------------------------------------
/day02/function.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __device__ float square(int x){
5 | return x*x;
6 | // a __device__ marked function can only be called from another device function
7 | // or a kernel method
8 | }
9 |
10 | __global__ void voidKernel(int *input,int *output,int N) {
11 | int i = blockIdx.x * blockDim.x + threadIdx.x;
12 | if (i < N){
13 | output[i] = square(input[i]);
14 | }
15 | }
16 |
17 |
18 | int main(){
19 | int N = 10; // size of input and output arrays
20 | int size = N*sizeof(int); // total memory to allocate for the arrays
21 | int *h_input = new int[N]; // allocate memory on the CPU
22 | int *h_output = new int[N]; // allocate memory on the CPU
23 |
24 | for(int i = 0;i<N;i++) h_input[i] = i;
37 | voidKernel<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, N); cudaMemcpy(h_output,d_output,size,cudaMemcpyDeviceToHost);
38 |
39 | std::cout << "Squared array: ";
40 | for (int i = 0; i < N; i++) {
41 | std::cout << h_output[i] << " ";
42 | }
43 | std::cout << std::endl;
44 |
45 | delete[] h_input;
46 | delete[] h_output;
47 | cudaFree(d_input);
48 | cudaFree(d_output);
49 |
50 | return 0;
51 | }
--------------------------------------------------------------------------------
/day02/function.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def __kernelfunction__(input_pointer, output_pointer, N,
7 | BLOCKSIZE: tl.constexpr):
8 | pid = tl.program_id(0) # Get the program (block) ID
9 |
10 | offset = pid * BLOCKSIZE + tl.arange(0, BLOCKSIZE)
11 | mask = offset < N
12 |
13 | input_data = tl.load(input_pointer + offset, mask=mask)
14 | output_data = tl.sqrt(input_data)
15 | tl.store(output_pointer + offset, output_data, mask=mask)
16 |
17 | def main():
18 | N = 10
19 |
20 | input_data = torch.arange(0, N, dtype=torch.float32)
21 | print("Input data:", input_data)
22 |
23 | output_data = torch.empty_like(input_data)
24 |
25 | input_ptr = input_data.to("cuda")
26 | output_ptr = output_data.to("cuda")
27 |
28 | BLOCKSIZE = 256
29 |
30 | GRID = (triton.cdiv(N, BLOCKSIZE),)
31 |
32 | __kernelfunction__[GRID](input_ptr, output_ptr, N, BLOCKSIZE=BLOCKSIZE)
33 |
34 | output_data = output_ptr.cpu()
35 | print("Output data:", output_data)
36 |
37 | if __name__ == "__main__":
38 | main()
39 |
--------------------------------------------------------------------------------
/day03/addMatrix.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cmath>
3 | #include <chrono>
4 | #include <cuda_runtime.h>
5 |
6 | void printMatrix(const float *Matrix, const int size = 16) {
7 | int rootSize = sqrt(size);
8 | for (int i = 0; i < rootSize; i++) {
9 | for (int j = 0; j < rootSize; j++) {
10 | std::cout << Matrix[i * rootSize + j] << " ";
11 | }
12 | std::cout << "\n";
13 | }
14 | }
15 |
16 | __global__ void matrixAddCUDA(const float *Matrix_A, const float *Matrix_B, float *Matrix_C,
17 | const int sizeX, const int sizeY) {
18 | int col = blockIdx.x * blockDim.x + threadIdx.x;
19 | int row = blockIdx.y * blockDim.y + threadIdx.y;
20 |
21 | if (row < sizeY && col < sizeX) {
22 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col];
23 | }
24 | }
25 |
26 | void matrixAddCPU(const float *Matrix_A, const float *Matrix_B, float *Matrix_C, int sizeX, int sizeY) {
27 | for (int row = 0; row < sizeY; row++) {
28 | for (int col = 0; col < sizeX; col++) {
29 | Matrix_C[row * sizeX + col] = Matrix_A[row * sizeX + col] + Matrix_B[row * sizeX + col];
30 | }
31 | }
32 | }
33 |
34 | void compareExecutionTime(const float *Matrix_A, const float *Matrix_B, float *Matrix_C,
35 | const int sizeX, const int sizeY) {
36 | const int matrixSize = sizeX * sizeY;
37 | const int matrixBytes = sizeof(float) * matrixSize;
38 |
39 | float *gpu_A, *gpu_B, *gpu_C;
40 | cudaMalloc((void **)&gpu_A, matrixBytes);
41 | cudaMalloc((void **)&gpu_B, matrixBytes);
42 | cudaMalloc((void **)&gpu_C, matrixBytes);
43 |
44 | cudaMemcpy(gpu_A, Matrix_A, matrixBytes, cudaMemcpyHostToDevice);
45 | cudaMemcpy(gpu_B, Matrix_B, matrixBytes, cudaMemcpyHostToDevice);
46 |
47 | int BLOCK_SIZE = 32;
48 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
49 | dim3 gridDim((sizeX + BLOCK_SIZE - 1) / BLOCK_SIZE, (sizeY + BLOCK_SIZE - 1) / BLOCK_SIZE);
50 |
51 | auto startCPU = std::chrono::high_resolution_clock::now();
52 | matrixAddCPU(Matrix_A, Matrix_B, Matrix_C, sizeX, sizeY);
53 | auto endCPU = std::chrono::high_resolution_clock::now();
54 |
55 | auto startCUDA = std::chrono::high_resolution_clock::now();
56 | matrixAddCUDA<<<gridDim, blockDim>>>(gpu_A, gpu_B, gpu_C, sizeX, sizeY);
57 | cudaDeviceSynchronize();
58 | auto endCUDA = std::chrono::high_resolution_clock::now();
59 |
60 | cudaMemcpy(Matrix_C, gpu_C, matrixBytes, cudaMemcpyDeviceToHost);
61 |
62 | std::chrono::duration<double> cpuDuration = endCPU - startCPU;
63 | std::chrono::duration<double> cudaDuration = endCUDA - startCUDA;
64 | std::cout << "CPU Execution Time: " << cpuDuration.count() << " seconds\n";
65 | std::cout << "CUDA Execution Time: " << cudaDuration.count() << " seconds\n";
66 |
67 | cudaFree(gpu_A);
68 | cudaFree(gpu_B);
69 | cudaFree(gpu_C);
70 | }
71 |
72 | int main() {
73 | const int sizeX = 1024*16;
74 | const int sizeY = 1024*16;
75 | const int matrixSize = sizeX * sizeY;
76 |
77 | float *cpu_A = new float[matrixSize];
78 | float *cpu_B = new float[matrixSize];
79 | float *cpu_C = new float[matrixSize];
80 |
81 | for (int i = 0; i < matrixSize; i++) {
82 | cpu_A[i] = 10.0f;
83 | cpu_B[i] = static_cast<float>(i);
84 | }
85 |
86 | compareExecutionTime(cpu_A, cpu_B, cpu_C, sizeX, sizeY);
87 |
88 | delete[] cpu_A;
89 | delete[] cpu_B;
90 | delete[] cpu_C;
91 |
92 | return 0;
93 | }
94 |
--------------------------------------------------------------------------------
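A note on the timing in /day03/addMatrix.cu above: the std::chrono pair brackets the kernel launch plus cudaDeviceSynchronize, so it also counts host-side launch overhead. A minimal alternative sketch (an assumption, not part of the original file) that times only the kernel with CUDA events, reusing the gridDim/blockDim and device pointers already defined in compareExecutionTime:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matrixAddCUDA<<<gridDim, blockDim>>>(gpu_A, gpu_B, gpu_C, sizeX, sizeY);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait for the kernel and the stop event
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
std::cout << "CUDA kernel time: " << ms << " ms\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);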
/day03/addMatrix.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import torch
3 | import triton.language as tl
4 |
5 |
6 | @triton.jit
7 | def addMatrix(Matrix_A,Matrix_B,Matrix_C,sizeX,sizeY,BLOCK_SIZE:tl.constexpr):
8 |
9 | pid_x = tl.program_id(0) # we have the rows
10 | pid_y = tl.program_id(1) # we have the columns
11 |
12 | row_start = pid_x*BLOCK_SIZE
13 | col_start = pid_y*BLOCK_SIZE
14 |
15 | row_indices = row_start + tl.arange(0,BLOCK_SIZE)
16 | col_indices = col_start + tl.arange(0,BLOCK_SIZE)
17 |
18 | row_indices = row_indices[:,None]
19 | col_indices = col_indices[None,:]
20 |
21 | row_mask = row_indices < sizeY
22 | col_mask = col_indices < sizeX
23 | valid_mask = row_mask & col_mask
24 |
25 | flat_indicies = row_indices * sizeX + col_indices
26 |
27 | A = tl.load(Matrix_A + flat_indicies,mask =valid_mask,other=0.0)
28 | B = tl.load(Matrix_B + flat_indicies,mask = valid_mask,other = 0.0)
29 |
30 | C = A+B;
31 |
32 | tl.store(Matrix_C+flat_indicies,C,mask=valid_mask)
33 |
34 |
35 | def test_addMatrix():
36 | sizeX = 8
37 | sizeY = 8
38 | BLOCK_SIZE = 2
39 |
40 | Matrix_A = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32)
41 | Matrix_B = torch.randn(sizeY, sizeX, device='cuda', dtype=torch.float32)
42 | Matrix_C = torch.zeros_like(Matrix_A, device='cuda', dtype=torch.float32)
43 |
44 | Matrix_A_flat = Matrix_A.flatten()
45 | Matrix_B_flat = Matrix_B.flatten()
46 | Matrix_C_flat = Matrix_C.flatten()
47 |
48 | grid = (triton.cdiv(sizeX, BLOCK_SIZE), triton.cdiv(sizeY, BLOCK_SIZE))
49 | addMatrix[grid](Matrix_A_flat, Matrix_B_flat, Matrix_C_flat, sizeX, sizeY, BLOCK_SIZE)
50 |
51 | Matrix_C = Matrix_C_flat.reshape(sizeY, sizeX)
52 |
53 | expected = Matrix_A + Matrix_B
54 | print("Matrix A:\n", Matrix_A)
55 | print("Matrix B:\n", Matrix_B)
56 | print("Matrix C (Triton):\n", Matrix_C)
57 | print("Expected (PyTorch):\n", expected)
58 | assert torch.allclose(Matrix_C, expected), "Triton result does not match PyTorch result!"
59 |
60 | test_addMatrix()
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/day03/anotherMatrix.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __device__ float randomFunction(float x, float y)
5 | {
6 | return x + y * 2;
7 | }
8 |
9 | __global__ void matrixFunction(const float *A, const float *B, float *C, const int size)
10 | {
11 | int i = blockIdx.x * blockDim.x + threadIdx.x;
12 | int j = blockIdx.y * blockDim.y + threadIdx.y;
13 |
14 | if (i < size && j < size)
15 | {
16 | C[i + size * j] = randomFunction(A[i + size * j], B[i + size * j]);
17 | }
18 | }
19 |
20 | int main()
21 | {
22 | int N = 8;
23 | int BLOCK_SIZE = 2;
24 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
25 | dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
26 | int size = sizeof(float) * N * N;
27 |
28 | float *A,*B,*C;
29 | float *dA,*dB,*dC;
30 | A = new float[N*N];
31 | B = new float[N*N];
32 | C = new float[N*N];
33 |
34 | cudaMalloc((void**)&dA,size);
35 | cudaMalloc((void**)&dB,size);
36 | cudaMalloc((void**)&dC,size);
37 |
38 | for (int i = 0; i < N; ++i) {
39 | for (int j = 0; j < N; ++j) {
40 | A[i + N * j] = 1.0f;
41 | B[i + N * j] = 2.0f;
42 | }
43 | }
44 |
45 | cudaMemcpy(dA,A,size,cudaMemcpyHostToDevice);
46 | cudaMemcpy(dB,B,size,cudaMemcpyHostToDevice);
47 |
48 | // now we have everything set up
49 | matrixFunction<<<gridDim, blockDim>>>(dA,dB,dC,N);
50 | cudaDeviceSynchronize();
51 |
52 | cudaMemcpy(C,dC,size,cudaMemcpyDeviceToHost);
53 |
54 | for (int i = 0; i < N*N; i++) {
55 | std::cout << C[i] << " ";
56 | if ((i + 1) % N == 0) std::cout << std::endl;
57 | }
58 | }
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionKernel.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernel.cu:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <cuda_runtime.h>
3 |
4 | template <typename T>
5 | __global__ void addKernel(T* input, int arraySize) {
6 | int idx = threadIdx.x + blockIdx.x * blockDim.x;
7 | if (idx < arraySize) {
8 | input[idx] += 10;
9 | }
10 | }
11 |
12 | void addition(torch::Tensor& input, int arraySize) {
13 | int threads_per_block = 256;
14 | int blocks = (arraySize + threads_per_block - 1) / threads_per_block;
15 |
16 | AT_DISPATCH_FLOATING_TYPES(input.type(), "arrayAddition", [&]() {
17 | addKernel<scalar_t><<<blocks, threads_per_block>>>(input.data_ptr<scalar_t>(), arraySize);
18 | });
19 | cudaDeviceSynchronize();
20 |
21 | auto err = cudaGetLastError();
22 | if (err != cudaSuccess) {
23 | TORCH_CHECK(false, "CUDA error: ", cudaGetErrorString(err));
24 | }
25 | }
--------------------------------------------------------------------------------
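A note on the AT_DISPATCH_FLOATING_TYPES call above: the macro switches on the tensor's runtime dtype and instantiates the lambda once per floating-point type, with scalar_t bound to that type. Roughly, and only as a simplified sketch (not the actual macro expansion), the dispatch behaves like:

switch (input.scalar_type()) {
    case at::kFloat: {
        using scalar_t = float;
        addKernel<scalar_t><<<blocks, threads_per_block>>>(input.data_ptr<scalar_t>(), arraySize);
        break;
    }
    case at::kDouble: {
        using scalar_t = double;
        addKernel<scalar_t><<<blocks, threads_per_block>>>(input.data_ptr<scalar_t>(), arraySize);
        break;
    }
    default:
        TORCH_CHECK(false, "arrayAddition: unsupported dtype");
}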
/day06/AdditionKernel/additionKernel.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: additionKernel
3 | Version: 0.0.1
4 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernel.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | additionKernel.cu
2 | additionKernelBinding.cpp
3 | setup.py
4 | additionKernel.egg-info/PKG-INFO
5 | additionKernel.egg-info/SOURCES.txt
6 | additionKernel.egg-info/dependency_links.txt
7 | additionKernel.egg-info/top_level.txt
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernel.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernel.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | additionKernel
2 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionKernelBinding.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | void addition(torch::Tensor& input, int arraySize);
4 |
5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6 | m.def("addition", &addition, "Adds 10 to each element of the tensor");
7 | }
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/additionkernel.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionkernel.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: additionkernel
3 | Version: 0.0.0
4 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionkernel.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | additionKernel.cu
2 | additionKernelBinding.cpp
3 | setup.py
4 | additionkernel.egg-info/PKG-INFO
5 | additionkernel.egg-info/SOURCES.txt
6 | additionkernel.egg-info/dependency_links.txt
7 | additionkernel.egg-info/top_level.txt
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionkernel.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/additionkernel.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | additionkernel
2 |
--------------------------------------------------------------------------------
/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionKernel.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/lib.linux-x86_64-cpython-312/additionkernel.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernel.o
--------------------------------------------------------------------------------
/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/AdditionKernel/build/temp.linux-x86_64-cpython-312/additionKernelBinding.o
--------------------------------------------------------------------------------
/day06/AdditionKernel/pythontest.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import additionkernel
3 |
4 | input_tensor = torch.randn(100).cuda()
5 | additionkernel.addition(input_tensor, input_tensor.size(0))
6 | print("Result after addition:", input_tensor)
--------------------------------------------------------------------------------
/day06/AdditionKernel/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | setup(
5 | name='additionkernel',
6 | ext_modules=[
7 | CUDAExtension(
8 | name='additionkernel',
9 | sources=[
10 | 'additionKernelBinding.cpp',
11 | 'additionKernel.cu',
12 | ]
13 | )
14 | ],
15 | cmdclass={
16 | 'build_ext': BuildExtension
17 | }
18 | )
--------------------------------------------------------------------------------
/day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/lib.linux-x86_64-cpython-312/example_kernels.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcall.o
--------------------------------------------------------------------------------
/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/build/temp.linux-x86_64-cpython-312/rollcallbinding.o
--------------------------------------------------------------------------------
/day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day06/ImportingToPython/example_kernels.cpython-312-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/day06/ImportingToPython/example_kernels.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: example_kernels
3 | Version: 0.0.1
4 |
--------------------------------------------------------------------------------
/day06/ImportingToPython/example_kernels.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | rollcall.cu
2 | rollcallbinding.cpp
3 | setup.py
4 | example_kernels.egg-info/PKG-INFO
5 | example_kernels.egg-info/SOURCES.txt
6 | example_kernels.egg-info/dependency_links.txt
7 | example_kernels.egg-info/top_level.txt
--------------------------------------------------------------------------------
/day06/ImportingToPython/example_kernels.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/day06/ImportingToPython/example_kernels.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | example_kernels
2 |
--------------------------------------------------------------------------------
/day06/ImportingToPython/pythontest.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import example_kernels
3 | example_kernels.rollcall()
--------------------------------------------------------------------------------
/day06/ImportingToPython/rollcall.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void roll_call_kernel() {
5 | const int threadIndex = threadIdx.x;
6 | printf("Thread %d here!\n", threadIndex);
7 | printf("Te iubesc atat de mult: %d \n",threadIndex*1000);
8 | }
9 |
10 | void roll_call_launcher() {
11 | roll_call_kernel<<<1, 5>>>();
12 | cudaDeviceSynchronize();
13 | }
14 |
15 | int main() {
16 | roll_call_launcher();
17 | return 0;
18 | }
--------------------------------------------------------------------------------
/day06/ImportingToPython/rollcallbinding.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <pybind11/pybind11.h>
3 |
4 | void roll_call_launcher();
5 |
6 | void roll_call_binding(){
7 | roll_call_launcher();
8 | }
9 |
10 | PYBIND11_MODULE(example_kernels, m) {
11 | m.def(
12 | "rollcall", // Name of the Python function to create
13 | &roll_call_binding, // Corresponding C++ function to call
14 | "Launches the roll_call kernel" // Docstring
15 | );
16 | }
17 |
--------------------------------------------------------------------------------
/day06/ImportingToPython/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | __version__ = "0.0.1"
5 |
6 | ext_modules = [
7 | CUDAExtension('example_kernels',
8 | [
9 | 'rollcallbinding.cpp',
10 | 'rollcall.cu',
11 | ])
12 | ]
13 |
14 | setup(
15 | name="example_kernels",
16 | version=__version__,
17 | ext_modules=ext_modules,
18 | cmdclass={"build_ext": BuildExtension}
19 | )
20 |
--------------------------------------------------------------------------------
/day06/SMBlocks.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void sm_roll_call() {
5 | const int threadIndex = threadIdx.x;
6 |
7 | uint streamingMultiprocessorId;
8 | asm("mov.u32 %0, %smid;" : "=r"(streamingMultiprocessorId) );
9 |
10 | printf("Thread %d running on SM %d!\n", threadIndex, streamingMultiprocessorId);
11 | }
12 |
13 | int main() {
14 | sm_roll_call<<<4, 2>>>();
15 | cudaDeviceSynchronize();
16 | return 0;
17 | }
--------------------------------------------------------------------------------
/day06/SoftMax.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | __global__ void SoftMaxNaive(float *input,float *output,int size){
5 | int numThreads = blockDim.x;
6 |
7 |
8 | //each thread to compute softmax for this:
9 | int numElementsPerThread = size/numThreads;
10 |
11 | int threadIndex = threadIdx.x;
12 |
13 | int startIndex = threadIndex * numElementsPerThread;
14 | int endIndex = min(size, startIndex + numElementsPerThread);
15 |
16 |
17 | float MaxValue = 0.0;
18 | for (int i = 0; i < size; i++) {
19 | if (input[i] > MaxValue) {
20 | MaxValue = input[i];
21 | }
22 | }
23 |
24 | float sumExp = 0.0;
25 | for(int i =0;i MaxValue) {
47 | MaxValue = input[i];
48 | }
49 | }
50 | SharedMaxValue[threadIndex] = MaxValue;
51 | __syncthreads();
52 | for (int i = 0; i < numThreads; i++) {
53 | if (SharedMaxValue[i] > MaxValue) {
54 | MaxValue = SharedMaxValue[i];
55 | }
56 | }
57 |
58 |
59 | /// Now we need to calcualte the SumExp
60 | __shared__ float sharedSumExp[numThreads];
61 | float sumExp = 0.0;
62 | for(int i =startIndex;i<endIndex;i++)
--------------------------------------------------------------------------------
/day06/TransposeMatrix.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void transposeKernel(int *A, int *B)
5 | {
6 | const int idx = threadIdx.x + threadIdx.y * blockDim.x;
7 | // threadIdx.x -> column index within the block
8 | // threadIdx.y -> row index within the block
9 | // blockDim.x -> the width of a row (threads per row)
10 | // so idx is the element's position in the flattened, row-major matrix
11 |
12 | // 1 2 3 1 2 5
13 | // 2 3 4 -> 2 3 2
14 | // 5 2 1 3 4 1
15 | const int outidx = threadIdx.y + threadIdx.x * blockDim.y;
16 | B[outidx] = A[idx];
17 | }
18 |
19 | int main()
20 | {
21 | int rows = 3;
22 | int cols = 3;
23 | int sizeMatrix = rows * cols;
24 | int *Matrix = (int *)malloc(sizeof(int) * cols * rows);
25 | for (int i = 0; i < sizeMatrix; i++)
26 | {
27 | Matrix[i] = i;
28 | }
29 | for (int i = 0; i < sizeMatrix; i++)
30 | {
31 | std::cout << Matrix[i] << " ";
32 | if (i % cols == cols - 1)
33 | std::cout << std::endl;
34 | }
35 |
36 | int *MatrixD, *MatrixOut;
37 | cudaMalloc((void **)&MatrixD, sizeMatrix * sizeof(int));
38 | cudaMalloc((void **)&MatrixOut, sizeMatrix * sizeof(int));
39 | cudaMemcpy(MatrixD, Matrix, sizeMatrix * sizeof(int), cudaMemcpyHostToDevice);
40 |
41 | dim3 numThreadsPerBlock(rows, cols);
42 |
43 | cudaFuncSetAttribute(
44 | transposeKernel,
45 | cudaFuncAttributePreferredSharedMemoryCarveout,
46 | 20 // Use 20% of combined L1/Shared Memory for Shared Memory
47 | );
48 | transposeKernel<<<1, numThreadsPerBlock>>>(MatrixD, MatrixOut);
49 |
50 | cudaMemcpy(Matrix, MatrixOut, sizeMatrix * sizeof(int), cudaMemcpyDeviceToHost);
51 | std::cout << "\nTransposed\n";
52 | for (int i = 0; i < sizeMatrix; i++)
53 | {
54 | std::cout << Matrix[i] << " ";
55 | if (i % rows == rows - 1)
56 | std::cout << std::endl;
57 | }
58 |
59 | cudaFree(MatrixD);
60 | cudaFree(MatrixOut);
61 | free(Matrix);
62 |
63 | return 0;
64 | }
--------------------------------------------------------------------------------
/day06/note:
--------------------------------------------------------------------------------
1 | I will work more on this day to surprise my biggest supporter in this Journey :D
2 | I will start with this tutorial : https://tinkerd.net/blog/machine-learning/cuda-basics/
3 | And later this day will continue with working on the softmax forward + backward
--------------------------------------------------------------------------------
/day07/conv1d.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void conv1D(float *X, float *K, float *Y, int input_size, int kernel_size)
5 | {
6 |
7 | extern __shared__ float shared[];
8 |
9 | int i = blockIdx.x * blockDim.x + threadIdx.x;
10 |
11 | int radius = kernel_size / 2;
12 |
13 | int sharedIdx = threadIdx.x + radius; // the main element from the conv
14 | // the index starts at `radius` so there is room on the left for the halo elements
15 | // load into shared memory all the elements the filter will need for this block
16 | shared[sharedIdx] = (i < input_size) ? X[i] : 0.0f; // each thread loads its own element
17 | if (threadIdx.x < radius) // the first `radius` threads also load the left/right halos
18 | {
19 | int left = i - radius;
20 | int right = i + blockDim.x;
21 | shared[threadIdx.x] = (left >= 0) ? X[left] : 0.0f;
22 | shared[sharedIdx + blockDim.x] = (right < input_size) ? X[right] : 0.0f;
23 | }
24 |
25 | __syncthreads();
26 |
27 | float sum = 0.0;
28 | for (int j = -radius; j <= radius; j++)
29 | {
30 | sum += shared[sharedIdx + j] * K[radius + j];
31 | // we iterate j from -radius to radius, e.g. -2 -1 0 1 2 for a 5-tap kernel,
32 | // and accumulate the weighted neighbours of the centre element
33 | }
34 |
35 | if (i < input_size)
36 | {
37 | Y[i] = sum;
38 | }
39 | }
40 |
41 | int main()
42 | {
43 | int N = 1024; // size of the vector
44 | int BlockSize = 256; // size of the block we use
45 | int GridSize = (N + BlockSize - 1) / BlockSize; // size of the grid we use. Also ceil function
46 |
47 | const int KernelSize = 5;
48 | float Kernel[KernelSize] = {1.0f, 2.0f, 1.0f, 1.0f, -2.0f};
49 | int radius = KernelSize / 2;
50 | int SharedMemory = (BlockSize + 2 * radius) * sizeof(float);
51 |
52 | float *Xcpu, *Ycpu;
53 | float *Xgpu, *Ygpu, *Kgpu;
54 |
55 | Xcpu = (float *)malloc(N * sizeof(float));
56 | Ycpu = (float *)malloc(N * sizeof(float));
57 | // we already have declared our kernel;
58 |
59 | for (int i = 0; i < N; i++)
60 | {
61 | Xcpu[i] = 1;
62 | }
63 |
64 | // now lets launch this data in the air baby
65 | cudaMalloc((void **)&Xgpu, N * sizeof(float));
66 | cudaMalloc((void **)&Ygpu, N * sizeof(float));
67 | cudaMalloc((void **)&Kgpu, KernelSize * sizeof(float));
68 | cudaMemcpy(Xgpu, Xcpu, N * sizeof(float), cudaMemcpyHostToDevice);
69 | cudaMemcpy(Kgpu, Kernel, KernelSize * sizeof(float), cudaMemcpyHostToDevice);
70 |
71 | conv1D<<<GridSize, BlockSize, SharedMemory>>>(Xgpu, Kgpu, Ygpu, N, KernelSize);
72 |
73 | cudaMemcpy(Ycpu, Ygpu, N * sizeof(float), cudaMemcpyDeviceToHost);
74 |
75 | std::cout << "First 10 elements " << std::endl;
76 | for (size_t i = 0; i < 10; i++)
77 | {
78 | std::cout << Xcpu[i] << " ";
79 | }
80 |
81 | std::cout << "\nFirst 10 elements after the convolution op" << std::endl;
82 | for (size_t i = 0; i < 10; i++)
83 | {
84 | std::cout << Ycpu[i] << " ";
85 | }
86 |
87 | free(Xcpu);
88 | free(Ycpu);
89 | cudaFree(Xgpu);
90 | cudaFree(Ygpu);
91 | cudaFree(Kgpu);
92 |
93 | return 0;
94 | }
95 |
--------------------------------------------------------------------------------
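A host-side reference makes the conv1d.cu kernel above easy to check. A minimal sketch (an assumption, not part of the original file) that applies the same zero-padded 1D convolution on the CPU, so the kernel's Ycpu output can be compared element by element:

void conv1D_cpu(const float *X, const float *K, float *Y, int input_size, int kernel_size)
{
    int radius = kernel_size / 2;
    for (int i = 0; i < input_size; i++)
    {
        float sum = 0.0f;
        for (int j = -radius; j <= radius; j++)
        {
            int idx = i + j;
            if (idx >= 0 && idx < input_size)   // zero padding outside the borders
                sum += X[idx] * K[radius + j];
        }
        Y[i] = sum;
    }
}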
/day07/globalMemoryCoalescing.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
5 | int M = 10;
6 | int N = 10;
7 |
8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1
10 |
11 | __global__ void sgemm_naive(int M, int N, int K, float alpha,
12 | const float *A, const float *B, float beta, float *C)
13 | {
14 | const int x = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
15 | const int y = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);
16 |
17 | if (x < M && y < N)
--------------------------------------------------------------------------------
/day07/matmul.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cmath>
3 | #define BLOCK_SIZE 32
4 |
5 | __global__ void matmulKernel(float *A, float *B, float *C, int dim)
6 | {
7 | int i, j; // i and j indexes
8 | float temp = 0; // temp value
9 |
10 | int row = blockIdx.y * blockDim.y + threadIdx.y;
11 | int col = blockIdx.x * blockDim.x + threadIdx.x;
12 |
13 | __shared__ float ASharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory for shared
14 | __shared__ float BSharedT[BLOCK_SIZE][BLOCK_SIZE]; // we allocate memory for shared
15 |
16 | for (int tileNUM = 0; tileNUM < gridDim.x; tileNUM++)
17 | {
18 | j = tileNUM * BLOCK_SIZE + threadIdx.x;
19 | i = tileNUM * BLOCK_SIZE + threadIdx.y;
20 |
21 | ASharedT[threadIdx.y][threadIdx.x] = A[i * dim + j];
22 | BSharedT[threadIdx.y][threadIdx.x] = B[i * dim + j];
23 |
24 | __syncthreads();
25 |
26 | for (int k = 0; k < BLOCK_SIZE; k++)
27 | {
28 | temp += ASharedT[threadIdx.y][k] * BSharedT[k][threadIdx.x];
29 | }
30 |
31 | __syncthreads();
32 | }
33 | C[row * dim + col] = temp;
34 | }
35 |
36 | int main()
37 | {
38 | int N = 1024;
39 | float *Acpu, *Bcpu, *Ccpu;
40 | float *Agpu, *Bgpu, *Cgpu;
41 |
42 | Acpu = (float *)malloc(N * N * sizeof(float));
43 | Bcpu = (float *)malloc(N * N * sizeof(float));
44 | Ccpu = (float *)malloc(N * N * sizeof(float));
45 |
46 | for (int i = 0; i < N * N; i++)
47 | {
48 | Acpu[i] = sin(i);
49 | Bcpu[i] = cos(i);
50 | }
51 |
52 | size_t vectorSize = N * N * sizeof(float);
53 |
54 | cudaMalloc((void **)&Agpu, vectorSize);
55 | cudaMalloc((void **)&Bgpu, vectorSize);
56 | cudaMalloc((void **)&Cgpu, vectorSize);
57 | cudaMemcpy(Agpu, Acpu, vectorSize, cudaMemcpyHostToDevice);
58 | cudaMemcpy(Bgpu, Bcpu, vectorSize, cudaMemcpyHostToDevice);
59 |
60 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
61 | dim3 gridDim(N / BLOCK_SIZE, N / BLOCK_SIZE);
62 |
63 | cudaEvent_t start, stop;
64 | cudaEventCreate(&start);
65 | cudaEventCreate(&stop);
66 | cudaEventRecord(start, 0);
67 |
68 | matmulKernel<<<gridDim, blockDim>>>(Agpu, Bgpu, Cgpu, N);
69 |
70 | cudaEventRecord(stop, 0);
71 | cudaEventSynchronize(stop);
72 | float et;
73 | cudaEventElapsedTime(&et, start, stop);
74 | cudaEventDestroy(start);
75 | cudaEventDestroy(stop);
76 |
77 | cudaMemcpy(Ccpu, Cgpu, vectorSize, cudaMemcpyDeviceToHost);
78 |
79 | printf("GPU time= %f ms\n", et);
80 |
81 | free(Acpu);
82 | free(Bcpu);
83 | free(Ccpu);
84 | cudaFree(Agpu);
85 | cudaFree(Bgpu);
86 | cudaFree(Cgpu);
87 |
88 | return 0;
89 | }
--------------------------------------------------------------------------------
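Worked numbers for the tiled matmul above, using the values from main(): with N = 1024 and BLOCK_SIZE = 32, gridDim is 32 x 32 blocks and the tile loop runs gridDim.x = 32 times, so each thread accumulates 32 tiles x 32 multiply-adds = 1024 products, i.e. one full row-column dot product. Thanks to the shared-memory tiles, each element of A and B is fetched from global memory once per block that needs it (32 times here) instead of once per product (1024 times in the naive kernel).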
/day07/naive.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
5 | int M = 10;
6 | int N = 10;
7 |
8 | dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32), 1);
9 | dim3 blockDim(32, 32, 1); // 32 * 32 * 1
10 |
11 | __global__ void sgemm_naive(int M, int N, int K, float alpha,
12 | const float *A, const float *B, float beta, float *C)
13 | {
14 | const uint x = blockIdx.x * blockDim.x + threadIdx.x;
15 | const uint y = blockIdx.y * blockDim.y + threadIdx.y;
16 |
17 | if (x < M && y < N)
--------------------------------------------------------------------------------
/day08/pmpbook/chapter3matvecmul.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | #define CUDA_CHECK(err) \
5 | { \
6 | cuda_assert((err), __FILE__, __LINE__); \
7 | }
8 | inline void cuda_assert(cudaError_t code, const char *file, int line)
9 | {
10 | if (code != cudaSuccess)
11 | {
12 | std::cerr << "CUDA Error: " << cudaGetErrorString(code)
13 | << " in " << file << ":" << line << std::endl;
14 | exit(1);
15 | }
16 | }
17 |
18 | __global__ void matrixveckernel(const float *A,const float*b,float*C,const int N){
19 | // N the size of the NxN A matrix
20 | // N aslo the size of the vector
21 | // we need so that each thread will iterate the row
22 |
23 | int i = blockIdx.x * blockDim.x + threadIdx.x;
24 | // we got
25 |
26 | if(i>>(dA,db,dc,N);
53 | CUDA_CHECK(cudaGetLastError());
54 |
55 | CUDA_CHECK(cudaMemcpy(c,dc,sizeb,cudaMemcpyDeviceToHost));
56 |
57 |
58 | CUDA_CHECK(cudaFree(dA));
59 | CUDA_CHECK(cudaFree(db));
60 | CUDA_CHECK(cudaFree(dc));
61 |
62 | }
63 |
64 | int main(){
65 | int N = 1024;
66 | float *A = new float[N * N];
67 | float *b = new float[N];
68 |
69 | for(int i = 0 ;i<N*N;i++){
--------------------------------------------------------------------------------
/day08/pmpbook/color2gray.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void color2graykernel(const float* R, const float*G,const float*B,float *O,const int n){
5 | // assume the matrix is nxn;
6 |
7 | int i = blockIdx.x * blockDim.x + threadIdx.x; // so this will be for columns
8 | int j = blockIdx.y * blockDim.y + threadIdx.y; // this will be for rows
9 |
10 |
11 | if( i>>(d_r,d_g,d_b,d_o,n);
34 |
35 | float *O = (float*)malloc(size);
36 | cudaMemcpy(O,d_o,size,cudaMemcpyDeviceToHost);
37 |
38 | cudaFree(d_r);
39 | cudaFree(d_g);
40 | cudaFree(d_b);
41 | cudaFree(d_o);
42 |
43 | return O;
44 | }
--------------------------------------------------------------------------------
/day08/pmpbook/deviceinfo.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cuda_runtime.h>
3 |
4 | int main()
5 | {
6 | int dev_count;
7 | cudaGetDeviceCount(&dev_count);
8 | std::cout << "Devices are : " << dev_count << std::endl;
9 |
10 | cudaDeviceProp dev_prop;
11 | for (int i = 0; i < dev_count; ++i)
12 | {
13 | cudaGetDeviceProperties(&dev_prop, i);
14 | }
15 | std::cout << "Max Threads per Block : " << dev_prop.maxThreadsPerBlock << std::endl;
16 | std::cout << "Max Threads per MultiProcessor :" << dev_prop.maxThreadsPerMultiProcessor << std::endl;
17 | std::cout << "Max Blocks per MultiProcessor : " << dev_prop.maxBlocksPerMultiProcessor << std::endl;
18 | std::cout << "Clock rate : " << dev_prop.clockRate << std::endl;
19 | std::cout << "Max Grid Size (X,Y,Z) : (" << dev_prop.maxGridSize[0] << "," << dev_prop.maxGridSize[1] << "," << dev_prop.maxGridSize[2] << ")" << std::endl;
20 | std::cout << "Max Threads Dim (X,Y,Z) : (" << dev_prop.maxThreadsDim[0] << "," << dev_prop.maxThreadsDim[1] << "," << dev_prop.maxThreadsDim[2] << ")" << std::endl;
21 | std::cout << "Max Shared Memory per Block : " << dev_prop.sharedMemPerBlock << std::endl;
22 | std::cout << "Max Shared Memory per MultiProcessor : " << dev_prop.sharedMemPerMultiprocessor << std::endl;
23 | std::cout << "Max Registers per Block : " << dev_prop.regsPerBlock << std::endl;
24 | std::cout << "Max Registers per MultiProcessor : " << dev_prop.regsPerMultiprocessor << std::endl;
25 | std::cout << "Warp Size : " << dev_prop.warpSize << std::endl;
26 | std::cout << "Max Threads per Warp : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl;
27 | std::cout << "Max Warps per MultiProcessor : " << dev_prop.maxThreadsPerMultiProcessor / dev_prop.warpSize << std::endl;
28 | std::cout << "Max Warps per Block : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize << std::endl;
29 | std::cout << "Max Warps per Grid : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] << std::endl;
30 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl;
31 | std::cout << "Max Blocks per Device : " << dev_prop.maxBlocksPerMultiProcessor * dev_prop.multiProcessorCount << std::endl;
32 | std::cout << "Max Threads per Device : " << dev_prop.maxThreadsPerBlock * dev_prop.multiProcessorCount << std::endl;
33 | std::cout << "Max Warps per Device : " << dev_prop.maxThreadsPerBlock / dev_prop.warpSize * dev_prop.maxGridSize[0] * dev_prop.maxGridSize[1] * dev_prop.maxGridSize[2] * dev_prop.multiProcessorCount << std::endl;
34 |
35 | }
--------------------------------------------------------------------------------
/day08/pmpbook/imageblur.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | __global__ void imageblurkernel(const float *A, float *C, const int sizeArray, const int sizeKernel)
5 | {
6 | int i = blockIdx.x * blockDim.x + threadIdx.x;
7 | int j = blockIdx.y * blockDim.y + threadIdx.y;
8 |
9 | int radius = sizeKernel / 2;
10 |
11 | // 1 2 3 2
12 | // 4 5 6 2
13 | // 1 2 3 2
14 | // 5 6 7 2
15 | //
16 | // So let's say we are at index = 1, the first element:
17 | // we need to average the pixels in the window around it,
18 | // and we only add a pixel to the sum when it does not fall outside the image
19 | if (i < sizeArray && j < sizeArray)
20 | {
21 | float PixelValue = 0.0;
22 | int pixels = 0;
23 | for (int blurRow = -radius; blurRow <= radius; blurRow++)
24 | {
25 | for (int blurCol = -radius; blurCol <= radius; blurCol++)
26 | {
27 | // so now we are in the kernel
28 | int curRow = i + blurRow;
29 | int curCol = j + blurCol;
30 |
31 | if (curRow >= 0 && curRow < sizeArray && curCol >= 0 && curCol < sizeArray)
32 | {
33 | PixelValue += A[curRow * sizeArray + curCol];
34 | pixels++;
35 | }
36 | }
37 | }
38 | C[sizeArray * j + i] = PixelValue / pixels;
39 | }
40 | }
--------------------------------------------------------------------------------
/day08/pmpbook/vecaddition.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | //// CHAPTER 2 DONE
5 | __global__ void addkernel(float *a, float *b, float *c, int N)
6 | {
7 | int i = blockIdx.x * blockDim.x + threadIdx.x;
8 | if (i < N)
9 | {
10 | c[i] = a[i] + b[i];
11 | }
12 | }
13 |
14 |
15 | void vecAdd(float *A, float *B, float*C,int n){
16 | int size = n*sizeof(float);
17 | float *d_A, *d_B, *d_C;
18 |
19 | cudaMalloc((void**)&d_A, size);
20 | cudaMalloc((void**)&d_B, size);
21 | cudaMalloc((void**)&d_C, size);
22 |
23 | cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
24 | cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
25 |
26 | dim3 dimGrid(ceil(n/256.0),1,1);
27 | dim3 dimBlock(256,1,1);
28 | addkernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
29 | // launches a grid of 4 blocks with 256 threads per block
30 |
31 | cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
32 | cudaFree(d_A);
33 | cudaFree(d_B);
34 | cudaFree(d_C);
35 | }
--------------------------------------------------------------------------------
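vecAdd above has no driver code. A minimal usage sketch (an assumption, not part of the original file; the array names and size are illustrative, and it needs <iostream>) that produces the 4-block grid mentioned in the comment:

int main() {
    const int n = 1024;                      // ceil(1024/256.0) = 4 blocks
    float *A = new float[n], *B = new float[n], *C = new float[n];
    for (int i = 0; i < n; i++) { A[i] = 1.0f; B[i] = 2.0f; }
    vecAdd(A, B, C, n);
    for (int i = 0; i < 5; i++) std::cout << C[i] << " ";   // expect: 3 3 3 3 3
    std::cout << std::endl;
    delete[] A; delete[] B; delete[] C;
    return 0;
}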
/day09/bind.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | torch::Tensor forward(torch::Tensor Q, torch::Tensor K, torch::Tensor V);
4 |
5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6 | m.def("forward", torch::wrap_pybind_function(forward), "forward");
7 | }
--------------------------------------------------------------------------------
/day09/test.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 | from torch.utils.cpp_extension import load
6 |
7 | print("LOADING FLASH ATTENTION")
8 | minimal_attn = load(name='minimal_attn', sources=['bind.cpp', 'flashAttentionFromTut.cu'], extra_cuda_cflags=['-O2'])
9 | print("LOADED FLASH ATTENTION")
10 |
11 | batch_size = 16
12 | n_head = 12
13 | seq_len = 64
14 | head_embd = 64
15 |
16 | q = torch.randn(batch_size, n_head, seq_len, head_embd).cuda()
17 | k = torch.randn(batch_size, n_head, seq_len, head_embd).cuda()
18 | v = torch.randn(batch_size, n_head, seq_len, head_embd).cuda()
19 |
20 | print('=== profiling manual attention ===')
21 |
22 | # Our minimal flash attention aims to be faster than this by avoiding HBM read/writes of N^2 matrices.
23 | def manual_attn(q, k, v):
24 | att = (q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.size(-1))))
25 | att = F.softmax(att, dim=-1)
26 | y = att @ v
27 | return y
28 |
29 | with torch.autograd.profiler.profile(use_cuda=True) as prof:
30 | manual_result = manual_attn(q, k, v)
31 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))
32 |
33 | print('=== profiling minimal flash attention === ')
34 |
35 | with torch.autograd.profiler.profile(use_cuda=True) as prof:
36 | minimal_result = minimal_attn.forward(q, k, v)
37 | print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))
38 |
39 | print('attn values sanity check:', torch.allclose(minimal_result, manual_result, rtol=0, atol=1e-02))
--------------------------------------------------------------------------------
/day10/FlashAttention.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 |
4 | void FlashAttention(torch::Tensor &Q,
5 | torch::Tensor &K,
6 | torch::Tensor &V,
7 | torch::Tensor &O,
8 | torch::Tensor &m,
9 | torch::Tensor &l,
10 | const int seq_len,
11 | const int head_dim,
12 | int Tc, int Tr, int Bc, int Br);
13 |
14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
15 | m.def("FlashAttention", &FlashAttention, "FlashAttention forward");
16 | }
--------------------------------------------------------------------------------
/day10/linking/simpleKernel.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include "ATen/ATen.h"
3 |
4 | void cuda_simpleKernel(float *A);
5 |
6 | void simpleKernel(at::Tensor A) {
7 | cuda_simpleKernel(A.data_ptr<float>());
8 | }
9 |
10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
11 | m.def("simplekernel", &simpleKernel, "A simple kernel (CUDA)");
12 | }
13 |
--------------------------------------------------------------------------------
/day10/linking/simpleKernel.cu:
--------------------------------------------------------------------------------
1 | #include <cuda.h>
2 | #include <cuda_runtime.h>
3 | #include "ATen/ATen.h"
4 |
5 | template <typename T>
6 | __global__ void simpleKernel(T* A) {
7 | A[threadIdx.x] += 100;
8 | }
9 |
10 | void cuda_simpleKernel(float *A ) {
11 | dim3 blocks(1);
12 | simpleKernel<<<blocks, 32>>>(A); // 32 threads, matching the 32-element tensor in test.py
13 | }
--------------------------------------------------------------------------------
/day10/linking/test.py:
--------------------------------------------------------------------------------
1 | from torch.utils.cpp_extension import load
2 |
3 | simplekernel = load(
4 | name='simplekernel',
5 | sources=['simpleKernel.cpp', 'simpleKernel.cu'],
6 | verbose=True
7 | )
8 |
9 | # Test kernel
10 | import torch
11 | A = torch.zeros(32, device='cuda', dtype=torch.float32)
12 | simplekernel.simplekernel(A)
13 | print(A)
14 |
--------------------------------------------------------------------------------
/day10/ppmbook/matrixmul.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | __global__ void matrixmulkernel(float *M,float *N,float *P,int width)
4 | {
5 | int row = blockIdx.y * blockDim.y + threadIdx.y;
6 | int col = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if(row < width && col < width){
--------------------------------------------------------------------------------
/day100/delta.cu:
--------------------------------------------------------------------------------
1 | #include <mma.h>
2 | #include <cuda_fp16.h>
3 | using namespace nvcuda;
4 |
5 | // Block layout: one block per (batch, head)
6 | template <int D>
7 | __global__ void delta_net_attention(
8 | const half* __restrict__ K, // [B, S, D]
9 | const half* __restrict__ V, // [B, S, D]
10 | const half* __restrict__ Q, // [B, S, D]
11 | half* __restrict__ O, // [B, S, D]
12 | int batch, int seq_len)
13 | {
14 | extern __shared__ half shared_mem[]; // size = D*D
15 | half* S = shared_mem; // state matrix S
16 | int b = blockIdx.x; // batch index
17 |
18 | // Initialize S to zero
19 | for (int idx = threadIdx.x; idx < D*D; idx += blockDim.x) {
20 | S[idx] = __float2half(0.0f);
21 | }
22 | __syncthreads();
23 |
24 | // Loop over sequence length
25 | for (int t = 0; t < seq_len; ++t) {
26 | // Load k_t and v_t into registers
27 | half k_vec[D], v_vec[D];
28 | #pragma unroll
29 | for (int i = threadIdx.x; i < D; i += blockDim.x) {
30 | int base = (b*seq_len + t)*D;
31 | k_vec[i] = K[base + i];
32 | v_vec[i] = V[base + i];
33 | }
34 | __syncthreads();
35 |
36 | // S += v_vec * k_vec^T — outer-product update
37 | for (int i = threadIdx.y; i < D; i += blockDim.y) {
38 | #pragma unroll
39 | for (int j = threadIdx.x; j < D; j += blockDim.x) {
40 | int idx = i*D + j;
41 | float s = __half2float(S[idx]);
42 | s += __half2float(v_vec[i]) * __half2float(k_vec[j]);
43 | S[idx] = __float2half(s);
44 | }
45 | }
46 | __syncthreads();
47 |
48 | // Load q_t and compute o_t = S * q_vec
49 | half q_vec[D];
50 | #pragma unroll
51 | for (int i = threadIdx.x; i < D; i += blockDim.x) {
52 | int base = (b*seq_len + t)*D;
53 | q_vec[i] = Q[base + i];
54 | }
55 | __syncthreads();
56 |
57 | #pragma unroll
58 | for (int i = threadIdx.x; i < D; i += blockDim.x) {
59 | float o = 0.0f;
60 | #pragma unroll
61 | for (int j = 0; j < D; ++j) {
62 | o += __half2float(S[i*D + j]) * __half2float(q_vec[j]);
63 | }
64 | int out_idx = (b*seq_len + t)*D + i;
65 | O[out_idx] = __float2half(o);
66 | }
67 | __syncthreads();
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
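What the time loop in delta.cu above computes, written out: each block keeps a D x D state matrix S in shared memory and, for every step t of the sequence, applies the linear-attention style update S_t = S_{t-1} + v_t k_t^T (the outer-product accumulation), followed by the read-out o_t = S_t q_t, which is written back to O for that (batch, time) position.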
/day11/FlashTestPytorch/binding.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include "ATen/ATen.h"
3 |
4 | void CudaFlashAttention(const float *Q,
5 | const float *K,
6 | const float *V,
7 | float *O,
8 | float *m,
9 | float *l,
10 | const int seq_len,
11 | const int head_dim,
12 | const int batch_size,
13 | const int nr_heads);
14 |
15 | torch::Tensor FlashAttention(torch::Tensor Q,
16 | torch::Tensor K,
17 | torch::Tensor V)
18 | {
19 | int batch_size = Q.size(0);
20 | int nr_heads = Q.size(1);
21 | int seq_len = Q.size(2);
22 | int head_dim = Q.size(3);
23 |
24 | torch::Tensor m = torch::full({batch_size, nr_heads, seq_len},
25 | -std::numeric_limits<float>::infinity(),Q.options());
26 | torch::Tensor l = torch::zeros({batch_size, nr_heads, seq_len},Q.options());
27 |
28 | torch::Tensor O = torch::zeros_like(Q);
29 | CudaFlashAttention(Q.data_ptr<float>(), K.data_ptr<float>(), V.data_ptr<float>(), O.data_ptr<float>(), m.data_ptr<float>(), l.data_ptr<float>(), seq_len, head_dim, batch_size, nr_heads);
30 | return O;
31 | }
32 |
33 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
34 | {
35 | m.def("FlashAttention", &FlashAttention, "FlashAttention (CUDA)");
36 | }
--------------------------------------------------------------------------------
/day11/FlashTestPytorch/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.cpp_extension import load
3 | import time
4 |
5 | # Load the custom CUDA extension
6 | sources = ["binding.cpp", "FlashAttention.cu"]
7 | flash_attention = load("flash_attention", sources=sources, verbose=True)
8 | print("Custom CUDA extension loaded.")
9 |
10 | def manual_attention(Q, K, V):
11 | batch_size, num_heads, seq_len, head_dim = Q.shape
12 |
13 | attn_scores = torch.matmul(Q, K.transpose(-2, -1)) # [batch, heads, seq_len, seq_len]
14 | scale = 1.0 / (head_dim ** 0.5)
15 | attn_scores = attn_scores * scale
16 | attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1)
17 | output = torch.matmul(attn_weights, V) # [batch, heads, seq_len, head_dim]
18 | return output
19 | def test_flash_attention():
20 | batch_size = 2
21 | num_heads = 4
22 | seq_len = 128
23 | head_dim = 64
24 |
25 | # Create random input tensors
26 | Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device='cuda')
27 | K = torch.randn_like(Q)
28 | V = torch.randn_like(Q)
29 |
30 | # Warmup runs
31 | for _ in range(3):
32 | _ = flash_attention.FlashAttention(Q, K, V)
33 | _ = manual_attention(Q, K, V)
34 |
35 | # Benchmark custom FlashAttention
36 | custom_times = []
37 | for _ in range(100):
38 | torch.cuda.synchronize()
39 | start = time.time()
40 | _ = flash_attention.FlashAttention(Q, K, V)
41 | torch.cuda.synchronize()
42 | custom_times.append(time.time() - start)
43 |
44 | # Benchmark manual attention
45 | manual_times = []
46 | for _ in range(100):
47 | torch.cuda.synchronize()
48 | start = time.time()
49 | _ = manual_attention(Q, K, V)
50 | torch.cuda.synchronize()
51 | manual_times.append(time.time() - start)
52 |
53 | # Get fastest iterations
54 | fastest_custom = min(custom_times) * 1000 # Convert to milliseconds
55 | fastest_manual = min(manual_times) * 1000
56 |
57 | # Print performance results
58 | print("\nPerformance results (fastest iteration):")
59 | print(f"Custom FlashAttention: {fastest_custom:.2f} ms")
60 | print(f"Manual PyTorch attention: {fastest_manual:.2f} ms")
61 | print(f"Speedup factor: {fastest_manual / fastest_custom:.2f}x")
62 |
63 | if __name__ == "__main__":
64 | test_flash_attention()
--------------------------------------------------------------------------------
/day11/LeakyReLU.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void leakyreluKernel(float*input,float*output,float slope,int N){
5 | int index = blockDim.x * blockIdx.x + threadIdx.x;
6 | if(index < N)
7 | output[index] = input[index] < 0 ? input[index]*slope : input[index];
8 | }
9 |
10 | void CudaLeakyReLU(float *A,float*B,float slope ,int N){
11 | int ThreadsPerBlock = 256;
12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
13 | leakyreluKernel<<<BlocksPerGrid, ThreadsPerBlock>>>(A, B, slope, N);
14 | }
--------------------------------------------------------------------------------
/day11/ReLU.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void reluKernel(float*input,float*output,int N){
5 | int index = blockDim.x * blockIdx.x + threadIdx.x;
6 | if(index < N)
7 | output[index] = input[index] < 0 ? 0 : input[index];
8 | }
9 |
10 | void CudaReLU(float *A,float*B, int N){
11 | int ThreadsPerBlock = 256;
12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
13 | reluKernel<<<BlocksPerGrid, ThreadsPerBlock>>>(A, B, N);
14 | }
15 |
16 | //=========================
17 |
18 | __global__ void reluKernelBackward(float *input, float *grad_input, float *grad_output, int N){
19 | int index = blockDim.x * blockIdx.x + threadIdx.x;
20 | if(index < N)
21 | grad_input[index] = input[index] < 0 ? 0 : grad_output[index];
22 | }
23 |
24 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N){
25 | int ThreadsPerBlock = 256;
26 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
27 | reluKernelBackward<<<BlocksPerGrid, ThreadsPerBlock>>>(A, Gi, Go, N);
28 | }
--------------------------------------------------------------------------------
/day11/SoftMax.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 | inline int prevPow2(int n) {
4 | if (n == 0) return 0;
5 | int prev = 1;
6 | while (prev <= n/2) {
7 | prev *= 2;
8 | }
9 | return prev;
10 | }
11 |
12 | __global__ void softmaxKernel(float *input, float *output, int Dim) {
13 | int batch_idx = blockIdx.x; // Current batch index
14 | int tid = threadIdx.x; // Thread index within the block
15 |
16 | extern __shared__ float shared_data[];
17 | float max_val = -INFINITY;
18 | for (int i = tid; i < Dim; i += blockDim.x) {
19 | max_val = fmaxf(max_val, input[batch_idx * Dim + i]);
20 | }
21 |
22 | shared_data[tid] = max_val;
23 | __syncthreads();
24 |
25 | // Reduction for max_val
26 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
27 | if (tid < stride) {
28 | shared_data[tid] = fmaxf(shared_data[tid], shared_data[tid + stride]);
29 | }
30 | __syncthreads();
31 | }
32 | max_val = shared_data[0];
33 |
34 | float sum_exp = 0.0f;
35 | for (int i = tid; i < Dim; i += blockDim.x) {
36 | output[batch_idx * Dim + i] = expf(input[batch_idx * Dim + i] - max_val);
37 | sum_exp += output[batch_idx * Dim + i];
38 | }
39 |
40 | shared_data[tid] = sum_exp;
41 | __syncthreads();
42 |
43 | // Reduction for sum_exp
44 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
45 | if (tid < stride) {
46 | shared_data[tid] += shared_data[tid + stride];
47 | }
48 | __syncthreads();
49 | }
50 | sum_exp = shared_data[0];
51 |
52 | for (int i = tid; i < Dim; i += blockDim.x) {
53 | output[batch_idx * Dim + i] /= sum_exp;
54 | }
55 | }
56 |
57 |
58 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) {
59 | int max_threads = min(512, Dim);
60 | int threads = prevPow2(max_threads);
61 | if (threads == 0) threads = 1; // Ensure at least 1 thread
62 | size_t shared_mem_size = threads * sizeof(float);
63 | softmaxKernel<<<BatchSize, threads, shared_mem_size>>>(input, output, Dim);
64 | cudaDeviceSynchronize(); // Ensure kernel completion
65 | }
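The kernel above implements a numerically stable row-wise softmax: for a row x of length Dim it computes softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j), where subtracting the row maximum before exponentiating avoids overflow; the two shared-memory reductions compute the row maximum and the sum of exponentials.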
--------------------------------------------------------------------------------
/day11/TanH.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void tanhKernel(float*input,float*output,int N){
5 | int index = blockDim.x * blockIdx.x + threadIdx.x;
6 | if(index < N)
7 | output[index] = tanhf(input[index]);
8 | }
9 |
10 | void CudaTanH(float *A,float*B, int N){
11 | int ThreadsPerBlock = 256;
12 | int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
13 | tanhKernel<<<BlocksPerGrid, ThreadsPerBlock>>>(A, B, N);
14 | }
15 |
--------------------------------------------------------------------------------
/day11/binding.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include "ATen/ATen.h"
3 |
4 |
5 | void CudaLeakyReLU(float *A,float*B,float slope ,int N);
6 | torch::Tensor LeakyReLU(torch::Tensor A, float slope){
7 | torch::Tensor B = torch::empty_like(A);
8 | int N = A.numel();
9 | CudaLeakyReLU(A.data_ptr<float>(), B.data_ptr<float>(), slope, N);
10 | return B;
11 | }
12 |
13 | void CudaReLU(float *A,float*B, int N);
14 | torch::Tensor ReLU(torch::Tensor A){
15 | torch::Tensor B = torch::empty_like(A);
16 | int N = A.numel();
17 | CudaReLU(A.data_ptr<float>(), B.data_ptr<float>(), N);
18 | return B;
19 | }
20 |
21 | void CudaReLUBackward(float *A, float *Gi, float *Go, int N);
22 | torch::Tensor ReLUBackward(torch::Tensor A, torch::Tensor Go){
23 | torch::Tensor Gi = torch::empty_like(A);
24 | int N = A.numel();
25 | CudaReLUBackward(A.data_ptr<float>(), Gi.data_ptr<float>(), Go.data_ptr<float>(), N);
26 | return Gi;
27 | }
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | void CudaSoftmax(float *input, float *output, int BatchSize, int Dim) ;
39 | torch::Tensor Softmax(torch::Tensor input) {
40 | int BatchSize = input.size(0);
41 | int Dim = input.size(1);
42 | torch::Tensor output = torch::empty_like(input);
43 | CudaSoftmax(input.data_ptr<float>(), output.data_ptr<float>(), BatchSize, Dim);
44 | return output;
45 | }
46 |
47 | void CudaTanH(float *A,float*B, int N);
48 | torch::Tensor TanH(torch::Tensor A){
49 | torch::Tensor B = torch::empty_like(A);
50 | int N = A.numel();
51 | CudaTanH(A.data_ptr<float>(), B.data_ptr<float>(), N);
52 | return B;
53 | }
54 |
55 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
56 | m.def("LeakyReLU", &LeakyReLU, "LeakyReLU (CUDA)");
57 | m.def("ReLU", &ReLU, "ReLU (CUDA)");
58 | m.def("ReLUBackward", &ReLUBackward, "ReLU (CUDA)");
59 | m.def("Softmax", &Softmax, "Softmax (CUDA)");
60 | m.def("TanH", &TanH, "TanH (CUDA)");
61 | }
--------------------------------------------------------------------------------
/day11/testbackward.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.cpp_extension import load
3 |
4 | sources = ["binding.cpp", "ReLU.cu", "SoftMax.cu", "LeakyReLU.cu", "TanH.cu"]
5 | functions = load("functions", sources=sources, verbose=True)
6 |
7 | class CustomReLU(torch.autograd.Function):
8 | @staticmethod
9 | def forward(ctx, input):
10 | ctx.save_for_backward(input)
11 | return functions.ReLU(input)
12 |
13 | @staticmethod
14 | def backward(ctx, grad_output):
15 | input, = ctx.saved_tensors
16 | return functions.ReLUBackward(input, grad_output)
17 |
18 | x = torch.tensor([-1.0, -1.0, -1.0, -2.0], device='cuda', requires_grad=True)
19 |
20 | relu = CustomReLU.apply
21 |
22 | y_custom = relu(x)
23 | y_custom.sum().backward()
24 | grad_custom = x.grad.clone()
25 |
26 | x.grad.zero_()
27 | y_pytorch = torch.nn.functional.relu(x)
28 | y_pytorch.sum().backward()
29 | grad_pytorch = x.grad.clone()
30 |
31 | # Compare the gradients
32 | print("Custom ReLU Gradient:", grad_custom)
33 | print("PyTorch ReLU Gradient:", grad_pytorch)
34 |
35 | if torch.allclose(grad_custom, grad_pytorch, atol=1e-6):
36 | print("Gradients match!")
37 | else:
38 | print("Gradients do not match!")
39 |
--------------------------------------------------------------------------------
/day12/tileMatrix.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 | #define TILE_WIDTH 32
4 |
5 | __global__ void tileKernel(const float *dM,const float *dN,float *dP,const int Width){
6 | __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
7 | __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
8 |
9 | int bx = blockIdx.x;
10 | int by = blockIdx.y;
11 | int tx = threadIdx.x;
12 | int ty = threadIdx.y;
13 |
14 | int row = by * TILE_WIDTH + ty;
15 | int col = bx * TILE_WIDTH + tx;
16 |
17 | float Pvalue = 0;
18 | for(int i = 0 ; i < Width/TILE_WIDTH ; ++i){
19 | Mds[ty][tx] = dM[row*Width + i*TILE_WIDTH + tx];
20 | Nds[ty][tx] = dN[(i*TILE_WIDTH + ty)*Width + col];
21 | __syncthreads();
22 |
23 | for(int k = 0; k < TILE_WIDTH; ++k){
24 | Pvalue += Mds[ty][k] * Nds[k][tx];
25 | }
26 | __syncthreads();
27 | }
28 | dP[row*Width + col] = Pvalue;
29 | }
--------------------------------------------------------------------------------
/day13/RMS.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 | // Define the CEILING macro
4 | #define CEILING(x, y) (((x) + (y) - 1) / (y))
5 |
6 | #define blockdimy 128
7 |
8 | __global__ void RMSKernel1_V1(float *input, float *output, const int w, const int h)
9 | {
10 | int col = blockIdx.x * blockDim.x + threadIdx.x;
11 | int row = blockIdx.y * blockDim.y + threadIdx.y;
12 |
13 | if (row < h && col < w)
14 | {
15 | float sum = 0;
16 | for (int i = 0; i < w; ++i)
17 | {
18 | sum += input[row * w + i] * input[row * w + i];
19 | }
20 | sum = sqrt((float)1 / w * sum);
21 |
22 | output[row * w + col] = input[row * w + col] / sum;
23 | }
24 | }
25 |
26 |
27 | void RMSV1(float *input, float *output, int w, int h)
28 | {
29 |
30 | dim3 block_size = dim3(32, 32);
31 | dim3 grid_size = dim3(CEILING(w, 32), CEILING(h, 32));
32 | RMSKernel1_V1<<<grid_size, block_size>>>(input, output, w, h);
33 | }
34 |
35 |
36 |
37 |
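Both RMS kernels in this folder compute row-wise RMS normalization: for a row x of width w, y_j = x_j / sqrt((1/w) * sum_k x_k^2).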
--------------------------------------------------------------------------------
/day13/RMSBetter.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 | #define CEILING(x, y) (((x) + (y) - 1) / (y))
4 |
5 | #define blockdimy 128
6 |
7 | __device__ float warpReduceSum(float val)
8 | {
9 | for (int offset = 16; offset > 0; offset /= 2)
10 | {
11 | val += __shfl_down_sync(0xffffffff, val, offset, 32);
12 | }
13 | return val;
14 | }
15 |
16 | __global__ void RMSKernel_V2(float *input, float *output, const int w, const int h)
17 | {
18 | int row = blockIdx.x * blockDim.x + threadIdx.x;
19 | int col = blockIdx.y * blockDim.y + threadIdx.y;
20 |
21 | __shared__ float shared_data[32];
22 |
23 | float sum = 0.0f;
24 |
25 | if (row < h && col < w)
26 | {
27 | float4 val = reinterpret_cast<float4*>(&input[row * w + col * 4])[0];
28 | sum += val.x * val.x + val.y * val.y + val.z * val.z + val.w * val.w;
29 | }
30 | __syncthreads();
31 |
32 | sum = warpReduceSum(sum);
33 |
34 | __syncthreads();
35 |
36 | if (threadIdx.x % 32 == 0)
37 | {
38 | shared_data[threadIdx.x / 32] = sum;
39 | }
40 |
41 | __syncthreads();
42 |
43 | if (threadIdx.x == 0)
44 | {
45 | float final_sum = 0.0f;
46 | for (int i = 0; i < blockDim.x / 32; ++i)
47 | {
48 | final_sum += shared_data[i];
49 | }
50 | output[row] = input[row] / sqrt(final_sum / float(w));
51 | }
52 | }
53 |
54 | void RMSV2(float *input, float *output, int w, int h)
55 | {
56 | dim3 block_size = dim3(1, 32, 1);
57 | dim3 grid_size = dim3(h, 1, 1);
58 | RMSKernel_V2<<<grid_size, block_size>>>(input, output, w, h);
59 | }
60 |
--------------------------------------------------------------------------------
/day13/binding.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include "ATen/ATen.h"
3 |
4 | void RMSV1(float *input, float *output, int w, int h);
5 |
6 | torch::Tensor RMS_V1(torch::Tensor input)
7 | {
8 | auto out = torch::empty_like(input);
9 | int h = input.size(0);
10 | int w = input.size(1);
11 | RMSV1(input.data_ptr<float>(), out.data_ptr<float>(), w, h);
12 | return out;
13 | }
14 |
15 | void RMSV2(float *input, float *output, int w, int h);
16 | torch::Tensor RMS_V2(torch::Tensor input)
17 | {
18 | auto out = torch::empty_like(input);
19 | int h = input.size(0);
20 | int w = input.size(1);
21 | RMSV2(input.data_ptr<float>(), out.data_ptr<float>(), w, h);
22 | return out;
23 | }
24 |
25 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
26 | {
27 | m.def("RMSV1", &RMS_V1, "RMSV1 (CUDA)");
28 | m.def("RMSV2", &RMS_V2, "RMSV2 (CUDA)");
29 | }
--------------------------------------------------------------------------------
/day13/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.cpp_extension import load
3 | import time
4 | from liger_kernel.ops import rms_norm  # NOTE: shadowed by the rms_norm defined below, so the "Liger kernel" timing actually measures the PyTorch reference
5 |
6 | def rms_norm(tensor):
7 | return tensor / torch.sqrt(torch.mean(tensor ** 2))
8 |
9 | sources = ["binding.cpp", "RMS.cu", "RMSBetter.cu"]
10 | RMS = load("RMS", sources=sources, verbose=True)
11 | print("Custom CUDA extension loaded.")
12 |
13 | tensor_sizes = [(1024, 1024), (2048, 2048), (4096, 4096), (8192, 8192)]
14 |
15 | for tensor_size in tensor_sizes:
16 | print("=" * 50)
17 | print("Input Size: ", tensor_size)
18 | print("=" * 50)
19 | input_tensor = torch.randn(tensor_size, device='cuda')
20 |
21 | # PyTorch RMS time and result
22 | pytorch_time = 0
23 | result_pytorch = None
24 | for _ in range(5):
25 | start_time = time.time()
26 | result_pytorch = rms_norm(input_tensor)
27 | pytorch_time += time.time() - start_time
28 | print(f"PyTorch RMS time: {pytorch_time / 6:.6f} seconds")
29 |
30 | # Custom kernel time and result
31 | custom_time = 0
32 | result_custom = None
33 | for _ in range(5):
34 | start_time = time.time()
35 | result_custom = RMS.RMSV2(input_tensor)
36 | custom_time += time.time() - start_time
37 | print(f"Custom kernel time: {custom_time / 6:.6f} seconds")
38 |
39 | # Liger kernel time and result
40 | liger_time = 0
41 | result_liger = None
42 | for _ in range(5):
43 | start_time = time.time()
44 | result_liger = rms_norm(input_tensor)
45 | liger_time += time.time() - start_time
46 | print(f"Liger kernel time: {liger_time / 6:.6f} seconds")
47 |
48 | # Checking if the results are the same
49 | pytorch_custom_diff = torch.max(torch.abs(result_pytorch - result_custom))
50 | pytorch_liger_diff = torch.max(torch.abs(result_pytorch - result_liger))
51 |
52 | print(f"Max difference between PyTorch and Custom kernel: {pytorch_custom_diff.item():.6f}")
53 | print(f"Max difference between PyTorch and Liger kernel: {pytorch_liger_diff.item():.6f}")
54 |
55 | # Check if they are numerically close (within tolerance)
56 | are_pytorch_custom_close = torch.allclose(result_pytorch, result_custom, atol=1) # You can adjust the tolerance
57 | are_pytorch_liger_close = torch.allclose(result_pytorch, result_liger, atol=1) # You can adjust the tolerance
58 |
59 | if are_pytorch_custom_close:
60 | print("PyTorch and Custom kernel results are the same!")
61 | else:
62 | print("PyTorch and Custom kernel results are different.")
63 |
64 | if are_pytorch_liger_close:
65 | print("PyTorch and Liger kernel results are the same!")
66 | else:
67 | print("PyTorch and Liger kernel results are different.")
68 |
69 | print("=" * 50 + "\n")
70 |
--------------------------------------------------------------------------------
/day14/FA2/helper.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <math.h>
3 | #include "helper.cuh"
4 |
5 | __device__ float warpReduceMax(float val) {
6 | for (int offset = 16; offset > 0; offset /= 2) {
7 | val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
8 | }
9 | return val;
10 | }
--------------------------------------------------------------------------------
/day14/FA2/helper.cuh:
--------------------------------------------------------------------------------
1 | #ifndef HELPER_CUH
2 | #define HELPER_CUH
3 |
4 | __device__ float warpReduceMax(float val);
5 |
6 | #endif
--------------------------------------------------------------------------------
/day14/FA2/kernels.cuh:
--------------------------------------------------------------------------------
1 | #ifndef KERNELS_CUH
2 | #define KERNELS_CUH
3 |
4 | __global__ void computeDKernel(const float* dO, const float* O, float* D, int N, int d);
5 |
6 | __global__ void computeSiKernel(const float* Qi, const float* Kj, float* Si, int Br, int Bc, int d, float scale);
7 |
8 | __global__ void findRowMaxSiKernel(float* Si, float* maxSi, int Br, int Bc);
9 |
10 | __global__ void computeSoftmaxKernel(float* Si, float* softmaxSi, int Br, int Bc);
11 |
12 | __global__ void computeAttentionKernel(const float* Q, const float* K, const float* V, float* attention, int N, int d);
13 |
14 | __global__ void computeQKernel(const float* Q, const float* dO, float* dQ, int N, int d);
15 |
16 | __global__ void computeKKernel(const float* K, const float* dO, float* dK, int N, int d);
17 |
18 | __global__ void computeVKernel(const float* V, const float* dO, float* dV, int N, int d);
19 |
20 | __global__ void computeGradientsKernel(const float* dO, float* dQ, float* dK, float* dV, int N, int d);
21 |
22 | #endif
--------------------------------------------------------------------------------
/day14/FlashAttention2/kernel.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <math.h>
3 |
4 | __global__ void backwardKernel(float *Q, float *K, float *V, float *O,
5 | float *dQ, float *dK, float *dV, float *dO,
6 | float *L, int Bc, int Br,
7 | int batch_size, int N, int nr_heads, int d)
8 | {
9 | int Tr = (N + Br - 1) / Br;
10 | int Tc = (N + Bc - 1) / Bc;
11 |
12 | // Q1 - > size of Br* d size in shared memory
13 | // O1 - > size of Br* d size in shared memory
14 |
15 | // K1 - > size of Bc *d size in shared memory
16 | // V1 - > size of Bc *d size in shared memory
17 |
18 | // L - > size of Br each
19 |
20 | int row = blockIdx.y * blockDim.y + threadIdx.y;
21 | int col = blockIdx.x * blockDim.x + threadIdx.x;
22 |
23 |
24 | }
--------------------------------------------------------------------------------
/day14/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day14/cat.jpg
--------------------------------------------------------------------------------
/day15/SMM.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdlib>
3 | #include <cuda_runtime.h>
4 |
5 | __global__ void spmv_csr_kernel(int num_rows, const float *values, const int *column_indices, const int *row_offsets, const float *x, float *y) {
6 | int row = blockIdx.x * blockDim.x + threadIdx.x;
7 | if (row < num_rows) {
8 | float dot = 0;
9 | for (int i = row_offsets[row]; i < row_offsets[row + 1]; i++) {
10 | dot += values[i] * x[column_indices[i]];
11 | }
12 | y[row] = dot;
13 | }
14 | }
15 |
16 | void spmv_csr(int num_rows, int nnz, float *h_values, int *h_column_indices, int *h_row_offsets, float *h_x, float *h_y) {
17 | float *d_values;
18 | float*d_x;
19 | float *d_y;
20 | int *d_column_indices;
21 | int *d_row_offsets;
22 |
23 | cudaMalloc(&d_values, nnz * sizeof(float));
24 | cudaMalloc(&d_column_indices, nnz * sizeof(int));
25 | cudaMalloc(&d_row_offsets, (num_rows + 1) * sizeof(int));
26 | cudaMalloc(&d_x, num_rows * sizeof(float));
27 | cudaMalloc(&d_y, num_rows * sizeof(float));
28 |
29 | cudaMemcpy(d_values, h_values, nnz * sizeof(float), cudaMemcpyHostToDevice);
30 | cudaMemcpy(d_column_indices, h_column_indices, nnz * sizeof(int), cudaMemcpyHostToDevice);
31 | cudaMemcpy(d_row_offsets, h_row_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
32 | cudaMemcpy(d_x, h_x, num_rows * sizeof(float), cudaMemcpyHostToDevice);
33 |
34 | int blockSize = 256;
35 | int gridSize = (num_rows + blockSize - 1) / blockSize;
36 | spmv_csr_kernel<<<gridSize, blockSize>>>(num_rows, d_values, d_column_indices, d_row_offsets, d_x, d_y);
37 |
38 | cudaMemcpy(h_y, d_y, num_rows * sizeof(float), cudaMemcpyDeviceToHost);
39 |
40 | cudaFree(d_values);
41 | cudaFree(d_column_indices);
42 | cudaFree(d_row_offsets);
43 | cudaFree(d_x);
44 | cudaFree(d_y);
45 | }
46 |
47 | int main() {
48 | int num_rows = 3;
49 | int nnz = 4;
50 | float values[] = {1, 2, 3, 4};
51 | int column_indices[] = {0, 2, 1, 2};
52 | int row_offsets[] = {0, 1, 3, 4};
53 | float x[] = {1, 2, 3};
54 | float y[3] = {0};
55 |
56 | spmv_csr(num_rows, nnz, values, column_indices, row_offsets, x, y);
57 |
58 | std::cout << "Rezultat SpMV: ";
59 | for (int i = 0; i < num_rows; i++) {
60 | std::cout << y[i] << " ";
61 | }
62 | std::cout << std::endl;
63 | return 0;
64 | }
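Worked check for the hard-coded example above: the CSR data describes the dense matrix [[1,0,0],[0,3,2],[0,0,4]] (row 1 stores 2 at column 2 and 3 at column 1), so with x = (1, 2, 3) the program should print y = (1, 12, 12).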
--------------------------------------------------------------------------------
/day16/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | # Set the seed for reproducibility
5 | torch.manual_seed(42)
6 |
7 | # Define the dimensions
8 | seq_len = 4
9 | dim = 4
10 |
11 | # Initialize the tensors
12 | Q = torch.full((seq_len, dim), 2.0, requires_grad=True)
13 | K = torch.full((seq_len, dim), 2.0, requires_grad=True)
14 | V = torch.full((seq_len, dim), 2.0, requires_grad=True)
15 | # Forward pass
16 | scores = torch.matmul(Q, K.transpose(-2, -1)) / (dim ** 0.5)
17 | P = F.softmax(scores, dim=-1)
18 | O = torch.matmul(P, V)
19 |
20 | # Create a dummy gradient for the output
21 | dO = torch.ones_like(O)
22 |
23 | # Backward pass
24 | O.backward(dO)
25 |
26 |
27 | print("PyTorch O:")
28 | print(O)
29 | # Print the gradients
30 | print("PyTorch dQ:")
31 | print(Q.grad)
32 | print("PyTorch dK:")
33 | print(K.grad)
34 | print("PyTorch dV:")
35 | print(V.grad)
--------------------------------------------------------------------------------
/day17/cublas1.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 | #include <cublas_v2.h>
4 |
5 |
6 | #define n 6
7 |
8 | int main(){
9 | cudaError_t cudaStat;
10 | cublasStatus_t stat;
11 | cublasHandle_t handle;
12 |
13 | int j;
14 | float *x;
15 | x = (float*)malloc(sizeof(float)*n);
16 | for(j = 0 ; j < n ; ++j)
--------------------------------------------------------------------------------
/day17/cublas2.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 | #include <cublas_v2.h>
4 |
5 | #define n 10
6 |
7 | int main()
8 | {
9 | cudaError_t cudaStat;
10 | cublasStatus_t stat;
11 | cublasHandle_t handle;
12 |
13 | int j;
14 | float *x, *y;
15 | x = (float *)malloc(sizeof(float) * n);
16 | y = (float *)malloc(sizeof(float) * n);
17 |
18 | for (j = 0; j < n; ++j)
19 | {
20 | x[j] = (float)j;
21 | y[j] = (float)j + 1;
22 | }
23 |
24 | printf("\nx:\n");
25 | for (j = 0; j < n; ++j)
26 | {
27 | printf("%f ", x[j]);
28 | }
29 |
30 | printf("\ny:\n");
31 | for (j = 0; j < n; ++j)
32 | {
33 | printf("%f ", y[j]);
34 | }
35 |
36 | float *d_x, *d_y;
37 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float));
38 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float));
39 |
40 | stat = cublasCreate(&handle);
41 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1);
42 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1);
43 | float a = 3.0;
44 |
45 | stat = cublasSaxpy(handle, n, &a, d_x, 1, d_y, 1);
46 | stat = cublasGetVector(n, sizeof(float), d_y, 1, y, 1);
47 |
48 | printf("\nNew y:\n");
49 | for (j = 0; j < n; ++j)
50 | {
51 | printf("%f ", y[j]);
52 | }
53 | cudaFree(d_y);
54 | cudaFree(d_x);
55 | cublasDestroy(handle);
56 | free(x);
57 | free(y);
58 | return 0;
59 | }
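Expected output check: cublasSaxpy computes y <- a*x + y, so with a = 3, x[j] = j and y[j] = j + 1 the updated vector is y[j] = 4j + 1, i.e. 1, 5, 9, ..., 37.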
--------------------------------------------------------------------------------
/day17/cublas3.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 | #include <cublas_v2.h>
4 |
5 | #define n 10
6 |
7 | int main()
8 | {
9 | cudaError_t cudaStat;
10 | cublasStatus_t stat;
11 | cublasHandle_t handle;
12 |
13 | int j;
14 | float *x, *y;
15 | x = (float *)malloc(sizeof(float) * n);
16 | y = (float *)malloc(sizeof(float) * n);
17 | for (j = 0; j < n; ++j)
18 | {
19 | x[j] = (float)1;
20 | y[j] = (float)1;
21 | }
22 |
23 | printf("\nx:\n");
24 | for (j = 0; j < n; ++j)
25 | {
26 | printf("%f ", x[j]);
27 | }
28 |
29 | printf("\ny:\n");
30 | for (j = 0; j < n; ++j)
31 | {
32 | printf("%f ", y[j]);
33 | }
34 |
35 | float *d_x, *d_y;
36 | cudaStat = cudaMalloc((void **)&d_x, n * sizeof(float));
37 | cudaStat = cudaMalloc((void **)&d_y, n * sizeof(float));
38 |
39 | stat = cublasCreate(&handle);
40 | stat = cublasSetVector(n, sizeof(float), x, 1, d_x, 1);
41 | stat = cublasSetVector(n, sizeof(float), y, 1, d_y, 1);
42 | float a = 3.0;
43 |
44 | float result;
45 | stat = cublasSdot(handle, n, d_x, 1, d_y, 1, &result);
46 | printf("\ndot product x . y : \n ");
47 | printf (" %7.0f \n " , result );
48 |
49 |
50 | cudaFree(d_y);
51 | cudaFree(d_x);
52 | cublasDestroy(handle);
53 | free(x);
54 | free(y);
55 | return 0;
56 | }
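Expected output check: both vectors are filled with ones and n = 10, so the dot product printed by cublasSdot should be 10.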
--------------------------------------------------------------------------------
/day18/atomic1.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #define N 32
3 | #include <cuda_runtime.h>
4 | __device__ int lane_id() {
5 | return threadIdx.x & 31;
6 | }
7 |
8 | __device__ int atomicAggInc(int *ptr) {
9 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
10 | int leader = __ffs(mask) - 1;
11 | int res;
12 | if (lane_id() == leader)
13 | res = atomicAdd(ptr, __popc(mask));
14 | res = __shfl_sync(mask, res, leader);
15 | return res + __popc(mask & ((1 << lane_id()) - 1));
16 | }
17 |
18 | __global__ void test_atomicAggInc(int *d_ptr, int *d_results) {
19 | int old_val = atomicAggInc(d_ptr);
20 | d_results[threadIdx.x] = old_val;
21 | }
22 |
23 | int main() {
24 | int *d_ptr, *d_results;
25 | int h_ptr = 0;
26 | int h_results[N];
27 |
28 | cudaMalloc(&d_ptr, sizeof(int));
29 | cudaMalloc(&d_results, N * sizeof(int));
30 |
31 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice);
32 |
33 | test_atomicAggInc<<<1, N>>>(d_ptr, d_results);
34 |
35 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost);
36 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost);
37 |
38 | printf("Final value of ptr: %d\n", h_ptr);
39 | printf("Old values returned by each thread:\n");
40 | for (int i = 0; i < N; i++) {
41 | printf("Thread %2d -> %d\n", i, h_results[i]);
42 | }
43 |
44 | cudaFree(d_ptr);
45 | cudaFree(d_results);
46 |
47 | return 0;
48 | }
49 |
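Worked example: if all 32 threads of a warp call atomicAggInc on the same counter holding 0, the leader lane issues a single atomicAdd(ptr, 32) and lane k returns k, so the program prints 0..31 (one value per thread) and a final counter value of 32, the same result as 32 separate atomicAdd(ptr, 1) calls but with one atomic operation.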
--------------------------------------------------------------------------------
/day18/atomic2.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 |
4 | #define N 320
5 |
6 | __device__ int lane_id(){
7 | return threadIdx.x & 31;
8 | }
9 |
10 | // aggregated increment: warp threads targeting the same pointer add once via the leader
11 | __device__ int atomicIncrement(int * ptr){
12 | int mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
13 | int leader = __ffs(mask) -1;
14 | int res;
15 | if(lane_id() == leader){
16 | res = atomicAdd(ptr,__popc(mask)); // add on ptr number of active threads
17 | }
18 | res = __shfl_sync(mask, res, leader);
19 | return res;
20 | }
21 |
22 | __global__ void testatomicIncrement(int *d_ptr, int *d_results){
23 | int val = atomicIncrement(d_ptr);
24 | d_results[threadIdx.x] = val;
25 | }
26 |
27 |
28 |
29 | int main() {
30 | int *d_ptr, *d_results;
31 | int h_ptr = 100;
32 | int h_results[N];
33 |
34 | cudaMalloc(&d_ptr, sizeof(int));
35 | cudaMalloc(&d_results, N * sizeof(int));
36 |
37 | cudaMemcpy(d_ptr, &h_ptr, sizeof(int), cudaMemcpyHostToDevice);
38 |
39 | testatomicIncrement<<<1, N>>>(d_ptr, d_results);
40 |
41 | cudaMemcpy(&h_ptr, d_ptr, sizeof(int), cudaMemcpyDeviceToHost);
42 | cudaMemcpy(h_results, d_results, N * sizeof(int), cudaMemcpyDeviceToHost);
43 |
44 | printf("Final value of ptr: %d\n", h_ptr);
45 | printf("Old values returned by each thread:\n");
46 | for (int i = 0; i < N; i++) {
47 | printf("Thread %2d -> %d\n", i, h_results[i]);
48 | }
49 |
50 | cudaFree(d_ptr);
51 | cudaFree(d_results);
52 |
53 | return 0;
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/day18/wrap.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdlib>
3 | #include <cmath>
4 | #include <cuda_runtime.h>
5 |
6 | __device__ int lane_id() {
7 | return threadIdx.x & 31;
8 | }
9 |
10 | __device__ float reduceMax(float val) {
11 | // threads of 32 we perform reduction on them
12 | for (int offset = 16; offset > 0; offset /= 2) {
13 | float temp = __shfl_xor_sync(0xFFFFFFFF, val, offset);
14 | val = fmaxf(val, temp);
15 | }
16 | return val;
17 | }
18 |
19 | __device__ float atomicMaxFloat(float *addr, float value) {
20 | // reinterpret the float address as an int address for atomicCAS
21 | // and keep the old value around
22 | int *addr_as_int = (int*)addr;
23 | int old = *addr_as_int;
24 | int assumed;
25 | do {
26 | assumed = old;
27 | float old_val = __int_as_float(assumed);
28 | if (old_val >= value) {
29 | return old_val;
30 | }
31 | old = atomicCAS(addr_as_int, assumed, __float_as_int(fmaxf(old_val, value)));
32 | } while (assumed != old);
33 | return __int_as_float(old);
34 | }
35 |
36 | __global__ void MaxValue(float *data, float *max_value, int N) {
37 | int tx = threadIdx.x;
38 | int bx = blockIdx.x;
39 |
40 | extern __shared__ float reduction[];
41 |
42 | float block_max = -INFINITY;
43 |
44 | for (int i = bx * blockDim.x + tx; i < N; i += gridDim.x * blockDim.x) {
45 | block_max = fmaxf(block_max, data[i]);
46 | }
47 |
48 | block_max = reduceMax(block_max);
49 |
50 | reduction[tx] = block_max;
51 | __syncthreads();
52 |
53 | if (tx == 0) {
54 | float final_max = -INFINITY;
55 | for (int i = 0; i < blockDim.x; ++i) {
56 | final_max = fmaxf(final_max, reduction[i]);
57 | }
58 | atomicMaxFloat(max_value, final_max);
59 | }
60 | }
61 |
62 | int main() {
63 | int N = 1024;
64 | float *host_data = (float*)malloc(N * sizeof(float));
65 | float host_result = -INFINITY;
66 |
67 | for (int i = 0; i < N; ++i) {
68 | host_data[i] = rand()%10000;
69 | if (host_data[i] > host_result) {
70 | host_result = host_data[i];
71 | }
72 | }
73 |
74 | float *device_data, *device_result;
75 | cudaMalloc(&device_data, N * sizeof(float));
76 | cudaMalloc(&device_result, sizeof(float));
77 |
78 | cudaMemcpy(device_data, host_data, N * sizeof(float), cudaMemcpyHostToDevice);
79 | cudaMemcpy(device_result, &host_result, sizeof(float), cudaMemcpyHostToDevice);
80 |
81 | int threadsPerBlock = 256;
82 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
83 |
84 | MaxValue<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(float)>>>(device_data, device_result, N);
85 |
86 | cudaMemcpy(&host_result, device_result, sizeof(float), cudaMemcpyDeviceToHost);
87 |
88 | std::cout << "Max value: " << host_result << std::endl;
89 |
90 | free(host_data);
91 | cudaFree(device_data);
92 | cudaFree(device_result);
93 |
94 | return 0;
95 | }
--------------------------------------------------------------------------------
/day20/rope.cu:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <cuda_runtime.h>
3 | #include <cuda_fp16.h>
4 | #include <math.h>
5 |
6 | #define BLOCK_SIZE 256
7 | #define theta 10000.0f
8 | #define STRINGFY(str) #str
9 | #define TORCH_BINDING_COMMON_EXTENSION(func) \
10 | m.def(STRINGFY(func), &func, STRINGFY(func));
11 |
12 | __global__ void rope_kernel(float* x, float* out, int N){
13 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
14 | float4 x_v = reinterpret_cast<float4*>(&(x[idx * 4]))[0];
15 |
16 | int token_pos = idx / N;
17 | int token_idx = idx % N;
18 |
19 | float exp_f_v = 1.0f / powf(theta, token_idx * 2 / (N * 4));
20 | float exp_s_v = 1.0f / powf(theta, ((token_idx * 2) + 1) / (N * 4));
21 |
22 | float sin_f_v = sinf(token_pos / exp_f_v);
23 | float cos_f_v = cosf(token_pos / exp_f_v);
24 |
25 | float sin_s_v = sinf(token_pos / exp_s_v);
26 | float cos_s_v = cosf(token_pos / exp_s_v);
27 | float4 out_v;
28 |
29 | out_v.x = x_v.x * cos_f_v - x_v.y * sin_f_v;
30 | out_v.y = x_v.x * sin_f_v + x_v.y * cos_f_v;
31 | out_v.z = x_v.z * cos_s_v - x_v.w * sin_s_v;
32 | out_v.w = x_v.z * sin_s_v + x_v.w * cos_s_v;
33 |
34 | reinterpret_cast<float4*>(&(out[idx * 4]))[0] = out_v;
35 | }
36 |
37 | void rope(torch::Tensor x, torch::Tensor out) {
38 | int seq_len = x.size(0);
39 | int hidden_size = x.size(1);
40 |
41 | int N = (int)(hidden_size/4);
42 |
43 | dim3 grid((seq_len * N + BLOCK_SIZE - 1) / BLOCK_SIZE);
44 | dim3 block(BLOCK_SIZE);
45 |
46 | rope_kernel<<<grid, block>>>(x.data_ptr<float>(), out.data_ptr<float>(), N);
47 | }
48 |
49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
50 | TORCH_BINDING_COMMON_EXTENSION(rope)
51 | }
--------------------------------------------------------------------------------
/day20/test_rope.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | from torch.utils.cpp_extension import load
4 | print(torch.__version__)         # check the PyTorch version
5 | print(torch.cuda.is_available())  # if False, PyTorch does not see CUDA
6 | print(torch.version.cuda)         # check which CUDA version PyTorch detects
7 | lib = load(
8 | name="rope",
9 | sources=["rope.cu"],
10 | extra_cuda_cflags=[ "-O3",
11 | "--use_fast_math",
12 | ],
13 | extra_cflags=["-std=c++17"],
14 | )
15 |
16 | def benchmark(func, x, out=None, iters=20):
17 | torch.cuda.synchronize()
18 | start = time.time()
19 | for _ in range(iters):
20 | if out is not None:
21 | func(x, out)
22 | else:
23 | _ = func(x)
24 | torch.cuda.synchronize()
25 | return (time.time() - start) * 1000 / iters
26 |
27 | def naive_rope(x, theta=10000.0):
28 | dim = x.shape[-1]
29 | seq_len = x.shape[-2]
30 | x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
31 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim)).cuda()
32 | freqs = torch.outer(torch.arange(seq_len, device='cuda'), freqs)
33 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
34 | return torch.view_as_real(x_ * freqs_cis).flatten(1).type_as(x)
35 |
36 | sizes = [(4096, 512), (4096, 1024), (8192, 512), (8192, 1024)]
37 | for M, N in sizes:
38 | print(f"Testing M={M}, N={N}")
39 | x = torch.randn((M, N), device='cuda', dtype=torch.float32).contiguous()
40 | out = torch.zeros_like(x)
41 |
42 | t_naive = benchmark(naive_rope, x)
43 | naive_out = naive_rope(x)
44 |
45 | t_cuda = benchmark(lib.rope, x, out)
46 |
47 | # Compute the maximum absolute difference
48 | max_diff = torch.max(torch.abs(naive_out - out)).item()
49 |
50 | print(f"Naive: {t_naive:.4f}ms, CUDA f32: {t_cuda:.4f}ms")
51 | print(f"Max difference: {max_diff:.6f}")
52 | print("-" * 60)
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/day23/kernel.ptx:
--------------------------------------------------------------------------------
1 | .version 6.0
2 | .target sm_50
3 | .address_size 64
4 |
5 | .visible .entry vectorAdd(
6 | .param .u64 param_A,
7 | .param .u64 param_B,
8 | .param .u64 param_C,
9 | .param .u32 param_N
10 | )
11 | {
12 | .reg .pred %p<2>;
13 | .reg .s32 %r<6>;
14 | .reg .f32 %f<4>;
15 | .reg .u64 %rd<10>;
16 |
17 | ld.param.u64 %rd1, [param_A];
18 | ld.param.u64 %rd2, [param_B];
19 | ld.param.u64 %rd3, [param_C];
20 | ld.param.u32 %r1, [param_N];
21 |
22 | mov.u32 %r2, %tid.x;
23 | mov.u32 %r3, %ctaid.x;
24 | mov.u32 %r4, %ntid.x;
25 | mad.lo.s32 %r5, %r3, %r4, %r2;
26 |
27 | setp.ge.s32 %p1, %r5, %r1;
28 | @%p1 bra EXIT;
29 |
30 | cvt.u64.s32 %rd4, %r5;
31 | mul.wide.s32 %rd5, %r5, 4;
32 | add.u64 %rd6, %rd1, %rd5;
33 | add.u64 %rd7, %rd2, %rd5;
34 | add.u64 %rd8, %rd3, %rd5;
35 |
36 | ld.global.f32 %f1, [%rd6];
37 | ld.global.f32 %f2, [%rd7];
38 |
39 | add.f32 %f3, %f1, %f2;
40 |
41 | st.global.f32 [%rd8], %f3;
42 |
43 | EXIT:
44 | ret;
45 | }
46 |
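The repository's day23/main.cu is not reproduced above; a minimal sketch of loading this PTX with the CUDA driver API (illustrative only, error handling omitted) could look like:

    #include <cuda.h>
    #include <cstdio>
    #include <vector>

    int main() {
        cuInit(0);
        CUdevice dev;   cuDeviceGet(&dev, 0);
        CUcontext ctx;  cuCtxCreate(&ctx, 0, dev);

        // Read kernel.ptx into a NUL-terminated buffer.
        FILE *f = fopen("kernel.ptx", "rb");
        fseek(f, 0, SEEK_END); long sz = ftell(f); fseek(f, 0, SEEK_SET);
        std::vector<char> ptx(sz + 1, 0);
        fread(ptx.data(), 1, sz, f); fclose(f);

        CUmodule mod;   cuModuleLoadData(&mod, ptx.data());
        CUfunction fn;  cuModuleGetFunction(&fn, mod, "vectorAdd");

        int N = 1024;
        CUdeviceptr dA, dB, dC;
        cuMemAlloc(&dA, N * sizeof(float));
        cuMemAlloc(&dB, N * sizeof(float));
        cuMemAlloc(&dC, N * sizeof(float));

        // Launch vectorAdd with one thread per element.
        void *args[] = { &dA, &dB, &dC, &N };
        cuLaunchKernel(fn, (N + 255) / 256, 1, 1, 256, 1, 1, 0, nullptr, args, nullptr);
        cuCtxSynchronize();

        cuMemFree(dA); cuMemFree(dB); cuMemFree(dC);
        cuModuleUnload(mod);
        cuCtxDestroy(ctx);
        return 0;
    }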
--------------------------------------------------------------------------------
/day24/GeGLU.cu:
--------------------------------------------------------------------------------
1 | #include "cuda_runtime.h"
2 |
3 | __global__ void GLUKernel(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) {
4 | int row = blockIdx.x * blockDim.x + threadIdx.x;
5 | int col = blockIdx.y * blockDim.y + threadIdx.y;
6 |
7 | if (row < M && col < K) {
8 | float sum1 = b[col];
9 | float sum2 = c[col];
10 |
11 | for (int i = 0; i < N; i++) {
12 | sum1 += x[row * N + i] * W[i * K + col];
13 | sum2 += x[row * N + i] * V[i * K + col];
14 | }
15 |
16 | float gate = 1.0f / (1.0f + expf(-sum1));
17 | out[row * K + col] = gate * sum2;
18 | }
19 | }
20 |
21 | extern "C" void launchGLU(float* x, float* W, float* V, float* b, float* c, float* out, int M, int N, int K) {
22 | dim3 blockSize(16, 16);
23 | dim3 gridSize((M + 15) / 16, (K + 15) / 16);
24 |
25 | GLUKernel<<<gridSize, blockSize>>>(x, W, V, b, c, out, M, N, K);
26 | cudaDeviceSynchronize();
27 | }
28 |
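Despite the GeGLU filename, the kernel above implements the sigmoid-gated GLU: out = sigmoid(x*W + b) * (x*V + c) elementwise, where x is M x N, W and V are N x K, and b, c are length-K biases (a GeGLU variant would replace the sigmoid gate with GELU).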
--------------------------------------------------------------------------------
/day26/gradientdescent.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day26/gradientdescent.out
--------------------------------------------------------------------------------
/day27/kmeans.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day27/kmeans.out
--------------------------------------------------------------------------------
/day28/sample.cu:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <cuda_runtime.h>
3 | #include <stdexcept>
4 |
5 | __global__ void update_x_kernel(
6 | float *x, const float *noise, const float *predicted_noise,
7 | float sqrt_alpha, float sqrt_alpha_hat, float beta, float alpha,
8 | int numel)
9 | {
10 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
11 | if (idx < numel)
12 | {
13 | x[idx] = (1.0f / sqrt_alpha) *(x[idx] - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise[idx]) +sqrt(beta) * noise[idx];
14 | }
15 | }
16 |
17 | torch::Tensor update_x(torch::Tensor x, torch::Tensor noise, torch::Tensor predicted_noise,
18 | torch::Tensor sqrt_alpha, torch::Tensor sqrt_alpha_hat,
19 | torch::Tensor beta, torch::Tensor alpha)
20 | {
21 | int numel = x.numel();
22 | float sqrt_alpha_val = sqrt_alpha.item<float>();
23 | float sqrt_alpha_hat_val = sqrt_alpha_hat.item<float>();
24 | float beta_val = beta.item<float>();
25 | float alpha_val = alpha.item<float>();
26 |
27 | const int threads = 1024;
28 | const int blocks = (numel + threads - 1) / threads;
29 |
30 | update_x_kernel<<<blocks, threads>>>(
31 | x.data_ptr<float>(), noise.data_ptr<float>(), predicted_noise.data_ptr<float>(),
32 | sqrt_alpha_val, sqrt_alpha_hat_val, beta_val, alpha_val, numel);
33 |
34 | cudaError_t err = cudaGetLastError();
35 | if (err != cudaSuccess) {
36 | printf("CUDA error: %s\n", cudaGetErrorString(err));
37 | throw std::runtime_error(cudaGetErrorString(err));
38 | }
39 |
40 | cudaDeviceSynchronize();
41 |
42 | return x;
43 | }
44 |
45 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
46 | {
47 | m.def("update_x", &update_x, "CUDA kernel for updating x");
48 | }
49 |
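The kernel implements the DDPM sampling update x_{t-1} = (1/sqrt(alpha)) * (x_t - ((1 - alpha)/sqrt(1 - alpha_hat)) * predicted_noise) + sqrt(beta) * z; note that the sqrt_alpha_hat argument is expected to already contain sqrt(1 - alpha_hat), which is exactly how test_sample.py passes it.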
--------------------------------------------------------------------------------
/day28/test_sample.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.cpp_extension import load
3 | import time
4 |
5 | lib = load(
6 | name="update_x",
7 | sources=["sample.cu"],
8 | extra_cuda_cflags=[ "-O3",
9 | "--use_fast_math",
10 | ],
11 | extra_cflags=["-std=c++17"],
12 | )
13 |
14 |
15 | print("Loaded ")
16 |
17 | size = 10**6
18 | device = "cuda"
19 |
20 | x = torch.randn(size, device=device)
21 | noise = torch.randn(size, device=device)
22 | predicted_noise = torch.randn(size, device=device)
23 | alpha = torch.tensor(0.9, device=device)
24 | beta = torch.tensor(0.1, device=device)
25 | alpha_hat = torch.tensor(0.81, device=device)
26 |
27 | sqrt_alpha = torch.sqrt(alpha)
28 | sqrt_alpha_hat = torch.sqrt(1 - alpha_hat)
29 |
30 | torch.cuda.synchronize()
31 | start = time.time()
32 | x_cuda = lib.update_x(x.clone(), noise, predicted_noise, sqrt_alpha, sqrt_alpha_hat, beta, alpha)
33 | torch.cuda.synchronize()
34 | time_cuda = time.time() - start
35 |
36 | torch.cuda.synchronize()
37 | start = time.time()
38 | x_torch = 1 / sqrt_alpha * (x - ((1 - alpha) / sqrt_alpha_hat) * predicted_noise) + torch.sqrt(beta) * noise
39 | torch.cuda.synchronize()
40 | time_torch = time.time() - start
41 |
42 | print(f"CUDA Kernel Time: {time_cuda:.6f}s")
43 | print(f"PyTorch Time: {time_torch:.6f}s")
44 | print(f"Speedup: {time_torch / time_cuda:.2f}x")
45 |
--------------------------------------------------------------------------------
/day29/pi.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <cuda_runtime.h>
4 |
5 | __device__ float randomFloat(unsigned int *seed) {
6 | *seed = (*seed * 1664525u + 1013904223u);
7 | return (float)(*seed & 0x00FFFFFF) / (float)0x01000000;
8 | }
9 |
10 | __global__ void monteCarloPi(int iterations, unsigned long long *d_count) {
11 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
12 | unsigned int seed = tid;
13 | unsigned int local_count = 0;
14 |
15 | for (int i = 0; i < iterations; i++) {
16 | float x = randomFloat(&seed);
17 | float y = randomFloat(&seed);
18 | if (x * x + y * y <= 1.0f)
19 | local_count++;
20 | }
21 |
22 | atomicAdd(d_count, (unsigned long long)local_count);
23 | }
24 |
25 | int main() {
26 | int iterations = 10000;
27 | int threadsPerBlock = 256;
28 | int blocks = 256;
29 |
30 | unsigned long long totalPoints = (unsigned long long)iterations * threadsPerBlock * blocks;
31 |
32 | unsigned long long host_count = 0;
33 | unsigned long long *d_count;
34 | cudaMalloc((void**)&d_count, sizeof(unsigned long long));
35 | cudaMemset(d_count, 0, sizeof(unsigned long long));
36 |
37 | monteCarloPi<<<blocks, threadsPerBlock>>>(iterations, d_count);
38 | cudaDeviceSynchronize();
39 |
40 | cudaMemcpy(&host_count, d_count, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
41 |
42 | float pi = 4.0f * (float)host_count / (float)totalPoints;
43 | printf("Estimated Pi = %f\n", pi);
44 |
45 | cudaFree(d_count);
46 | return 0;
47 | }
48 |
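The estimator used above is pi ≈ 4 * (samples with x^2 + y^2 <= 1) / (total samples); with 10000 iterations per thread, 256 threads per block and 256 blocks, roughly 6.55e8 samples are drawn.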
--------------------------------------------------------------------------------
/day30/kernelHisto.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 |
4 | #define BLOCK_SIZE 16 // 16x16 thread block
5 | #define HIST_SIZE 256 // Grayscale histogram bins
6 |
7 | __global__ void histogram_equalization(unsigned char *d_img, unsigned char *d_out, int width, int height) {
8 | __shared__ unsigned int hist_shared[HIST_SIZE]; // Shared memory for histogram
9 | __shared__ float cdf_shared[HIST_SIZE]; // Shared memory for CDF
10 |
11 | int tx = threadIdx.x, ty = threadIdx.y;
12 | int x = blockIdx.x * blockDim.x + tx;
13 | int y = blockIdx.y * blockDim.y + ty;
14 |
15 | int index = y * width + x;
16 |
17 | // Initialize the shared histogram (16x16 = 256 threads cover all 256 bins)
18 | int t = ty * blockDim.x + tx;
19 | if (t < HIST_SIZE) {
20 | hist_shared[t] = 0;
21 | }
22 |
23 | // First pass: compute local histogram using atomic operations
24 | if (x < width && y < height) {
25 | atomicAdd(&hist_shared[d_img[index]], 1);
26 | }
27 | __syncthreads();
28 |
29 | // Copy this block's histogram into a second shared buffer used for the CDF
30 | __shared__ unsigned int hist_global[HIST_SIZE];
31 | if (tx == 0 && ty == 0) {
32 | for (int i = 0; i < HIST_SIZE; i++) {
33 | hist_global[i] = hist_shared[i];
34 | }
35 | }
36 | __syncthreads();
37 |
38 | // Compute CDF (Cumulative Distribution Function)
39 | if (tx == 0 && ty == 0) {
40 | float sum = 0;
41 | for (int i = 0; i < HIST_SIZE; i++) {
42 | sum += hist_global[i];
43 | cdf_shared[i] = sum;
44 | }
45 |
46 | // Normalize the CDF
47 | float min_cdf = cdf_shared[0];
48 | for (int i = 0; i < HIST_SIZE; i++) {
49 | cdf_shared[i] = ((cdf_shared[i] - min_cdf) / (width * height - min_cdf)) * 255.0f;
50 | }
51 | }
52 | __syncthreads();
53 |
54 | // Apply equalization
55 | if (x < width && y < height) {
56 | d_out[index] = (unsigned char)cdf_shared[d_img[index]];
57 | }
58 | }
--------------------------------------------------------------------------------
/day32/Makefile:
--------------------------------------------------------------------------------
1 | PROJECT_DIR := $(CURDIR)
2 |
3 | COLOR_RESET := \033[0m
4 | COLOR_GREEN := \033[32m
5 | COLOR_YELLOW := \033[33m
6 | COLOR_BLUE := \033[34m
7 | COLOR_RED := \033[31m
8 |
9 | HIP_GPU_TARGET := gfx90a
10 |
11 | all: build
12 |
13 | build: $(PROJECT_DIR)/$(dir)/$(program).out
14 |
15 | $(PROJECT_DIR)/$(dir)/$(program).out: $(PROJECT_DIR)/$(dir)/$(program).cpp
16 | @echo "$(COLOR_YELLOW)Building HIP program $(program) in directory $(dir)...$(COLOR_RESET)"
17 | @hipcc --offload-arch=$(HIP_GPU_TARGET) -O3 -o $@ $< -lrocblas
18 | @echo "$(COLOR_GREEN)Build completed for $(program).out in $(dir)$(COLOR_RESET)"
19 |
20 | run: $(PROJECT_DIR)/$(dir)/$(program).out
21 | @echo "$(COLOR_BLUE)Running $(program).out in directory $(dir)...$(COLOR_RESET)"
22 | @./$(dir)/$(program).out
23 |
24 | # Target: Separate rocprof command for kernel profiling
25 | rocprof: $(PROJECT_DIR)/$(dir)/$(program).out
26 | @echo "$(COLOR_BLUE)Running rocprof for kernel trace on $(program).out in directory $(dir)...$(COLOR_RESET)"
27 | @mkdir -p $(PROJECT_DIR)/$(dir)/output
28 | @echo "$(COLOR_GREEN)Kernel profiling completed for $(program).out in $(dir)$(COLOR_RESET)"
29 |
30 | # Target: Generate ISA assembly files and kernel resource usage analysis.
31 | isa:
32 | @echo "$(COLOR_BLUE)Generating ISA and kernel resource usage for $(program) in directory $(dir)...$(COLOR_RESET)"
33 | @mkdir -p $(PROJECT_DIR)/$(dir)/isa_output
34 | @hipcc -c --save-temps=obj -O3 -Rpass-analysis=kernel-resource-usage --offload-arch=$(HIP_GPU_TARGET) -o $(PROJECT_DIR)/$(dir)/isa_output/$(program).o $(PROJECT_DIR)/$(dir)/$(program).cpp
35 | @echo "$(COLOR_GREEN)ISA and resource analysis files saved in $(dir)/isa_output$(COLOR_RESET)"
36 |
37 | clean:
38 | @echo "$(COLOR_RED)Cleaning up .out and ISA files in directory $(dir)...$(COLOR_RESET)"
39 | @rm -f $(PROJECT_DIR)/$(dir)/*.out
40 | @rm -rf $(PROJECT_DIR)/$(dir)/isa_output
41 | @rm -rf $(PROJECT_DIR)/$(dir)/output
42 | @echo "$(COLOR_GREEN)Clean completed for directory $(dir)$(COLOR_RESET)"
43 |
44 | cleanall:
45 | @echo "$(COLOR_RED)Cleaning up all .out and ISA files in all directories...$(COLOR_RESET)"
46 | @find $(PROJECT_DIR) -type f -name "*.out" -exec rm -f {} \;
47 | @find $(PROJECT_DIR) -type d -name "isa_output" -exec rm -rf {} \;
48 | @find $(PROJECT_DIR) -type d -name "output" -exec rm -rf {} \;
49 | @echo "$(COLOR_GREEN)Cleanall completed for all directories$(COLOR_RESET)"
50 |
51 | help:
52 | @echo "$(COLOR_BLUE)Usage instructions for HIP Makefile:$(COLOR_RESET)"
53 | @echo ""
54 | @echo "$(COLOR_YELLOW)make dir= program=$(COLOR_RESET) # Build the HIP program .cpp in directory "
55 | @echo "$(COLOR_YELLOW)make run dir= program=$(COLOR_RESET) # Run the compiled .out in directory "
56 | @echo "$(COLOR_YELLOW)make clean dir=$(COLOR_RESET) # Clean all .out files in directory "
57 | @echo "$(COLOR_YELLOW)make cleanall$(COLOR_RESET) # Clean all .out files in all directories"
58 | @echo "$(COLOR_YELLOW)make isa dir= program=$(COLOR_RESET) # Generate ISA assembly files and kernel resource usage analysis"
59 | @echo ""
60 | @echo "$(COLOR_BLUE)Examples:$(COLOR_RESET)"
61 | @echo "$(COLOR_GREEN)make dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)"
62 | @echo "$(COLOR_GREEN)make run dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)"
63 | @echo "$(COLOR_GREEN)make isa dir=matmul_kernels program=kernel_rocblas$(COLOR_RESET)"
64 |
--------------------------------------------------------------------------------
/day32/matmul_kernels/kernel_1/kernel_1.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 | #include <cstdlib>
4 |
5 | #define HIP_CHECK(status) \
6 | { \
7 | hipError_t err = status; \
8 | if (err != hipSuccess) { \
9 | std::cerr << "HIP error: " << hipGetErrorString(err) \
10 | << " at line " << __LINE__ << std::endl; \
11 | exit(err); \
12 | } \
13 | }
14 |
15 | __global__ void kernel(float *A, float *B, float *C, int N, int M, int K, float alpha, float beta) {
16 | int row = blockDim.y * blockIdx.y + threadIdx.y;
17 | int col = blockDim.x * blockIdx.x + threadIdx.x;
18 |
19 | if (row < M && col < N) {
20 | float sum = 0.0f;
21 | for (int k = 0; k < K; ++k) {
22 | sum += A[row * K + k] * B[k * N + col];
23 | }
24 | C[row * N + col] = alpha * sum + beta * C[row * N + col];
25 | }
26 | }
27 |
28 | // int main() {
29 | // float *A, *B, *C;
30 | // float *d_A, *d_B, *d_C;
31 |
32 | // float alpha, beta;
33 |
34 | // // For simplicity, we use a square matrix.
35 | // int SIZE = 100;
36 | // size_t mem_size = SIZE * SIZE * sizeof(float);
37 |
38 | // alpha = 1.0f;
39 | // beta = 0.0f;
40 |
41 | // A = (float*)malloc(mem_size);
42 | // B = (float*)malloc(mem_size);
43 | // C = (float*)malloc(mem_size);
44 |
45 | // for (int i = 0; i < SIZE * SIZE; ++i) {
46 | // A[i] = i%3;
47 | // B[i] = i%3;
48 | // C[i] = 0.0f;
49 | // }
50 |
51 | // HIP_CHECK(hipMalloc(&d_A, mem_size));
52 | // HIP_CHECK(hipMalloc(&d_B, mem_size));
53 | // HIP_CHECK(hipMalloc(&d_C, mem_size));
54 |
55 | // HIP_CHECK(hipMemcpy(d_A, A, mem_size, hipMemcpyHostToDevice));
56 | // HIP_CHECK(hipMemcpy(d_B, B, mem_size, hipMemcpyHostToDevice));
57 | // HIP_CHECK(hipMemcpy(d_C, C, mem_size, hipMemcpyHostToDevice));
58 |
59 | // dim3 threadsPerBlock(16, 16);
60 | // dim3 blocksPerGrid((SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
61 | // (SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);
62 |
63 | // hipLaunchKernelGGL(kernel, blocksPerGrid, threadsPerBlock, 0, 0,
64 | // d_A, d_B, d_C, SIZE, SIZE, SIZE, alpha, beta);
65 |
66 | // HIP_CHECK(hipDeviceSynchronize());
67 | // HIP_CHECK(hipMemcpy(C, d_C, mem_size, hipMemcpyDeviceToHost));
68 |
69 | // std::cout << "Result matrix C (first 10 elements):" << std::endl;
70 | // for (int i = 0; i < 10; ++i) {
71 | // std::cout << C[i] << " ";
72 | // }
73 | // std::cout << std::endl;
74 |
75 | // HIP_CHECK(hipFree(d_A));
76 | // HIP_CHECK(hipFree(d_B));
77 | // HIP_CHECK(hipFree(d_C));
78 | // free(A);
79 | // free(B);
80 | // free(C);
81 |
82 | // return 0;
83 | // }
84 |
--------------------------------------------------------------------------------
/day32/matmul_kernels/kernel_2/kernel_2.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 |
4 | // Macro to check HIP errors.
5 | #define CHECK_HIP_ERROR(error) \
6 | { \
7 | if ((error) != hipSuccess) \
8 | { \
9 | std::cerr << "HIP error: " << hipGetErrorString(error) \
10 | << " at line " << __LINE__ << std::endl; \
11 | exit(EXIT_FAILURE); \
12 | } \
13 | }
14 |
15 | #define TILESIZE 32
16 |
17 | __global__ void kernel(const float *A, const float *B, float *C, int N)
18 | {
19 | __shared__ float As[TILESIZE][TILESIZE];
20 | __shared__ float Bs[TILESIZE][TILESIZE];
21 |
22 | int col = blockIdx.x * blockDim.x + threadIdx.x;
23 | int row = blockIdx.y * blockDim.y + threadIdx.y;
24 |
25 | float sum = 0.0f;
26 |
27 | for (int t = 0; t < N; t += TILESIZE)
28 | {
29 | Bs[threadIdx.y][threadIdx.x] = B[(t + threadIdx.y) * N + col];
30 | As[threadIdx.y][threadIdx.x] = A[row * N + t + threadIdx.x];
31 |
32 | __syncthreads();
33 |
34 | for (int k = 0; k < TILESIZE; k++)
35 | {
36 | sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
37 | }
38 |
39 | __syncthreads();
40 | }
41 | if (row < N && col < N)
42 | {
43 | C[row * N + col] = sum;
44 | }
45 | }
46 |
47 | int main(){
48 | float *A, *B, *C;
49 | float *d_A, *d_B, *d_C;
50 |
51 | int N = 1024; // Size of the matrix
52 |
53 | size_t size = N*N* sizeof(float);
54 |
55 | // Allocate host memory
56 | A = (float *)malloc(size);
57 | B = (float *)malloc(size);
58 | C = (float *)malloc(size);
59 |
60 | for(int i = 0 ; i < N * N ; i++){
61 | A[i] = i;
62 | B[i] = i;
63 | }
64 |
65 | CHECK_HIP_ERROR(hipMalloc((void**)&d_A,size));
66 | CHECK_HIP_ERROR(hipMalloc((void**)&d_B,size));
67 | CHECK_HIP_ERROR(hipMalloc((void**)&d_C,size));
68 |
69 | CHECK_HIP_ERROR(hipMemcpy(d_A, A, size, hipMemcpyHostToDevice));
70 | CHECK_HIP_ERROR(hipMemcpy(d_B, B, size, hipMemcpyHostToDevice));
71 |
72 | dim3 Threads(TILESIZE, TILESIZE);
73 | dim3 Blocks((N+Threads.x-1)/Threads.x, (N+Threads.y-1)/Threads.y);
74 | hipLaunchKernelGGL(kernel, Blocks, Threads, 0, 0, d_A, d_B, d_C, N);
75 |
76 | CHECK_HIP_ERROR(hipMemcpy(C, d_C, size, hipMemcpyDeviceToHost));
77 |
78 | // Check the result
79 | for(int i = 0 ; i < 10 ; i++){
80 | for(int j = 0 ; j < 10 ; j++){
81 | std::cout << C[i*N+j] << " ";
82 | }
83 | std::cout << std::endl;
84 | }
85 |
86 | CHECK_HIP_ERROR(hipFree(d_A));
87 | CHECK_HIP_ERROR(hipFree(d_B));
88 | CHECK_HIP_ERROR(hipFree(d_C));
89 |
90 | free(A);
91 | free(B);
92 | free(C);
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/day33/load_in_pytorch/kernel.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstddef>
3 |
4 | // Your HIP kernel remains the same.
5 | extern "C" __global__ void kernel_addition(const float *A, const float *B, float *C, size_t N) {
6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
7 | if (idx < N) {
8 | C[idx] = A[idx] + B[idx];
9 | }
10 | }
11 |
12 | // Host wrapper function that launches the kernel.
13 | // This function will be callable from Python.
14 | extern "C" void launch_kernel_addition(const float *A, const float *B, float *C, size_t N,
15 | int grid_x, int grid_y, int grid_z,
16 | int block_x, int block_y, int block_z) {
17 | // Create dim3 objects for grid and block dimensions.
18 | dim3 grid(grid_x, grid_y, grid_z);
19 | dim3 block(block_x, block_y, block_z);
20 |
21 | // Launch the kernel with the provided configuration.
22 | hipLaunchKernelGGL(kernel_addition, grid, block, 0, 0, A, B, C, N);
23 |
24 | // Wait for the kernel to finish.
25 | hipDeviceSynchronize();
26 | }
27 |
--------------------------------------------------------------------------------
/day33/load_in_pytorch/kernel.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day33/load_in_pytorch/kernel.so
--------------------------------------------------------------------------------
/day33/load_in_pytorch/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import ctypes
3 | import time
4 |
5 | # Load the shared object
6 | lib = ctypes.CDLL('./kernel.so')
7 |
8 | # Specify the argument types for the host wrapper function.
9 | lib.launch_kernel_addition.argtypes = [
10 | ctypes.c_void_p, # pointer to A
11 | ctypes.c_void_p, # pointer to B
12 | ctypes.c_void_p, # pointer to C
13 | ctypes.c_size_t, # N
14 | ctypes.c_int, # grid_x
15 | ctypes.c_int, # grid_y
16 | ctypes.c_int, # grid_z
17 | ctypes.c_int, # block_x
18 | ctypes.c_int, # block_y
19 | ctypes.c_int # block_z
20 | ]
21 | lib.launch_kernel_addition.restype = None
22 |
23 | N = 1000
24 |
25 | # Create input tensors on the ROCm device.
26 | A = torch.randn(N, device='cuda', dtype=torch.float32)
27 | B = torch.randn(N, device='cuda', dtype=torch.float32)
28 | C = torch.empty(N, device='cuda', dtype=torch.float32)
29 |
30 | # Get pointers to the tensor data.
31 | a_ptr = A.data_ptr()
32 | b_ptr = B.data_ptr()
33 | c_ptr = C.data_ptr()
34 |
35 | # Define block and grid sizes.
36 | block_size = 256
37 | grid_size = (N + block_size - 1) // block_size
38 |
39 | def measure_amd_kernel_time():
40 | start_amd = time.time()
41 | lib.launch_kernel_addition(
42 | ctypes.c_void_p(a_ptr),
43 | ctypes.c_void_p(b_ptr),
44 | ctypes.c_void_p(c_ptr),
45 | ctypes.c_size_t(N),
46 | ctypes.c_int(grid_size), # grid_x
47 | ctypes.c_int(1), # grid_y
48 | ctypes.c_int(1), # grid_z
49 | ctypes.c_int(block_size), # block_x
50 | ctypes.c_int(1), # block_y
51 | ctypes.c_int(1) # block_z
52 | )
53 | torch.cuda.synchronize() # Ensure the kernel has finished executing
54 | end_amd = time.time()
55 | return end_amd - start_amd
56 |
57 | def measure_pytorch_time():
58 | start_pytorch = time.time()
59 | c_pytorch = A + B
60 | torch.cuda.synchronize()  # the addition runs asynchronously; wait so the timing is comparable to the kernel path
61 | return time.time() - start_pytorch
62 |
63 | # Run the measurements 5 times and get the lowest time
64 | amd_times = [measure_amd_kernel_time() for _ in range(5)]
65 | pytorch_times = [measure_pytorch_time() for _ in range(5)]
66 |
67 | min_amd_time = min(amd_times)
68 | min_pytorch_time = min(pytorch_times)
69 |
70 | # Verify the result.
71 | if torch.allclose(C, A + B):
72 | print("Success!")
73 | else:
74 | print("Error in computation.")
75 |
76 | print(f"Lowest AMD kernel execution time: {min_amd_time} seconds")
77 | print(f"Lowest Pytorch computation time: {min_pytorch_time} seconds")
78 |
--------------------------------------------------------------------------------
/day34/tensor_lib/test1.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/torch.h>
2 | #include <hip/hip_runtime.h>
3 | #include <iostream>
4 |
5 | __global__ void kernel_noise_image(float *X, const float *e, const float *alpha_hat, int N)
6 | {
7 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
8 | __shared__ float sqrt_alphas[2];
9 |
10 | if (threadIdx.x == 0) {
11 | sqrt_alphas[0] = sqrtf(*alpha_hat);
12 | sqrt_alphas[1] = sqrtf(1.0f - *alpha_hat);
13 | }
14 |
15 | __syncthreads();
16 |
17 | if (idx < N) {
18 | // e holds pre-generated standard-normal noise (filled in noiseImage below);
19 | // drawing it in-kernel would need a per-thread hiprand state, which this code never sets up.
20 | X[idx] = sqrt_alphas[0] * X[idx] + sqrt_alphas[1] * e[idx];
21 | }
22 | }
23 |
24 | torch::Tensor noiseImage(torch::Tensor X, int t, torch::Tensor alpha_hat)
25 | {
26 | torch::Tensor alpha_at_t = alpha_hat.index({t});
27 |
28 | float *d_X, *d_e, *d_alpha_hat;
29 | int N = X.numel();
30 |
31 | hipMalloc(&d_X, N * sizeof(float));
32 | hipMalloc(&d_e, N * sizeof(float));
33 | hipMalloc(&d_alpha_hat, sizeof(float));
34 |
35 | hipMemcpy(d_X, X.data_ptr(), N * sizeof(float), hipMemcpyHostToDevice);
36 | hipMemcpy(d_alpha_hat, alpha_at_t.data_ptr(), sizeof(float), hipMemcpyHostToDevice);
37 | torch::Tensor e = torch::randn_like(X); hipMemcpy(d_e, e.data_ptr(), N * sizeof(float), hipMemcpyHostToDevice); // noise generated on the host
38 | int blockSize = 256;
39 | int numBlocks = (N + blockSize - 1) / blockSize;
40 |
41 | kernel_noise_image<<<numBlocks, blockSize>>>(d_X, d_e, d_alpha_hat, N);
42 |
43 | hipDeviceSynchronize();
44 |
45 | hipMemcpy(X.data_ptr(), d_X, N * sizeof(float), hipMemcpyDeviceToHost);
46 |
47 | hipFree(d_X);
48 | hipFree(d_e);
49 | hipFree(d_alpha_hat);
50 |
51 | return X;
52 | }
53 |
54 | int main()
55 | {
56 | torch::Tensor X = torch::rand({1, 3, 64, 64}, torch::kFloat32);
57 | torch::Tensor alpha_hat = torch::rand({1000}, torch::kFloat32);
58 |
59 | int t = 500;
60 |
61 | X = noiseImage(X, t, alpha_hat);
62 |
63 | std::cout << "Noisy image tensor shape: " << X.sizes() << std::endl;
64 |
65 | return 0;
66 | }
67 |
--------------------------------------------------------------------------------
/day34/tensor_lib/test1.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day34/tensor_lib/test1.out
--------------------------------------------------------------------------------
/day36/random.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 | #include <cstdio>
4 | #include <cstdlib>
5 |
6 | #define BLOCK_SIZE 256
7 |
8 | __global__ void reductionKernelOptimized(const float *g_in, float *g_out, int n) {
9 | extern __shared__ float sdata[];
10 |
11 | unsigned int tid = threadIdx.x;
12 | unsigned int idx = blockIdx.x * (BLOCK_SIZE * 2) + tid;
13 |
14 | float mySum = 0.0f;
15 | if (idx < n)
16 | mySum = g_in[idx];
17 | if (idx + BLOCK_SIZE < n)
18 | mySum += g_in[idx + BLOCK_SIZE];
19 |
20 | sdata[tid] = mySum;
21 | __syncthreads();
22 |
23 | for (unsigned int s = BLOCK_SIZE / 2; s > 32; s >>= 1) {
24 | if (tid < s)
25 | sdata[tid] += sdata[tid + s];
26 | __syncthreads();
27 | }
28 |
29 | if (tid < 32) {
30 | volatile float *vsmem = sdata;
31 | vsmem[tid] += vsmem[tid + 32];
32 | vsmem[tid] += vsmem[tid + 16];
33 | vsmem[tid] += vsmem[tid + 8];
34 | vsmem[tid] += vsmem[tid + 4];
35 | vsmem[tid] += vsmem[tid + 2];
36 | vsmem[tid] += vsmem[tid + 1];
37 | }
38 |
39 | if (tid == 0)
40 | g_out[blockIdx.x] = sdata[0];
41 | }
42 |
43 | int main() {
44 | int n = 1 << 20;
45 | size_t size = n * sizeof(float);
46 |
47 | float *h_array = (float*)malloc(size);
48 | for (int i = 0; i < n; i++) {
49 | h_array[i] = 1.0f;
50 | }
51 |
52 | float *d_in, *d_out;
53 | hipMalloc(&d_in, size);
54 | int numBlocks = (n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2);
55 | hipMalloc(&d_out, numBlocks * sizeof(float));
56 |
57 | hipMemcpy(d_in, h_array, size, hipMemcpyHostToDevice);
58 |
59 | size_t sharedMemSize = BLOCK_SIZE * sizeof(float);
60 | hipLaunchKernelGGL(reductionKernelOptimized, dim3(numBlocks), dim3(BLOCK_SIZE), sharedMemSize, 0, d_in, d_out, n);
61 | hipDeviceSynchronize();
62 |
63 | float *h_partialSums = (float*)malloc(numBlocks * sizeof(float));
64 | hipMemcpy(h_partialSums, d_out, numBlocks * sizeof(float), hipMemcpyDeviceToHost);
65 |
66 | float sum = 0.0f;
67 | for (int i = 0; i < numBlocks; i++) {
68 | sum += h_partialSums[i];
69 | }
70 | printf("Reduction result: %f (expected %f)\n", sum, (float)n);
71 |
72 | free(h_array);
73 | free(h_partialSums);
74 | hipFree(d_in);
75 | hipFree(d_out);
76 |
77 | return 0;
78 | }
79 |
--------------------------------------------------------------------------------
/day37/MultiStreams/MHA.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 | #include <cstdlib>
4 |
5 | #define HEADS 8
6 | #define SEQ_LEN 128
7 | #define DIM 768 // head dimension
8 |
9 | __global__ void addition(const float* query, const float* key, const float* value,
10 | float* output, int seq_len, int dim, int head_id)
11 | {
12 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 | int total = seq_len * dim; // work items per head
14 | if (idx < total) {
15 | int seq = idx / dim; // which seq | on row
16 | int d = idx % dim; // pos in seq | on col
17 |
18 | int offset = head_id * (seq_len * dim) + seq * dim;
19 | output[offset + d] = query[offset + d] + key[offset + d] + value[offset + d];
20 | }
21 | }
22 |
23 | int main(){
24 | size_t total_elements = HEADS * SEQ_LEN * DIM;
25 | size_t size = total_elements * sizeof(float);
26 |
27 | // Create one HIP stream per head.
28 | hipStream_t streams[HEADS];
29 | for (int i = 0; i < HEADS; i++){
30 | hipStreamCreate(&streams[i]);
31 | }
32 |
33 | float *key = (float*)malloc(size);
34 | float *value = (float*)malloc(size);
35 | float *query = (float*)malloc(size);
36 | float *output= (float*)malloc(size);
37 |
38 | for (size_t i = 0; i < total_elements; i++){
39 | key[i] = 3.0f;
40 | value[i] = 5.0f;
41 | query[i] = 6.0f;
42 | }
43 |
44 | float *d_key, *d_value, *d_query, *d_output;
45 | hipMalloc(&d_key, size);
46 | hipMalloc(&d_value, size);
47 | hipMalloc(&d_query, size);
48 | hipMalloc(&d_output, size);
49 |
50 | size_t headSize = SEQ_LEN * DIM * sizeof(float);
51 |
52 | // [HEADS][SEQ_LEN][DIM]
53 | for (int head = 0; head < HEADS; head++){
54 | int offset = head * SEQ_LEN * DIM;
55 | hipMemcpyAsync(d_key + offset, key + offset, headSize, hipMemcpyHostToDevice, streams[head]);
56 | hipMemcpyAsync(d_value + offset, value + offset, headSize, hipMemcpyHostToDevice, streams[head]);
57 | hipMemcpyAsync(d_query + offset, query + offset, headSize, hipMemcpyHostToDevice, streams[head]);
58 | }
59 |
60 | int threadsPerBlock = 256; // threads per block (1D launch)
61 | int totalWork = SEQ_LEN * DIM; // elements in a head
62 | int blocks = (totalWork + threadsPerBlock - 1) / threadsPerBlock;
63 |
64 | for (int head = 0; head < HEADS; head++){
65 | hipLaunchKernelGGL(addition, dim3(blocks), dim3(threadsPerBlock), 0, streams[head],
66 | d_query, d_key, d_value, d_output, SEQ_LEN, DIM, head);
67 | }
68 |
69 | for (int head = 0; head < HEADS; head++){
70 | int offset = head * SEQ_LEN * DIM;
71 | hipMemcpyAsync(output + offset, d_output + offset, headSize,
72 | hipMemcpyDeviceToHost, streams[head]);
73 | }
74 |
75 | hipDeviceSynchronize();
76 |
77 |
78 | for (int i = 0; i < HEADS; i++){
79 | hipStreamDestroy(streams[i]);
80 | }
81 |
82 | hipFree(d_key);
83 | hipFree(d_value);
84 | hipFree(d_query);
85 | hipFree(d_output);
86 | free(key);
87 | free(value);
88 | free(query);
89 | free(output);
90 | return 0;
91 | }
92 |
--------------------------------------------------------------------------------
/day37/MultiStreams/MHA.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/MHA.out
--------------------------------------------------------------------------------
/day37/MultiStreams/results.copy_stats.csv:
--------------------------------------------------------------------------------
1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage"
2 | "CopyDeviceToHost",8,894880,111860,52.82395164336985
3 | "CopyHostToDevice",24,799200,33300,47.17604835663015
4 |
--------------------------------------------------------------------------------
/day37/MultiStreams/results.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day37/MultiStreams/results.db
--------------------------------------------------------------------------------
/day37/MultiStreams/results.hip_stats.csv:
--------------------------------------------------------------------------------
1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage"
2 | "hipStreamCreate",8,287069631,35883703,96.91385734531264
3 | "hipMemcpyAsync",32,6984316,218259,2.357884399407559
4 | "hipStreamDestroy",8,1219926,152490,0.41184340511392464
5 | "hipLaunchKernel",8,587473,73434,0.19832914515510996
6 | "hipMalloc",4,186051,46512,0.06281026665949475
7 | "hipFree",4,147901,36975,0.049930939630563304
8 | "hipDeviceSynchronize",1,12841,12841,0.004335083574797083
9 | "__hipPushCallConfiguration",8,1710,213,0.0005772909362902432
10 | "__hipPopCallConfiguration",8,1280,160,0.0004321242096207668
11 |
--------------------------------------------------------------------------------
/day37/MultiStreams/results.hsa_stats.csv:
--------------------------------------------------------------------------------
1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage"
2 | "hsa_queue_create",4,47570153,11892538,77.44727724772702
3 | "hsa_amd_memory_pool_allocate",24,6784269,282677,11.045227501499935
4 | "hsa_signal_wait_scacquire",74,1694809,22902,2.7592583632207988
5 | "hsa_amd_memory_async_copy",32,1068866,33402,1.7401827873597333
6 | "hsa_amd_memory_pool_free",20,837405,41870,1.363349350666012
7 | "hsa_agent_get_info",65,764114,11755,1.2440268755677468
8 | "hsa_executable_load_agent_code_object",2,604473,302236,0.9841210310962273
9 | "hsa_amd_agents_allow_access",20,537142,26857,0.8745018204040375
10 | "hsa_signal_create",547,500153,914,0.8142813426999574
11 | "hsa_executable_freeze",2,332962,166481,0.5420836112710775
12 | "hsa_signal_load_relaxed",1496,123321,82,0.20077454191637648
13 | "hsa_signal_destroy",546,102760,188,0.1672999077799146
14 | "hsa_code_object_reader_create_from_memory",2,97010,48505,0.15793853691834872
15 | "hsa_amd_signal_async_handler",32,82190,2568,0.13381062106297373
16 | "hsa_isa_get_info_alt",2,74801,37400,0.12178085248973718
17 | "hsa_executable_iterate_symbols",16,55820,3488,0.09087856025958381
18 | "hsa_executable_create_alt",2,24260,12130,0.039496844713319657
19 | "hsa_iterate_agents",1,24220,24220,0.039431722133413116
20 | "hsa_amd_pointer_info",128,21370,166,0.03479173831507177
21 | "hsa_executable_symbol_get_info",260,15970,61,0.026000190027688167
22 | "hsa_signal_store_screlease",48,15030,313,0.024469809399884357
23 | "hsa_amd_agent_iterate_memory_pools",4,13870,3467,0.022581254582594544
24 | "hsa_amd_profiling_get_async_copy_time",32,13490,421,0.021962590073482367
25 | "hsa_queue_load_read_index_relaxed",48,12480,260,0.0203182449308421
26 | "hsa_amd_profiling_set_profiler_enabled",4,9170,2292,0.014929351443575484
27 | "hsa_amd_profiling_get_dispatch_time",16,7110,444,0.011575538578388408
28 | "hsa_executable_get_symbol_by_name",16,6530,408,0.010631261169743502
29 | "hsa_queue_add_write_index_screlease",48,5490,114,0.008938074092173327
30 | "hsa_signal_silent_store_relaxed",80,5350,66,0.008710145062500419
31 | "hsa_amd_profiling_async_copy_enable",8,4790,598,0.007798428943808787
32 | "hsa_amd_memory_pool_get_info",27,4010,148,0.006528538635631156
33 | "hsa_queue_load_read_index_scacquire",48,3000,62,0.00488419349299089
34 | "hsa_amd_memory_copy_engine_status",2,2180,1090,0.0035491806049067127
35 | "hsa_agent_iterate_isas",1,1860,1860,0.0030281999656543513
36 | "hsa_amd_agent_memory_pool_get_info",9,1470,163,0.0023932548115655357
37 | "hsa_system_get_info",4,370,92,0.000602383864135543
38 | "hsa_system_get_major_extension_table",1,360,360,0.0005861032191589067
39 |
--------------------------------------------------------------------------------
/day37/MultiStreams/results.stats.csv:
--------------------------------------------------------------------------------
1 | "Name","Calls","TotalDurationNs","AverageNs","Percentage"
2 | "addition(float const*, float const*, float const*, float*, int, int, int)",8,42080,5260,100.0
3 |
--------------------------------------------------------------------------------
/day38/myreduction.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <iostream>
3 | #include <vector>
4 | #include <cfloat>   // for FLT_MAX
5 | #ifndef WARP_SIZE
6 | #define WARP_SIZE 64
7 | #endif
8 |
9 | #define HIPCHECK(error) \
10 | { \
11 | if ((error) != hipSuccess) \
12 | { \
13 | std::cerr << "HIP error: " << hipGetErrorString(error) \
14 | << " at line " << __LINE__ << std::endl; \
15 | exit(EXIT_FAILURE); \
16 | } \
17 | }
18 |
19 |
20 | template <typename scalar_t>
21 | __global__ void reduce_max_1d(const scalar_t *__restrict__ input,
22 | scalar_t *__restrict__ output,
23 | int n)
24 | {
25 | extern __shared__ float sdata[];
26 | const uint32_t tid = threadIdx.x;
27 | const uint32_t i = blockIdx.x * (blockDim.x * 2) + tid;
28 | const uint32_t lane = tid % WARP_SIZE;
29 | const uint32_t warp_id = tid / WARP_SIZE;
30 | float max_val = -FLT_MAX;
31 | if (i < n)
32 | max_val = input[i];
33 | if (i + blockDim.x < n)
34 | max_val = fmaxf(max_val, input[i + blockDim.x]);
35 |
36 | for (uint32_t offset = WARP_SIZE / 2; offset > 0; offset /= 2)
37 | {
38 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
39 | }
40 |
41 |
42 | if (lane == 0)
43 | {
44 | sdata[warp_id] = max_val;
45 | }
46 | __syncthreads();
47 |
48 |
49 | const uint32_t numWarps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
50 | if (warp_id == 0)
51 | {
52 | // the whole first wavefront participates so every __shfl_down source lane holds a defined value
53 | max_val = (lane < numWarps) ? sdata[lane] : -FLT_MAX;
54 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
55 | max_val = fmaxf(max_val, __shfl_down(max_val, offset, WARP_SIZE));
56 |
57 | if (lane == 0)
58 | sdata[0] = max_val;
59 | }
60 | __syncthreads();
61 |
62 | if (tid == 0)
63 | output[blockIdx.x] = sdata[0];
64 | }
65 |
66 |
67 | int main()
68 | {
69 | const int n = 102400;
70 | std::vector<float> h_input(n, 1.0f);
71 | h_input[500] = 133.0f;
72 |
73 | const int threadsPerBlock = 256;
74 | const int blocks = (n + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2);
75 | const size_t sharedMemSize = ((threadsPerBlock + WARP_SIZE - 1) / WARP_SIZE) * sizeof(float);
76 |
77 | float *d_input;
78 | float *d_output;
79 | HIPCHECK(hipMalloc(&d_input, n * sizeof(float)));
80 | HIPCHECK(hipMalloc(&d_output, blocks * sizeof(float)));  // one partial maximum per block
81 |
82 | HIPCHECK(hipMemcpy(d_input, h_input.data(), n * sizeof(float), hipMemcpyHostToDevice));
83 |
84 | hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_max_1d<float>), dim3(blocks), dim3(threadsPerBlock), sharedMemSize, 0, d_input, d_output, n);
85 |
86 | std::vector<float> h_partial(blocks);
87 | HIPCHECK(hipMemcpy(h_partial.data(), d_output, blocks * sizeof(float), hipMemcpyDeviceToHost));
88 | float h_output = -FLT_MAX;
89 | for (int b = 0; b < blocks; ++b) if (h_partial[b] > h_output) h_output = h_partial[b];  // final reduction over per-block maxima
90 | HIPCHECK(hipFree(d_input));
91 | HIPCHECK(hipFree(d_output));
92 |
93 | std::cout << "Maximum value: " << h_output << std::endl;
94 |
95 | return 0;
96 | }
97 |
--------------------------------------------------------------------------------
/day42/mat_mul.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import triton
4 | import triton.language as tl
5 |
6 |
7 | DEVICE = torch.device("cuda:0")
8 |
9 |
10 | @triton.jit
11 | def matmul_kernel(
12 | a_ptr, b_ptr, c_ptr,
13 | M, N, K,
14 | stride_am, stride_ak,
15 | stride_bk, stride_bn,
16 | stride_cm, stride_cn,
17 | BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, #
18 | GROUP_SIZE_M: tl.constexpr, #
19 | ACTIVATION: tl.constexpr #
20 | ):
21 | pid = tl.program_id(axis=0)
22 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
23 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
24 | num_pid_in_group = GROUP_SIZE_M * num_pid_n
25 |
26 | group_id = pid // num_pid_in_group
27 | first_pid_m = group_id * GROUP_SIZE_M
28 |
29 | group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
30 |
31 | pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
32 | pid_n = (pid % num_pid_in_group) // group_size_m
33 |
34 | offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
35 | offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
36 |
37 | offs_k = tl.arange(0, BLOCK_SIZE_K)
38 |
39 | a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
40 | b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
41 |
42 | accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
43 | for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
44 |
45 | a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
46 | b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
47 |
48 | accumulator = tl.dot(a, b, accumulator)
49 |
50 | a_ptrs += BLOCK_SIZE_K * stride_ak
51 | b_ptrs += BLOCK_SIZE_K * stride_bk
52 | if ACTIVATION == "leaky_relu":
53 | accumulator = leaky_relu(accumulator)
54 | c = accumulator.to(tl.float16)
55 |
56 | offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
57 | offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
58 |
59 | c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
60 | c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
61 |
62 | tl.store(c_ptrs, c, mask=c_mask)
63 |
64 |
65 | @triton.jit
66 | def leaky_relu(x):
67 | return tl.where(x >= 0, x, 0.01 * x)
68 |
69 |
70 | def matmul(a, b, activation=""):
71 | assert a.shape[1] == b.shape[0], "Incompatible dimensions"
72 | assert a.is_contiguous(), "Matrix A must be contiguous"
73 | M, K = a.shape
74 | K, N = b.shape
75 |
76 | c = torch.empty((M, N), device=a.device, dtype=torch.float16)
77 |
78 | grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
79 | # the kernel has no autotuner attached, so the block-size constexprs must be passed explicitly
80 | matmul_kernel[grid](
81 | a, b, c,
82 | M, N, K,
83 | a.stride(0), a.stride(1),
84 | b.stride(0), b.stride(1),
85 | c.stride(0), c.stride(1),
86 | BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=32, GROUP_SIZE_M=8, ACTIVATION=activation
87 | )
88 | return c
89 |
90 |
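A minimal usage sketch for the matmul() wrapper above (not part of the original file); it assumes fp16 inputs on the GPU and checks against torch.matmul with a loose tolerance, since the kernel accumulates in fp32 but stores fp16:

# hypothetical usage example for matmul() defined above
if __name__ == "__main__":
    torch.manual_seed(0)
    a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
    b = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
    c = matmul(a, b)
    print("max abs error vs torch:", (c - (a @ b)).abs().max().item())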
--------------------------------------------------------------------------------
/day42/mat_mul_2.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import triton
4 | import triton.language as tl
5 |
6 | @triton.jit
7 | def get_1d_offest(size,n_prev_chunks):
8 | return n_prev_chunks * size + tl.arange(0,size)
9 |
10 | @triton.jit
11 | def get_2d_offest(offs_0,offs_1,stride_0,stride_1):
12 | return tl.expand_dims(offs_0,1)*stride_0 + tl.expand_dims(offs_1,0)*stride_1
13 |
14 | @triton.jit
15 | def get_1d_mask(offs,max):
16 | return offs < max
2 |
3 | namespace cg = cooperative_groups;
4 |
5 | template <typename T>
6 | __device__ T reduce_sum(cg::thread_block_tile<32>& group, T val) {
7 | for (int offset = group.size()/2; offset > 0; offset /= 2) {
8 | T temp = group.shfl_down(val, offset);
9 | val += temp;
10 | }
11 | return val;
12 | }
13 |
14 | template <typename T>
15 | __device__ T reduce_max(cg::thread_block_tile<32>& group, T val) {
16 | for (int offset = group.size()/2; offset > 0; offset /= 2) {
17 | T temp = group.shfl_down(val, offset);
18 | val = max(val, temp);
19 | }
20 | return val;
21 | }
22 |
23 | template <typename T>
24 | __device__ T reduce_sum(cg::thread_block_tile<64>& group, T val) {
25 | for (int offset = group.size()/2; offset > 0; offset /= 2) {
26 | T temp = group.shfl_down(val, offset);
27 | val += temp;
28 | }
29 | return val;
30 | }
31 |
32 | template <typename T>
33 | __device__ T reduce_max(cg::thread_block_tile<64>& group, T val) {
34 | for (int offset = group.size()/2; offset > 0; offset /= 2) {
35 | T temp = group.shfl_down(val, offset);
36 | val = max(val, temp);
37 | }
38 | return val;
39 | }
40 |
41 | __device__ cg::thread_block this_thread_block() {
42 | return cg::this_thread_block();
43 | }
44 |
45 | template <unsigned int Size>
46 | __device__ cg::thread_block_tile<Size> tiled_partition(cg::thread_block& block) {
47 | return cg::tiled_partition<Size>(block);
48 | }
--------------------------------------------------------------------------------
/day48/kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def gelu_kernel(
7 | x_ptr,
8 | output_ptr,
9 | n_elements,
10 | BLOCK_SIZE: tl.constexpr,
11 | ):
12 | pid = tl.program_id(axis=0)
13 | block_start = pid * BLOCK_SIZE
14 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
15 |
16 | mask = offsets < n_elements
17 |
18 | x = tl.load(x_ptr + offsets, mask=mask)
19 |
20 | coeff1 = 0.7978845608028654
21 | coeff2 = 0.044715
22 | x_cubed = x * x * x
23 | inner = coeff1 * (x + coeff2 * x_cubed)
24 | tanh = tl.math.tanh(inner)
25 | output = 0.5 * x * (1.0 + tanh)
26 |
27 | tl.store(output_ptr + offsets, output, mask=mask)
28 |
29 | def fused_gelu(x: torch.Tensor):
30 | output = torch.empty_like(x)
31 |
32 | n_elements = x.numel()
33 |
34 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
35 |
36 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024)
37 |
38 | return output
39 |
40 | if __name__ == "__main__":
41 | torch.manual_seed(0)
42 |
43 | x = torch.randn(1000000, device='cuda')
44 |
45 | triton_output = fused_gelu(x)
46 |
47 | torch_output = torch.nn.functional.gelu(x)
48 |
49 | print(f"Maximum absolute error: {torch.max(torch.abs(triton_output - torch_output)):.2e}")
50 | print(f"Results match: {torch.allclose(triton_output, torch_output, atol=1e-5)}")
--------------------------------------------------------------------------------
/day49/kernel.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import triton.language as tl
3 | import torch
4 | import time
5 |
6 | @triton.jit
7 | def fused_bias_skip_gelu_scale_kernel(
8 | x_ptr,
9 | bias_ptr,
10 | skip_ptr,
11 | gamma_ptr,
12 | y_ptr,
13 | n_elements: tl.constexpr
14 | ):
15 | BLOCK_SIZE = 1024
16 | pid = tl.program_id(0)
17 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
18 | mask = offsets < n_elements
19 | x = tl.load(x_ptr + offsets, mask=mask)
20 | bias = tl.load(bias_ptr + offsets, mask=mask)
21 | skip = tl.load(skip_ptr + offsets, mask=mask)
22 | gamma = tl.load(gamma_ptr + offsets, mask=mask)
23 | temp = x + bias + skip
24 | gelu = 0.5 * temp * (1.0 + tl.tanh(0.7978845608028654 * (temp + 0.044715 * temp * temp * temp)))
25 | out = gelu * gamma
26 | tl.store(y_ptr + offsets, out, mask=mask)
27 |
28 | def test_fused_kernel():
29 | n_elements = 2048
30 | BLOCK_SIZE = 1024
31 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32)
32 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32)
33 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32)
34 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32)
35 | y = torch.empty_like(x)
36 | grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE,)
37 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements)
38 | torch.cuda.synchronize()
39 | temp = x + bias + skip
40 | gelu = 0.5 * temp * (1.0 + torch.tanh(0.7978845608028654 * (temp + 0.044715 * temp ** 3)))
41 | ref = gelu * gamma
42 | if torch.allclose(y, ref, atol=1e-6):
43 | print("Test passed! Kernel output matches reference.")
44 | else:
45 | print("Test failed! Maximum absolute error:", (y - ref).abs().max().item())
46 |
47 | def benchmark_kernel():
48 | n_elements = 2048
49 | BLOCK_SIZE = 1024
50 | x = torch.randn(n_elements, device='cuda', dtype=torch.float32)
51 | bias = torch.randn(n_elements, device='cuda', dtype=torch.float32)
52 | skip = torch.randn(n_elements, device='cuda', dtype=torch.float32)
53 | gamma = torch.randn(n_elements, device='cuda', dtype=torch.float32)
54 | y = torch.empty_like(x)
55 | grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE,)
56 | for _ in range(10):
57 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements)
58 | torch.cuda.synchronize()
59 | n_iter = 100
60 | start = time.time()
61 | for _ in range(n_iter):
62 | fused_bias_skip_gelu_scale_kernel[grid](x, bias, skip, gamma, y, n_elements)
63 | torch.cuda.synchronize()
64 | end = time.time()
65 | avg_time_ms = (end - start) / n_iter * 1000
66 | print(f"Average kernel time over {n_iter} iterations: {avg_time_ms:.3f} ms")
67 |
68 | if __name__ == "__main__":
69 | test_fused_kernel()
70 | benchmark_kernel()
71 |
--------------------------------------------------------------------------------
/day50/tritonnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def gelu_kernel(
7 | x_ptr,
8 | output_ptr,
9 | n_elements,
10 | BLOCK_SIZE: tl.constexpr,
11 | ):
12 | pid = tl.program_id(axis=0)
13 | block_start = pid * BLOCK_SIZE
14 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
15 | mask = offsets < n_elements
16 | x = tl.load(x_ptr + offsets, mask=mask)
17 | sqrt_2_over_pi = 0.7978845608028654  # sqrt(2 / pi); triton.language exposes no pi constant
18 | cdf = 0.5 * (1.0 + tl.math.tanh(sqrt_2_over_pi * (x + 0.044715 * x * x * x)))
19 | output = x * cdf
20 | tl.store(output_ptr + offsets, output, mask=mask)
21 |
22 | @triton.jit
23 | def fused_add_multiply_kernel(
24 | a_ptr, b_ptr, c_ptr, output_ptr,
25 | n_elements,
26 | BLOCK_SIZE: tl.constexpr,
27 | ):
28 | pid = tl.program_id(axis=0)
29 | block_start = pid * BLOCK_SIZE
30 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
31 | mask = offsets < n_elements
32 | a = tl.load(a_ptr + offsets, mask=mask)
33 | b = tl.load(b_ptr + offsets, mask=mask)
34 | c = tl.load(c_ptr + offsets, mask=mask)
35 | output = (a + b) * c
36 | tl.store(output_ptr + offsets, output, mask=mask)
37 |
38 | class GELUTriton(torch.autograd.Function):
39 | @staticmethod
40 | def forward(ctx, x):
41 | x = x.contiguous()
42 | output = torch.empty_like(x)
43 | n_elements = output.numel()
44 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
45 | gelu_kernel[grid](x, output, n_elements, BLOCK_SIZE=1024)
46 | ctx.save_for_backward(x)
47 | return output
48 |
49 | @staticmethod
50 | def backward(ctx, grad_output):
51 | x, = ctx.saved_tensors
52 | # analytic derivative of the tanh-approximated GELU; the Triton forward builds no autograd graph to re-trace
53 | c = 0.7978845608028654
54 | inner = c * (x + 0.044715 * x ** 3)
55 | tanh_inner = torch.tanh(inner)
56 | d_inner = c * (1.0 + 3 * 0.044715 * x ** 2)
57 | dgelu = 0.5 * (1.0 + tanh_inner) + 0.5 * x * (1.0 - tanh_inner * tanh_inner) * d_inner
58 | return grad_output * dgelu
60 |
61 | def fused_add_multiply(a, b, c):
62 | assert a.shape == b.shape == c.shape
63 | a, b, c = a.contiguous(), b.contiguous(), c.contiguous()
64 | output = torch.empty_like(a)
65 | n_elements = output.numel()
66 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
67 | fused_add_multiply_kernel[grid](a, b, c, output, n_elements, BLOCK_SIZE=1024)
68 | return output
69 |
70 | class TritonNN(torch.nn.Module):
71 | def __init__(self, in_features, hidden_size, out_features):
72 | super().__init__()
73 | self.fc1 = torch.nn.Linear(in_features, hidden_size)
74 | self.fc2 = torch.nn.Linear(hidden_size, out_features)
75 |
76 | def forward(self, x):
77 | x = self.fc1(x)
78 | x = GELUTriton.apply(x)
79 |
80 | residual = x
81 | a = x
82 | b = torch.ones_like(x) * 0.5
83 | c = torch.ones_like(x) * 1.5
84 | x = fused_add_multiply(a, b, c)
85 | x += residual
86 |
87 | return self.fc2(x)
88 |
89 | if __name__ == "__main__":
90 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
91 | model = TritonNN(784, 256, 10).to(device)
92 | x = torch.randn(32, 784).to(device)
93 | output = model(x)
94 | print("Output shape:", output.shape)
95 | print("Output values:", output[0, :5])
--------------------------------------------------------------------------------
/day52/functionsused.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import numpy as np
3 |
4 | import triton.language as tl
5 |
6 | @triton.jit
7 | def init_matrix(matrix, seed: tl.constexpr):
8 | idx = tl.arange(0, matrix.shape[0])
9 | matrix[idx] = tl.random(seed + idx)
10 |
11 | @triton.jit
12 | def add_matrices(a, b, result):
13 | idx = tl.arange(0, a.shape[0])
14 | result[idx] = a[idx] + b[idx]
15 |
16 | @triton.jit
17 | def multiply_matrices(a, b, result):
18 | idx = tl.arange(0, a.shape[0])
19 | result[idx] = a[idx] * b[idx]
20 |
21 | @triton.jit
22 | def transpose_matrix(matrix, result):
23 | idx = tl.arange(0, matrix.shape[0])
24 | idy = tl.arange(0, matrix.shape[1])
25 | result[idy, idx] = matrix[idx, idy]
26 |
27 | @triton.jit
28 | def matmul_kernel(a_ptr, b_ptr, c_ptr, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
29 | # one program per output element; pointers are indexed explicitly (Triton has no a[row, k] subscripting)
30 | pid = tl.program_id(0)
31 | row = pid // N
32 | col = pid % N
33 |
34 | acc = 0.0
35 | for k in range(K):
36 | acc += tl.load(a_ptr + row * K + k) * tl.load(b_ptr + k * N + col)
37 |
38 | tl.store(c_ptr + row * N + col, acc)
39 |
40 | if __name__ == "__main__":
41 | import torch  # Triton kernels operate on device tensors, not numpy arrays
42 |
43 | M, N, K = 128, 128, 128
44 | a = torch.rand((M, K), device="cuda", dtype=torch.float32)
45 | b = torch.rand((K, N), device="cuda", dtype=torch.float32)
46 | c = torch.zeros((M, N), device="cuda", dtype=torch.float32)
47 |
48 | grid = (M * N,)
49 | matmul_kernel[grid](a, b, c, M, N, K)
50 |
51 | print(c.cpu())
--------------------------------------------------------------------------------
/day54/softmax.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def softmax_kernel(
7 | output_ptr, input_ptr,
8 | input_row_stride, output_row_stride,
9 | n_cols,
10 | BLOCK_SIZE: tl.constexpr
11 | ):
12 | row_idx = tl.program_id(0)
13 | row_start = row_idx * input_row_stride
14 |
15 | col_offsets = tl.arange(0, BLOCK_SIZE)
16 | input_ptrs = input_ptr + row_start + col_offsets
17 | row_mask = col_offsets < n_cols
18 |
19 | row = tl.load(input_ptrs, mask=row_mask, other=-float('inf'))
20 | row_minus_max = row - tl.max(row, axis=0)
21 | numerator = tl.exp(row_minus_max)
22 | denominator = tl.sum(numerator, axis=0)
23 | softmax_output = numerator / denominator
24 |
25 | output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets
26 | tl.store(output_ptrs, softmax_output, mask=row_mask)
27 |
28 | def triton_softmax(x):
29 | n_rows, n_cols = x.shape
30 | BLOCK_SIZE = triton.next_power_of_2(n_cols)
31 |
32 | y = torch.empty_like(x)
33 | assert x.is_cuda and y.is_cuda
34 |
35 | num_warps = 4
36 | if BLOCK_SIZE >= 2048:
37 | num_warps = 8
38 | if BLOCK_SIZE >= 4096:
39 | num_warps = 16
40 |
41 | softmax_kernel[(n_rows,)](
42 | y, x,
43 | x.stride(0), y.stride(0),
44 | n_cols,
45 | BLOCK_SIZE=BLOCK_SIZE,
46 | num_warps=num_warps
47 | )
48 | return y
49 |
50 | x = torch.randn(10000, 1000, device='cuda')
51 | triton_result = triton_softmax(x)
52 | torch_result = torch.softmax(x, axis=1)
53 |
54 | print(f"Max error: {torch.max(torch.abs(triton_result - torch_result)):.2e}")
--------------------------------------------------------------------------------
/day57/main.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import triton.language as tl
3 | import torch
4 | @triton.jit
5 | def fused_linear_xentropy_forward(
6 | input_ptr, weight_ptr, bias_ptr, target_ptr, loss_ptr,
7 | batch_size, in_features, out_features,
8 | stride_input_batch, stride_input_feature,
9 | stride_weight_out, stride_weight_in,
10 | stride_bias_out,
11 | BLOCK_SIZE_IN: tl.constexpr,
12 | BLOCK_SIZE_OUT: tl.constexpr,
13 | ):
14 | pid = tl.program_id(0)
15 | if pid >= batch_size:
16 | return
17 |
18 | input_row = input_ptr + pid * stride_input_batch
19 | target = tl.load(target_ptr + pid)
20 |
21 | logits = tl.zeros((BLOCK_SIZE_OUT,), dtype=tl.float32)
22 |
23 | for i in range(0, in_features, BLOCK_SIZE_IN):
24 | input_offsets = i + tl.arange(0, BLOCK_SIZE_IN)
25 | input_mask = input_offsets < in_features
26 | current_input = tl.load(input_row + input_offsets, mask=input_mask, other=0.0)
27 |
28 | weight_offsets = (i + tl.arange(0, BLOCK_SIZE_IN))[None, :] * stride_weight_in + \
29 | tl.arange(0, BLOCK_SIZE_OUT)[:, None] * stride_weight_out
30 | weight_mask = (input_mask[None, :]) & (tl.arange(0, BLOCK_SIZE_OUT)[:, None] < out_features)
31 | current_weight = tl.load(weight_ptr + weight_offsets, mask=weight_mask, other=0.0)
32 |
33 | logits += tl.sum(current_input[None, :] * current_weight, axis=1)
34 |
35 | bias_offsets = tl.arange(0, BLOCK_SIZE_OUT) * stride_bias_out
36 | bias_mask = tl.arange(0, BLOCK_SIZE_OUT) < out_features
37 | bias = tl.load(bias_ptr + bias_offsets, mask=bias_mask, other=0.0)
38 | logits += bias
39 |
40 | max_logit = tl.max(logits, axis=0)
41 | exp_logits = tl.exp(logits - max_logit)
42 | sum_exp = tl.sum(exp_logits, axis=0)
43 | log_sum_exp = tl.log(sum_exp)
44 | log_probs = logits - max_logit - log_sum_exp
45 |
46 | target_mask = tl.arange(0, BLOCK_SIZE_OUT) == target
47 | contribution = -tl.sum(log_probs * target_mask, axis=0)
48 | tl.atomic_add(loss_ptr, contribution / batch_size)
49 |
50 | def fused_linear_cross_entropy(
51 | input: torch.Tensor,
52 | weight: torch.Tensor,
53 | bias: torch.Tensor,
54 | target: torch.Tensor
55 | ) -> torch.Tensor:
56 | assert input.is_cuda and weight.is_cuda and bias.is_cuda and target.is_cuda
57 | batch_size, in_features = input.shape
58 | out_features, _ = weight.shape
59 |
60 | loss = torch.zeros(1, device=input.device, dtype=torch.float32)
61 |
62 | BLOCK_SIZE_IN = 128
63 | BLOCK_SIZE_OUT = triton.next_power_of_2(out_features)
64 | if BLOCK_SIZE_OUT > 4096:
65 | raise ValueError("Too many output features for this kernel implementation")
66 |
67 | grid = (batch_size,)
68 | fused_linear_xentropy_forward[grid](
69 | input_ptr=input,
70 | weight_ptr=weight,
71 | bias_ptr=bias,
72 | target_ptr=target,
73 | loss_ptr=loss,
74 | batch_size=batch_size,
75 | in_features=in_features,
76 | out_features=out_features,
77 | stride_input_batch=input.stride(0),
78 | stride_input_feature=input.stride(1),
79 | stride_weight_out=weight.stride(0),
80 | stride_weight_in=weight.stride(1),
81 | stride_bias_out=bias.stride(0),
82 | BLOCK_SIZE_IN=BLOCK_SIZE_IN,
83 | BLOCK_SIZE_OUT=BLOCK_SIZE_OUT,
84 | )
85 | return loss
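A minimal usage sketch (not part of the original file), comparing the fused loss against a plain linear layer followed by F.cross_entropy; the shapes and scales here are arbitrary assumptions:

# hypothetical usage example for fused_linear_cross_entropy() defined above
if __name__ == "__main__":
    batch, d_in, d_out = 32, 512, 100
    x = torch.randn(batch, d_in, device="cuda")
    w = torch.randn(d_out, d_in, device="cuda") * 0.02
    b = torch.zeros(d_out, device="cuda")
    t = torch.randint(0, d_out, (batch,), device="cuda")
    loss = fused_linear_cross_entropy(x, w, b, t)
    ref = torch.nn.functional.cross_entropy(x @ w.T + b, t)
    print("fused:", loss.item(), "reference:", ref.item())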
--------------------------------------------------------------------------------
/day58/layer_norm.cpp:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | __global__ void layer_norm_kernel(
4 | float* output,
5 | const float* input,
6 | const float* gamma,
7 | const float* beta,
8 | int batch_size,
9 | int hidden_size,
10 | float epsilon)
11 | {
12 | extern __shared__ float shared[];
13 | int batch_idx = blockIdx.x;
14 | int tid = threadIdx.x;
15 |
16 | if (batch_idx >= batch_size) return;
17 |
18 | float* sum = shared;
19 | float* sum_sq = &shared[blockDim.x];
20 |
21 | float thread_sum = 0.0f;
22 | for (int i = tid; i < hidden_size; i += blockDim.x) {
23 | float val = input[batch_idx * hidden_size + i];
24 | thread_sum += val;
25 | }
26 | sum[tid] = thread_sum;
27 | __syncthreads();
28 |
29 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
30 | if (tid < stride) {
31 | sum[tid] += sum[tid + stride];
32 | }
33 | __syncthreads();
34 | }
35 | float mean = sum[0] / hidden_size;
36 |
37 | float thread_sum_sq = 0.0f;
38 | for (int i = tid; i < hidden_size; i += blockDim.x) {
39 | float val = input[batch_idx * hidden_size + i];
40 | float diff = val - mean;
41 | thread_sum_sq += diff * diff;
42 | }
43 | sum_sq[tid] = thread_sum_sq;
44 | __syncthreads();
45 |
46 | for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
47 | if (tid < stride) {
48 | sum_sq[tid] += sum_sq[tid + stride];
49 | }
50 | __syncthreads();
51 | }
52 | float variance = sum_sq[0] / hidden_size + epsilon;
53 | float inv_std = rsqrtf(variance);
54 |
55 | for (int i = tid; i < hidden_size; i += blockDim.x) {
56 | float val = input[batch_idx * hidden_size + i];
57 | float normalized = (val - mean) * inv_std;
58 | output[batch_idx * hidden_size + i] = normalized * gamma[i] + beta[i];
59 | }
60 | }
61 |
62 | void layer_norm_hip(
63 | float* output,
64 | const float* input,
65 | const float* gamma,
66 | const float* beta,
67 | int batch_size,
68 | int hidden_size,
69 | float epsilon,
70 | hipStream_t stream)
71 | {
72 | dim3 blocks(batch_size);
73 | dim3 threads(256);
74 | size_t shared_mem = 2 * threads.x * sizeof(float);
75 |
76 | hipLaunchKernelGGL(
77 | layer_norm_kernel,
78 | blocks, threads, shared_mem, stream,
79 | output, input, gamma, beta,
80 | batch_size, hidden_size, epsilon
81 | );
82 | }
--------------------------------------------------------------------------------
/day60/fused.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def _fused_skip_act_norm_dropout_kernel(
7 | input_ptr, skip_ptr, output_ptr,
8 | weight_ptr, bias_ptr,
9 | M, N,
10 | stride_input, stride_skip, stride_output,
11 | dropout_p, seed,
12 | eps,
13 | is_training,
14 | BLOCK_SIZE: tl.constexpr,
15 | ):
16 | pid = tl.program_id(0)
17 |
18 | offsets = pid * stride_input + tl.arange(0, BLOCK_SIZE)
19 | mask = tl.arange(0, BLOCK_SIZE) < N
20 |
21 | input = tl.load(input_ptr + offsets, mask=mask, other=0.0)
22 | skip = tl.load(skip_ptr + offsets, mask=mask, other=0.0)
23 |
24 | summed = input + skip
25 |
26 | mean = tl.sum(summed, axis=0) / N
27 | centered = summed - mean
28 | var = tl.sum(centered * centered, axis=0) / N
29 | inv_std = 1.0 / tl.sqrt(var + eps)
30 |
31 | normalized = centered * inv_std
32 |
33 | if weight_ptr is not None:
34 | weight = tl.load(weight_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=1.0)
35 | normalized *= weight
36 | if bias_ptr is not None:
37 | bias = tl.load(bias_ptr + tl.arange(0, BLOCK_SIZE), mask=mask, other=0.0)
38 | normalized += bias
39 |
40 | gelu = normalized * 0.5 * (1.0 + tl.erf(normalized / tl.sqrt(2.0)))
41 |
42 | if is_training:
43 | dropout_mask = tl.rand(seed, offsets) > dropout_p
44 | gelu = tl.where(dropout_mask, gelu / (1 - dropout_p), 0.0)
45 |
46 | tl.store(output_ptr + offsets, gelu, mask=mask)
47 |
48 | class FusedSkipNormActDropout(torch.autograd.Function):
49 | @staticmethod
50 | def forward(ctx, input, skip, weight, bias, p, training, eps):
51 | assert input.shape == skip.shape
52 | M, N = input.shape
53 | output = torch.empty_like(input)
54 |
55 | BLOCK_SIZE = triton.next_power_of_2(N)
56 |
57 | seed = torch.randint(0, 2**31, (1,)).item()
58 |
59 | grid = (M,)
60 | _fused_skip_act_norm_dropout_kernel[grid](
61 | input, skip, output,
62 | weight if weight is not None else None,
63 | bias if bias is not None else None,
64 | M, N,
65 | input.stride(0), skip.stride(0), output.stride(0),
66 | dropout_p=p,
67 | seed=seed,
68 | eps=eps,
69 | is_training=training,
70 | BLOCK_SIZE=BLOCK_SIZE,
71 | )
72 |
73 | ctx.training = training
74 | ctx.p = p
75 | ctx.eps = eps
76 | ctx.save_for_backward(input, skip, weight, bias, output)
77 |
78 | return output
79 |
80 | @staticmethod
81 | def backward(ctx, grad_output):
82 | raise NotImplementedError("Backward not implemented for this fused operation")
83 |
84 | def fused_skip_norm_act_dropout(
85 | input: torch.Tensor,
86 | skip: torch.Tensor,
87 | weight: torch.Tensor = None,
88 | bias: torch.Tensor = None,
89 | p: float = 0.5,
90 | training: bool = False,
91 | eps: float = 1e-5
92 | ) -> torch.Tensor:
93 | return FusedSkipNormActDropout.apply(input, skip, weight, bias, p, training, eps)
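A minimal forward-only usage sketch (not part of the original file); the backward pass above deliberately raises NotImplementedError, so this only exercises the forward. Shapes and dropout probability are arbitrary assumptions:

# hypothetical usage example for fused_skip_norm_act_dropout() defined above
if __name__ == "__main__":
    x = torch.randn(64, 1024, device="cuda")
    skip = torch.randn(64, 1024, device="cuda")
    weight = torch.ones(1024, device="cuda")
    bias = torch.zeros(1024, device="cuda")
    out = fused_skip_norm_act_dropout(x, skip, weight, bias, p=0.1, training=True)
    print(out.shape, out.mean().item())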
--------------------------------------------------------------------------------
/day64/main.py:
--------------------------------------------------------------------------------
1 | import torch, time
2 | import torch.nn.functional as F
3 | import triton
4 | import triton.language as tl
5 |
6 | @triton.jit
7 | def geglu_kernel(input_ptr, output_ptr, numel: tl.constexpr, D: tl.constexpr, BLOCK_SIZE: tl.constexpr):
8 | pid = tl.program_id(0)
9 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
10 | mask = offsets < numel
11 |
12 | row = offsets // D
13 | col = offsets % D
14 | base_offset = row * (2 * D)
15 | x = tl.load(input_ptr + base_offset + col, mask=mask)
16 | gate = tl.load(input_ptr + base_offset + D + col, mask=mask)
17 |
18 | t = 0.7978845608 * (gate + 0.044715 * gate * gate * gate)
19 | exp_2t = tl.exp(2 * t)
20 | tanh_t = (exp_2t - 1.0) / (exp_2t + 1.0)
21 | gelu_gate = 0.5 * gate * (1.0 + tanh_t)
22 | out = x * gelu_gate
23 |
24 | tl.store(output_ptr + offsets, out, mask=mask)
25 |
26 | def fused_geglu(input_tensor):
27 | N, twoD = input_tensor.shape
28 | D = twoD // 2
29 | output = torch.empty((N, D), device=input_tensor.device, dtype=input_tensor.dtype)
30 | numel = N * D
31 | BLOCK_SIZE = 256
32 | grid = lambda meta: ((numel + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],)
33 | geglu_kernel[grid](input_tensor, output, numel, D, BLOCK_SIZE)
34 | return output
35 |
36 | def torch_geglu(input_tensor):
37 | x, gate = input_tensor.chunk(2, dim=-1)
38 | return x * F.gelu(gate)
39 |
40 | input_tensor = torch.randn(8192, 8192, device='cuda')
41 |
42 | _ = fused_geglu(input_tensor)
43 | _ = torch_geglu(input_tensor)
44 |
45 | torch.cuda.synchronize()
46 | start = time.time()
47 | for _ in range(100):
48 | _ = fused_geglu(input_tensor)
49 | torch.cuda.synchronize()
50 | fused_time = time.time() - start
51 |
52 | torch.cuda.synchronize()
53 | start = time.time()
54 | for _ in range(100):
55 | _ = torch_geglu(input_tensor)
56 | torch.cuda.synchronize()
57 | torch_time = time.time() - start
58 |
59 | print("Fused Triton kernel time: {:.6f} sec".format(fused_time))
60 | print("Torch baseline time: {:.6f} sec".format(torch_time))
61 |
--------------------------------------------------------------------------------
/day67/lora.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def lora_kernel(
7 | y_ptr, x_ptr, w_ptr, a_ptr, b_ptr,
8 | M, N, K, R,
9 | stride_ym, stride_yn,
10 | stride_xm, stride_xk,
11 | stride_wk, stride_wn,
12 | stride_ak, stride_ar,
13 | stride_br, stride_bn,
14 | BLOCK_SIZE_M: tl.constexpr,
15 | BLOCK_SIZE_N: tl.constexpr,
16 | BLOCK_SIZE_K: tl.constexpr,
17 | BLOCK_SIZE_R: tl.constexpr,
18 | ):
19 | pid = tl.program_id(0)
20 | num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
21 | num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
22 | pid_m = pid // num_pid_n
23 | pid_n = pid % num_pid_n
24 |
25 | offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
26 | offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
27 | offs_k = tl.arange(0, BLOCK_SIZE_K)
28 | offs_r = tl.arange(0, BLOCK_SIZE_R)
29 |
30 | y_ptrs = y_ptr + offs_m[:, None] * stride_ym + offs_n[None, :] * stride_yn
31 | mask_m = (offs_m < M)[:, None]
32 | mask_n = (offs_n < N)[None, :]
33 | mask_y = mask_m & mask_n
34 |
35 | acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
36 |
37 | for k in range(0, K, BLOCK_SIZE_K):
38 | x_ptrs = x_ptr + offs_m[:, None] * stride_xm + (k + offs_k)[None, :] * stride_xk
39 | mask_x = (offs_m < M)[:, None] & ((k + offs_k) < K)[None, :]
40 | x = tl.load(x_ptrs, mask=mask_x, other=0.0)
41 |
42 | w_ptrs = w_ptr + (k + offs_k)[:, None] * stride_wk + offs_n[None, :] * stride_wn
43 | mask_w = ((k + offs_k) < K)[:, None] & (offs_n < N)[None, :]
44 | w = tl.load(w_ptrs, mask=mask_w, other=0.0)
45 |
46 | ab = tl.zeros((BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=tl.float32)
47 | for r in range(0, R, BLOCK_SIZE_R):
48 | a_ptrs = a_ptr + (k + offs_k)[:, None] * stride_ak + (r + offs_r)[None, :] * stride_ar
49 | mask_a = ((k + offs_k) < K)[:, None] & ((r + offs_r) < R)[None, :]
50 | a = tl.load(a_ptrs, mask=mask_a, other=0.0)
51 |
52 | b_ptrs = b_ptr + (r + offs_r)[:, None] * stride_br + offs_n[None, :] * stride_bn
53 | mask_b = ((r + offs_r) < R)[:, None] & (offs_n < N)[None, :]
54 | b = tl.load(b_ptrs, mask=mask_b, other=0.0)
55 |
56 | ab += tl.dot(a.to(tl.float32), b.to(tl.float32))
57 |
58 | w_eff = w.to(tl.float32) + ab
59 | acc += tl.dot(x.to(tl.float32), w_eff)
60 |
61 | tl.store(y_ptrs, acc.to(tl.float16), mask=mask_y)
62 |
63 | def lora_matmul(x, W, A, B):
64 | M, K = x.shape
65 | _, N = W.shape
66 | R = A.shape[1]
67 | y = torch.empty((M, N), device=x.device, dtype=x.dtype)
68 |
69 | BLOCK_SIZE_M = 64
70 | BLOCK_SIZE_N = 64
71 | BLOCK_SIZE_K = 32
72 | BLOCK_SIZE_R = 32
73 |
74 | grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N, meta['BLOCK_SIZE_N']),)
75 |
76 | lora_kernel[grid](
77 | y, x, W, A, B,
78 | M, N, K, R,
79 | y.stride(0), y.stride(1),
80 | x.stride(0), x.stride(1),
81 | W.stride(0), W.stride(1),
82 | A.stride(0), A.stride(1),
83 | B.stride(0), B.stride(1),
84 | BLOCK_SIZE_M=BLOCK_SIZE_M,
85 | BLOCK_SIZE_N=BLOCK_SIZE_N,
86 | BLOCK_SIZE_K=BLOCK_SIZE_K,
87 | BLOCK_SIZE_R=BLOCK_SIZE_R,
88 | )
89 | return y
90 |
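A minimal usage sketch (not part of the original file), checking the fused LoRA matmul y = x (W + A B) against the plain torch expression; sizes and scales are arbitrary assumptions:

# hypothetical usage example for lora_matmul() defined above
if __name__ == "__main__":
    M, K, N, R = 256, 512, 512, 16
    x = torch.randn(M, K, device="cuda", dtype=torch.float16)
    W = torch.randn(K, N, device="cuda", dtype=torch.float16) * 0.02
    A = torch.randn(K, R, device="cuda", dtype=torch.float16) * 0.02
    B = torch.randn(R, N, device="cuda", dtype=torch.float16) * 0.02
    y = lora_matmul(x, W, A, B)
    ref = x @ (W + A @ B)
    print("max abs error:", (y - ref).abs().max().item())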
--------------------------------------------------------------------------------
/day68/adam.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def adam_fp8_kernel(
7 | param_ptr, grad_ptr, m_ptr, v_ptr,
8 | lr, beta1, beta2, eps, step,
9 | n_elements,
10 | BLOCK_SIZE: tl.constexpr
11 | ):
12 | pid = tl.program_id(axis=0)
13 | offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
14 | mask = offset < n_elements  # tl.numel() does not exist; the element count is passed in from the host
15 |
16 | param = tl.load(param_ptr + offset, mask=mask, other=0.0).to(tl.float16)
17 | grad = tl.load(grad_ptr + offset, mask=mask, other=0.0).to(tl.float16)
18 | m = tl.load(m_ptr + offset, mask=mask, other=0.0).to(tl.float16)
19 | v = tl.load(v_ptr + offset, mask=mask, other=0.0).to(tl.float16)
20 |
21 | m_new = beta1 * m + (1 - beta1) * grad
22 | v_new = beta2 * v + (1 - beta2) * grad * grad
23 | m_hat = m_new / (1 - beta1 ** step)
24 | v_hat = v_new / (1 - beta2 ** step)
25 | update = m_hat / (tl.sqrt(v_hat) + eps)
26 | param_new = param - lr * update
27 |
28 | param_new_fp8 = param_new.to(tl.float8e4nv)  # FP8 E4M3; the dtype name varies across Triton versions
29 | m_new_fp8 = m_new.to(tl.float8e4nv)
30 | v_new_fp8 = v_new.to(tl.float8e4nv)
31 |
32 | tl.store(param_ptr + offset, param_new_fp8, mask=mask)
33 | tl.store(m_ptr + offset, m_new_fp8, mask=mask)
34 | tl.store(v_ptr + offset, v_new_fp8, mask=mask)
35 |
36 | def adam_fp8(param, grad, m, v, lr, beta1=0.9, beta2=0.999, eps=1e-8, step=1):
37 | BLOCK_SIZE = 1024
38 | n = param.numel()
39 | grid = lambda meta: (triton.cdiv(n, meta['BLOCK_SIZE']),)
40 | adam_fp8_kernel[grid](
41 | param, grad, m, v, lr,
42 | beta1, beta2, eps, step, n,
43 | BLOCK_SIZE=BLOCK_SIZE
44 | )
45 |
--------------------------------------------------------------------------------
/day69/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def reduce_kernel(K, V, A_ptr, b_ptr, N: tl.constexpr, D: tl.constexpr):
7 | d = tl.arange(0, D)
8 | acc_A = tl.zeros([D, D], dtype=tl.float32)
9 | acc_b = tl.zeros([D], dtype=tl.float32)
10 | for j in range(N):
11 | k = tl.load(K + j * D + d)
12 | v = tl.load(V + j * D + d)
13 | k_phi = tl.maximum(k, 0.0) + 1.0  # feature map phi(k) = relu(k) + 1
14 | acc_A += k_phi[:, None] * v[None, :]  # accumulate the outer product phi(k) v^T
15 | acc_b += k_phi
16 | tl.store(A_ptr + d[:, None] * D + d[None, :], acc_A)
17 | tl.store(b_ptr + d, acc_b)
18 |
19 | @triton.jit
20 | def attention_kernel(Q, A_ptr, b_ptr, Out, N: tl.constexpr, D: tl.constexpr):
21 | pid = tl.program_id(0)
22 | d = tl.arange(0, D)
23 | q_phi = tl.maximum(tl.load(Q + pid * D + d), 0.0) + 1.0
24 | A = tl.load(A_ptr + d[:, None] * D + d[None, :])
25 | out_vec = tl.sum(q_phi[:, None] * A, axis=0)  # q_phi^T A
26 | denom = tl.sum(q_phi * tl.load(b_ptr + d))
27 | tl.store(Out + pid * D + d, out_vec / denom)
31 |
32 | def linear_attention(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
33 | assert Q.is_cuda and K.is_cuda and V.is_cuda, "Input tensors must be on CUDA"
34 | N, D = Q.shape
35 | A = torch.empty((D, D), device='cuda', dtype=torch.float32)
36 | b = torch.empty((D,), device='cuda', dtype=torch.float32)
37 | reduce_kernel[(1,)](K, V, A, b, N, D)
38 | Out = torch.empty_like(Q)
39 | attention_kernel[(N,)](Q, A, b, Out, N, D)
40 | return Out
41 |
42 | if __name__ == "__main__":
43 | N = 1024
44 | D = 64
45 | Q = torch.randn((N, D), device='cuda', dtype=torch.float32)
46 | K = torch.randn((N, D), device='cuda', dtype=torch.float32)
47 | V = torch.randn((N, D), device='cuda', dtype=torch.float32)
48 | Out = linear_attention(Q, K, V)
49 | print("Output shape:", Out.shape)
50 |
--------------------------------------------------------------------------------
/day72/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def sgd_kernel(
7 | param_ptr,
8 | grad_ptr,
9 | momentum_ptr,
10 | lr,
11 | weight_decay,
12 | momentum_factor,
13 | dampening,
14 | nesterov,
15 | n_elements,
16 | BLOCK_SIZE: tl.constexpr,
17 | ):
18 | pid = tl.program_id(axis=0)
19 | block_start = pid * BLOCK_SIZE
20 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
21 | mask = offsets < n_elements
22 | params = tl.load(param_ptr + offsets, mask=mask)
23 | grads = tl.load(grad_ptr + offsets, mask=mask)
24 | if weight_decay != 0.0:
25 | grads = grads + weight_decay * params
26 | if momentum_factor != 0.0:
27 | momentum_buf = tl.load(momentum_ptr + offsets, mask=mask)
28 | momentum_buf = momentum_factor * momentum_buf + (1.0 - dampening) * grads
29 | tl.store(momentum_ptr + offsets, momentum_buf, mask=mask)
30 | if nesterov:
31 | grads = grads + momentum_factor * momentum_buf
32 | else:
33 | grads = momentum_buf
34 | params = params - lr * grads
35 | tl.store(param_ptr + offsets, params, mask=mask)
36 |
37 | def sgd_update(
38 | params,
39 | grads,
40 | momentum_buffer=None,
41 | lr=0.01,
42 | weight_decay=0.0,
43 | momentum=0.0,
44 | dampening=0.0,
45 | nesterov=False,
46 | ):
47 | n_elements = params.numel()
48 | if momentum != 0.0 and momentum_buffer is None:
49 | momentum_buffer = torch.zeros_like(params)
50 | BLOCK_SIZE = 1024
51 | grid = (n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE
52 | sgd_kernel[(grid,)](
53 | params,
54 | grads,
55 | momentum_buffer if momentum_buffer is not None else params,  # dummy pointer, never read when momentum == 0
56 | lr,
57 | weight_decay,
58 | momentum,
59 | dampening,
60 | 1 if nesterov else 0,
61 | n_elements,
62 | BLOCK_SIZE,
63 | )
64 | return params, momentum_buffer
65 |
66 | def example():
67 | params = torch.randn(10000, device='cuda')
68 | grads = torch.randn(10000, device='cuda')
69 | momentum_buffer = torch.zeros_like(params)
70 | updated_params, updated_momentum = sgd_update(
71 | params,
72 | grads,
73 | momentum_buffer,
74 | lr=0.01,
75 | weight_decay=0.0001,
76 | momentum=0.9,
77 | nesterov=True
78 | )
79 | print(f"Updated {params.shape} parameters using Triton SGD kernel")
80 |
81 | if __name__ == "__main__":
82 | example()
83 |
--------------------------------------------------------------------------------
/day73/code.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import triton.language as tl
3 | import numpy as np
4 | import torch  # Triton kernels need GPU tensors; the numpy arrays are staged through torch below
5 | @triton.jit
6 | def ddim_step_kernel(
7 | x_ptr,
8 | eps_ptr,
9 | out_ptr,
10 | alpha_t: tl.constexpr,
11 | alpha_t_prev: tl.constexpr,
12 | n_elements: tl.constexpr,
13 | BLOCK_SIZE: tl.constexpr = 1024
14 | ):
15 | pid = tl.program_id(0)
16 | offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
17 | mask = offsets < n_elements
18 |
19 | x = tl.load(x_ptr + offsets, mask=mask)
20 | eps = tl.load(eps_ptr + offsets, mask=mask)
21 |
22 | sqrt_alpha_t = tl.sqrt(alpha_t)
23 | sqrt_one_minus_alpha_t = tl.sqrt(1 - alpha_t)
24 | sqrt_alpha_t_prev = tl.sqrt(alpha_t_prev)
25 | sqrt_one_minus_alpha_t_prev = tl.sqrt(1 - alpha_t_prev)
26 |
27 | x0 = (x - sqrt_one_minus_alpha_t * eps) / sqrt_alpha_t
28 | new_x = sqrt_alpha_t_prev * x0 + sqrt_one_minus_alpha_t_prev * eps
29 |
30 | tl.store(out_ptr + offsets, new_x, mask=mask)
31 |
32 | def ddim_sampling_step(x: np.ndarray, eps: np.ndarray, alpha_t: float, alpha_t_prev: float):
33 | # stage the numpy arrays through torch so the kernel receives real device pointers
34 | x_t = torch.from_numpy(np.ascontiguousarray(x.astype(np.float32))).cuda()
35 | eps_t = torch.from_numpy(np.ascontiguousarray(eps.astype(np.float32))).cuda()
36 | out_t = torch.empty_like(x_t)
37 |
38 | n_elements = x_t.numel()
39 | grid = (triton.cdiv(n_elements, 1024),)
40 | ddim_step_kernel[grid](
41 | x_ptr=x_t,
42 | eps_ptr=eps_t,
43 | out_ptr=out_t,
44 | alpha_t=alpha_t,
45 | alpha_t_prev=alpha_t_prev,
46 | n_elements=n_elements,
47 | BLOCK_SIZE=1024
48 | )
49 | return out_t.cpu().numpy()
50 |
51 | if __name__ == '__main__':
52 | N = 4096
53 | x = np.random.randn(N).astype(np.float32)
54 | eps = np.random.randn(N).astype(np.float32)
55 | alpha_t = 0.9
56 | alpha_t_prev = 0.85
57 |
58 | x_prev = ddim_sampling_step(x, eps, alpha_t, alpha_t_prev)
59 | print("Updated sample:", x_prev)
60 |
--------------------------------------------------------------------------------
/day74/kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def relu_device_fn(x):
7 | return tl.maximum(0.0, x)
8 |
9 | @triton.jit
10 | def swish_device_fn(x):
11 | return x * tl.sigmoid(x)
12 |
13 | @triton.jit
14 | def gelu_device_fn(x):
15 | return 0.5 * x * (1.0 + tl.tanh(0.7978845608 * (x + 0.044715 * x * x * x)))
16 |
17 | def create_activation_kernel(device_fn):
18 | # autotune over BLOCK_SIZE at the kernel level; the tuner supplies the constexpr at each launch
19 | @triton.autotune(
20 | configs=[
21 | triton.Config({'BLOCK_SIZE': 128}),
22 | triton.Config({'BLOCK_SIZE': 256}),
23 | triton.Config({'BLOCK_SIZE': 512}),
24 | triton.Config({'BLOCK_SIZE': 1024}),
25 | ],
26 | key=['n_elements'],
27 | )
28 | @triton.jit
29 | def kernel(x_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
30 | pid = tl.program_id(axis=0)
31 | block_start = pid * BLOCK_SIZE
32 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
33 | mask = offsets < n_elements
34 | x = tl.load(x_ptr + offsets, mask=mask)
35 | output = device_fn(x)
36 | tl.store(output_ptr + offsets, output, mask=mask)
37 | return kernel
38 |
39 | def create_activation_function(kernel, name):
40 | def activation_fn(x):
41 | n_elements = x.numel()
42 | output = torch.empty_like(x)
43 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
44 | # pass the tensors themselves (not raw data_ptr ints); BLOCK_SIZE is chosen by the autotuner
45 | kernel[grid](x, output, n_elements)
46 | return output
47 | activation_fn.__name__ = name
48 | return activation_fn
51 |
52 | relu_kernel = create_activation_kernel(relu_device_fn)
53 | swish_kernel = create_activation_kernel(swish_device_fn)
54 | gelu_kernel = create_activation_kernel(gelu_device_fn)
55 |
56 | relu = create_activation_function(relu_kernel, "relu")
57 | swish = create_activation_function(swish_kernel, "swish")
58 | gelu = create_activation_function(gelu_kernel, "gelu")
59 |
60 | def example():
61 | x = torch.randn(1024, 1024, device='cuda')
62 | y_relu = relu(x)
63 | y_swish = swish(x)
64 | y_gelu = gelu(x)
65 | print(f"Input shape: {x.shape}")
66 | print(f"ReLU output shape: {y_relu.shape}")
67 | print(f"Swish output shape: {y_swish.shape}")
68 | print(f"GELU output shape: {y_gelu.shape}")
69 | torch_relu = torch.nn.functional.relu(x)
70 | torch_gelu = torch.nn.functional.gelu(x)
71 | torch_swish = torch.nn.functional.silu(x)
72 | print(f"ReLU max error: {(y_relu - torch_relu).abs().max().item()}")
73 | print(f"Swish max error: {(y_swish - torch_swish).abs().max().item()}")
74 | print(f"GELU max error: {(y_gelu - torch_gelu).abs().max().item()}")
75 |
76 | if __name__ == "__main__":
77 | example()
78 |
--------------------------------------------------------------------------------
/day76/kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
  6 | def moe_kernel(
  7 |     input_ptr,
  8 |     gate_weight_ptr,
  9 |     experts_ptr,
 10 |     output_ptr,
 11 |     num_tokens,
 12 |     hidden_size,
 13 |     num_experts,
 14 |     top_k,
 15 |     input_token_stride,
 16 |     input_hidden_stride,
 17 |     expert_stride,
 18 |     expert_in_stride,
 19 |     expert_out_stride,
 20 |     BLOCK_SIZE: tl.constexpr,
 21 |     BLOCK_E: tl.constexpr,
 22 | ):
 23 |     token_idx = tl.program_id(0)
 24 |     if token_idx >= num_tokens:
 25 |         return
 26 |
 27 |     hid = tl.arange(0, BLOCK_SIZE)
 28 |     hid_mask = hid < hidden_size
 29 |     x = tl.load(input_ptr + token_idx * input_token_stride + hid * input_hidden_stride,
 30 |                 mask=hid_mask, other=0.0)
 31 |
 32 |     # Gating: logits[e] = dot(x, gate[:, e]) for a contiguous (hidden_size, num_experts) gate.
 33 |     e = tl.arange(0, BLOCK_E)
 34 |     e_mask = e < num_experts
 35 |     gate_w = tl.load(gate_weight_ptr + hid[:, None] * num_experts + e[None, :],
 36 |                      mask=hid_mask[:, None] & e_mask[None, :], other=0.0)
 37 |     gate_logits = tl.sum(x[:, None] * gate_w, axis=0)
 38 |     gate_logits = tl.where(e_mask, gate_logits, float('-inf'))
 39 |
 40 |     max_logit = tl.max(gate_logits, axis=0)
 41 |     exp_logits = tl.exp(gate_logits - max_logit)
 42 |     probs = exp_logits / tl.sum(exp_logits, axis=0)
 43 |
 44 |     # Top-k routing: repeatedly take the best remaining expert and mask it out.
 45 |     output = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
 46 |     for _ in range(top_k):
 47 |         weight = tl.max(probs, axis=0)
 48 |         expert_idx = tl.argmax(probs, axis=0)
 49 |
 50 |         # expert_output = x @ W_e, with W_e of shape (hidden_size, hidden_size).
 51 |         w = tl.load(experts_ptr + expert_idx * expert_stride
 52 |                     + hid[:, None] * expert_in_stride + hid[None, :] * expert_out_stride,
 53 |                     mask=hid_mask[:, None] & hid_mask[None, :], other=0.0)
 54 |         expert_output = tl.sum(x[:, None] * w, axis=0)
 55 |
 56 |         output += weight * expert_output
 57 |         probs = tl.where(e == expert_idx, float('-inf'), probs)
 58 |
 59 |     tl.store(output_ptr + token_idx * input_token_stride + hid * input_hidden_stride,
 60 |              output, mask=hid_mask)
 61 |
 62 | def moe_layer(input: torch.Tensor, gate: torch.Tensor, experts: torch.Tensor, top_k: int):
 63 |     # input: (num_tokens, hidden), gate: (hidden, num_experts), experts: (num_experts, hidden, hidden)
 64 |     num_tokens, hidden_size = input.shape
 65 |     num_experts = gate.size(1)
 66 |     assert num_experts >= top_k, "Number of experts must be >= top_k"
 67 |     gate = gate.contiguous()
 68 |     output = torch.empty_like(input)
 69 |
 70 |     # The kernel keeps a whole token (and one expert matrix) in registers,
 71 |     # so the hidden dimension has to fit in a single block.
 72 |     BLOCK_SIZE = triton.next_power_of_2(hidden_size)
 73 |     BLOCK_E = triton.next_power_of_2(num_experts)
 74 |
 75 |     moe_kernel[(num_tokens,)](
 76 |         input_ptr=input,
 77 |         gate_weight_ptr=gate,
 78 |         experts_ptr=experts,
 79 |         output_ptr=output,
 80 |         num_tokens=num_tokens,
 81 |         hidden_size=hidden_size,
 82 |         num_experts=num_experts,
 83 |         top_k=top_k,
 84 |         input_token_stride=input.stride(0),
 85 |         input_hidden_stride=input.stride(1),
 86 |         expert_stride=experts.stride(0),
 87 |         expert_in_stride=experts.stride(1),
 88 |         expert_out_stride=experts.stride(2),
 89 |         BLOCK_SIZE=BLOCK_SIZE,
 90 |         BLOCK_E=BLOCK_E,
 91 |     )
 92 |     return output
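
A quick smoke test for the launcher above (the shapes and values are illustrative assumptions, not from the original file):

if __name__ == '__main__':
    # Hypothetical example: 16 tokens, hidden size 64, 8 experts, route each token to the top 2.
    torch.manual_seed(0)
    tokens = torch.randn(16, 64, device='cuda', dtype=torch.float32)
    gate = torch.randn(64, 8, device='cuda', dtype=torch.float32)          # (hidden, num_experts)
    experts = torch.randn(8, 64, 64, device='cuda', dtype=torch.float32)   # (num_experts, hidden, hidden)
    out = moe_layer(tokens, gate, experts, top_k=2)
    print("MoE output shape:", out.shape)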
--------------------------------------------------------------------------------
/day77/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 | import time
5 |
6 | # A simplified RetNet kernel: a decaying cumulative sum.
7 | # Given an input sequence x and a decay factor alpha,
8 | # it computes: y[0] = x[0] and for i>0, y[i] = x[i] + alpha * y[i-1]
9 | # Note: This kernel assumes the sequence length (N) is known at compile-time.
10 | @triton.jit
11 | def retnet_kernel(x_ptr, y_ptr, N: tl.constexpr, alpha: tl.constexpr):
12 | # we use a single program (grid = (1,)) to process the full sequence sequentially.
13 | acc = tl.zeros([1], dtype=tl.float32)
14 | # Process each element in sequence.
15 | for i in range(N):
16 | # Load the i-th element from input.
17 | x_val = tl.load(x_ptr + i)
18 | # Compute the recurrent relation.
19 | acc = x_val + alpha * acc
20 | # Store the result.
21 | tl.store(y_ptr + i, acc)
22 |
23 | # A CPU reference implementation for testing correctness and timing.
24 | def retnet_cpu(x, alpha):
25 | y = torch.empty_like(x)
26 | acc = 0.0
27 | for i in range(x.shape[0]):
28 | acc = x[i].item() + alpha * acc
29 | y[i] = acc
30 | return y
31 |
32 | def main():
33 | # Parameters
34 | N = 1024 # Sequence length (must match the kernel compile-time constant)
35 | alpha = 0.9
36 | # Create a random input tensor on the GPU.
37 | x = torch.randn(N, device='cuda', dtype=torch.float32)
38 | y = torch.empty_like(x)
39 |
40 | # Define a grid that launches one program instance (since the kernel is sequential).
41 | grid = lambda meta: (1,)
42 |
43 | # Warm-up: launch the kernel once to compile and warm up.
44 | retnet_kernel[grid](x, y, N, alpha)
45 | torch.cuda.synchronize()
46 |
47 | # Time the Triton kernel using CUDA events.
48 | start_event = torch.cuda.Event(enable_timing=True)
49 | end_event = torch.cuda.Event(enable_timing=True)
50 | start_event.record()
51 | retnet_kernel[grid](x, y, N, alpha)
52 | end_event.record()
53 | torch.cuda.synchronize()
54 | triton_time = start_event.elapsed_time(end_event) # milliseconds
55 |
56 | # Run the CPU version for comparison.
57 | x_cpu = x.cpu()
58 | start = time.time()
59 | y_cpu = retnet_cpu(x_cpu, alpha)
60 | cpu_time = (time.time() - start) * 1000 # convert to ms
61 |
62 | # Verify correctness.
63 | y_ref = y_cpu.to(device='cuda')
64 | if torch.allclose(y, y_ref, atol=1e-5):
65 | print("Results match.")
66 | else:
67 | print("Results differ!")
68 |
69 | print("Triton kernel time (ms):", triton_time)
70 | print("CPU cumulative sum time (ms):", cpu_time)
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/day79/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def quantize_kernel(input_ptr, output_ptr, n_elements, scale, BLOCK_SIZE: tl.constexpr):
7 |
8 | pid = tl.program_id(0)
9 | block_start = pid * BLOCK_SIZE
10 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
11 |
12 | mask = offsets < n_elements
13 |
14 | x = tl.load(input_ptr + offsets, mask=mask)
15 |
16 | x_scaled = x * scale
17 |
18 | x_rounded = tl.round(x_scaled)
19 |
 20 |     x_clamped = tl.maximum(tl.minimum(x_rounded, 127.0), -128.0)
21 |
22 | tl.store(output_ptr + offsets, tl.cast(x_clamped, tl.int8), mask=mask)
23 |
24 | def quantize(input_tensor, scale):
25 |
26 | assert input_tensor.is_cuda, "Input tensor must be on a CUDA device"
27 | n_elements = input_tensor.numel()
28 | output_tensor = torch.empty_like(input_tensor, dtype=torch.int8)
29 |
30 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
31 | quantize_kernel[grid](input_tensor, output_tensor, n_elements, scale, BLOCK_SIZE=1024)
32 |
33 | return output_tensor
34 |
35 | if __name__ == '__main__':
36 |
37 | input_tensor = torch.randn(1024 * 1024, device='cuda', dtype=torch.float32)
38 | scale = 127.0
39 | output_tensor = quantize(input_tensor, scale)
40 | print("Quantization complete. Output tensor:")
41 | print(output_tensor)
42 |
--------------------------------------------------------------------------------
/day80/kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 |
5 | @triton.jit
6 | def rwkv_kernel(
7 | output_ptr,
8 | k_ptr,
9 | v_ptr,
10 | w_ptr,
11 | n_time: tl.constexpr,
12 | n_channels: tl.constexpr,
13 | stride_time: tl.constexpr,
14 | stride_batch: tl.constexpr
15 | ):
16 | pid = tl.program_id(0)
17 | batch = pid // n_channels
18 | channel = pid % n_channels
19 |
20 | w = tl.load(w_ptr + channel)
21 |
22 | max_val = -1e30
23 | numerator = 0.0
24 | denominator = 0.0
25 |
26 | for t in range(n_time):
27 | offset = batch * stride_batch + t * stride_time + channel
28 |
29 | cur_k = tl.load(k_ptr + offset)
30 | cur_v = tl.load(v_ptr + offset)
31 |
32 | m = tl.maximum(max_val, cur_k)
33 |
34 | exp_max_diff = tl.exp(max_val - m)
35 | exp_k_diff = tl.exp(cur_k - m)
36 |
37 | numerator = numerator * exp_max_diff + cur_v * exp_k_diff
38 | denominator = denominator * exp_max_diff + exp_k_diff
39 |
40 | result = numerator / denominator
41 | tl.store(output_ptr + offset, result)
42 |
43 | max_val = m + w
44 |
45 | def rwkv_forward(k: torch.Tensor, v: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
46 | assert k.is_cuda and v.is_cuda and w.is_cuda, "All tensors must be on CUDA."
47 | B, T, C = k.shape
48 |
49 | output = torch.empty_like(v)
50 |
51 | stride_time = k.stride(1)
52 | stride_batch = k.stride(0)
53 |
54 | grid = (B * C,)
55 |
56 | rwkv_kernel[grid](
57 | output_ptr=output,
58 | k_ptr=k,
59 | v_ptr=v,
60 | w_ptr=w,
61 | n_time=T,
62 | n_channels=C,
63 | stride_time=stride_time,
64 | stride_batch=stride_batch,
65 | )
66 | return output
67 |
68 | if __name__ == '__main__':
69 | B = 2 # batch size
70 | T = 128 # sequence length
71 | C = 256 # number of channels
72 |
73 | k_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32)
74 | v_tensor = torch.randn(B, T, C, device='cuda', dtype=torch.float32)
75 | w_tensor = torch.randn(C, device='cuda', dtype=torch.float32) * 0.1
76 |
77 | output_tensor = rwkv_forward(k_tensor, v_tensor, w_tensor)
78 | print("Output shape:", output_tensor.shape)
79 | print("Output sample:", output_tensor[0, :5, :5])
80 |
--------------------------------------------------------------------------------
/day81/main.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import triton.language as tl
3 |
4 | @triton.jit
5 | def fused_layernorm_ff_dropout_kernel(
6 | x_ptr, out_ptr,
7 | gamma_ptr, beta_ptr,
8 | weight1_ptr, bias1_ptr,
9 | weight2_ptr, bias2_ptr,
10 | seed,
11 | dropout_p: tl.constexpr,
12 | N: tl.constexpr,
13 | M: tl.constexpr,
14 | BLOCK: tl.constexpr
15 | ):
16 | row_idx = tl.program_id(0)
17 | row_offset = row_idx * N
18 |
19 | x = tl.load(x_ptr + row_offset + tl.arange(0, N))
20 | mean = tl.sum(x, axis=0) / N
21 | diff = x - mean
22 | var = tl.sum(diff * diff, axis=0) / N
23 | norm = diff * tl.rsqrt(var + 1e-5)
24 |
25 | gamma = tl.load(gamma_ptr + tl.arange(0, N))
26 | beta = tl.load(beta_ptr + tl.arange(0, N))
27 | norm = norm * gamma + beta
28 |
 29 |     # hidden = norm @ W1, with W1 stored row-major as (N, M). A 1-D register tile
 30 |     # cannot be indexed with a dynamic range, so load W1 as a 2-D block and reduce over N.
 31 |     weight1 = tl.load(weight1_ptr + tl.arange(0, N)[:, None] * M + tl.arange(0, M)[None, :])
 32 |     bias1 = tl.load(bias1_ptr + tl.arange(0, M))
 33 |     hidden = tl.sum(norm[:, None] * weight1, axis=0) + bias1
 34 |
 35 |     SQRT_2_OVER_PI = 0.7978845608028654
 36 |     gelu_hidden = 0.5 * hidden * (1.0 + tl.tanh(SQRT_2_OVER_PI * (hidden + 0.044715 * hidden * hidden * hidden)))
 37 |
 38 |     # Cheap per-element LCG for the dropout mask (illustrative, not a high-quality RNG).
 39 |     prng = tl.arange(0, M) + row_idx * M + seed
 40 |     rand_vals = ((1103515245 * prng + 12345) & 0x7fffffff) / 2147483647.0
 41 |     keep = rand_vals > dropout_p
 42 |     dropout_scale = 1.0 / (1.0 - dropout_p)
 43 |     dropped = tl.where(keep, gelu_hidden * dropout_scale, 0.0)
 44 |
 45 |     # out = dropped @ W2, with W2 stored row-major as (M, N).
 46 |     weight2 = tl.load(weight2_ptr + tl.arange(0, M)[:, None] * N + tl.arange(0, N)[None, :])
 47 |     bias2 = tl.load(bias2_ptr + tl.arange(0, N))
 48 |     out = tl.sum(dropped[:, None] * weight2, axis=0) + bias2
 49 |
 50 |     tl.store(out_ptr + row_offset + tl.arange(0, N), out)
65 |
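The file stops at the kernel, so here is a minimal host-side launcher sketch (not part of the original file); it assumes fp32 CUDA tensors, contiguous row-major weights of shape (N, M) and (M, N), and that N and M are powers of two so tl.arange can cover a full row.

import torch

def fused_layernorm_ff_dropout(x, gamma, beta, w1, b1, w2, b2, dropout_p=0.1, seed=0):
    # x: (rows, N); one program instance handles one row end to end.
    rows, N = x.shape
    M = w1.shape[1]
    out = torch.empty_like(x)
    fused_layernorm_ff_dropout_kernel[(rows,)](
        x, out, gamma, beta, w1, b1, w2, b2, seed,
        dropout_p=dropout_p, N=N, M=M,
        BLOCK=64,  # required by the signature, not used by the kernel body
    )
    return out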
--------------------------------------------------------------------------------
/day82/rope.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 | import math
5 |
6 | @triton.jit
7 | def rope_kernel(q_ptr, cos_ptr, sin_ptr, stride_q0, stride_q1, stride_cos0, stride_cos1, seq_len: tl.constexpr, head_half: tl.constexpr, BLOCK_SEQ: tl.constexpr, BLOCK_HD: tl.constexpr):
8 |
9 | pid_seq = tl.program_id(0)
10 | pid_hd = tl.program_id(1)
11 |
12 | seq_offset = pid_seq * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
13 | hd_offset = pid_hd * BLOCK_HD + tl.arange(0, BLOCK_HD)
14 |
15 | mask_seq = seq_offset < seq_len
16 | mask_hd = hd_offset < head_half
17 |
18 | q_ptrs = q_ptr + seq_offset[:, None] * stride_q0 + hd_offset[None, :] * (2 * stride_q1)
19 |
20 | q0 = tl.load(q_ptrs, mask=mask_seq[:, None] & mask_hd[None, :])
21 | q1 = tl.load(q_ptrs + stride_q1, mask=mask_seq[:, None] & mask_hd[None, :])
22 |
23 | cos_ptrs = cos_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1
24 | sin_ptrs = sin_ptr + seq_offset[:, None] * stride_cos0 + hd_offset[None, :] * stride_cos1
25 |
26 | cos_val = tl.load(cos_ptrs, mask=mask_seq[:, None] & mask_hd[None, :])
27 | sin_val = tl.load(sin_ptrs, mask=mask_seq[:, None] & mask_hd[None, :])
28 |
29 | out0 = q0 * cos_val - q1 * sin_val
30 | out1 = q0 * sin_val + q1 * cos_val
31 |
32 | tl.store(q_ptrs, out0, mask=mask_seq[:, None] & mask_hd[None, :])
33 | tl.store(q_ptrs + stride_q1, out1, mask=mask_seq[:, None] & mask_hd[None, :])
34 |
35 | def apply_rope(q, cos, sin, BLOCK_SEQ=64, BLOCK_HD=32):
36 |
37 | seq_len, head_dim = q.shape
38 | assert head_dim % 2 == 0
39 | head_half = head_dim // 2
40 |
41 | grid = ((seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ, (head_half + BLOCK_HD - 1) // BLOCK_HD)
42 |
43 | q_contig = q.contiguous()
44 |
45 | rope_kernel[grid](q_contig, cos, sin, q_contig.stride(0), q_contig.stride(1), cos.stride(0), cos.stride(1), seq_len, head_half, BLOCK_SEQ, BLOCK_HD)
46 | return q_contig
47 |
48 | if __name__ == "__main__":
49 | torch.manual_seed(0)
50 | device = 'cuda'
51 |
52 | seq_len = 128
53 | head_dim = 64
54 |
55 | q = torch.randn(seq_len, head_dim, device=device, dtype=torch.float32)
56 |
57 | positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
58 | dim_idx = torch.arange(head_dim // 2, device=device, dtype=torch.float32).unsqueeze(0)
59 | inv_freq = 1.0 / (10000 ** (dim_idx / (head_dim // 2)))
60 | theta = positions * inv_freq
61 |
62 | cos = torch.cos(theta)
63 | sin = torch.sin(theta)
64 |
65 | q_transformed = apply_rope(q, cos, sin)
66 | print("Transformed q:")
67 | print(q_transformed)
68 |
--------------------------------------------------------------------------------
/day84/kernel.py:
--------------------------------------------------------------------------------
1 | import triton
2 | import triton.language as tl
3 | import torch
4 |
5 | @triton.jit
6 | def fp8_gemm_kernel(
7 | a_ptr, b_ptr, c_ptr,
8 | M, N, K,
9 | stride_am, stride_ak,
10 | stride_bk, stride_bn,
11 | stride_cm, stride_cn,
12 | scale_a, scale_b, scale_c,
13 | BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr
14 | ):
15 | pid_m = tl.program_id(0)
16 | pid_n = tl.program_id(1)
17 |
18 | rm = tl.arange(0, BLOCK_M)
19 | rn = tl.arange(0, BLOCK_N)
20 | offm = pid_m * BLOCK_M + rm
21 | offn = pid_n * BLOCK_N + rn
22 |
23 | acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
24 |
25 | for k in range(0, K, BLOCK_K):
26 | offk = k + tl.arange(0, BLOCK_K)
27 |
28 | a = tl.load(
29 | a_ptr + offm[:, None] * stride_am + offk[None, :] * stride_ak,
30 | mask=(offm[:, None] < M) & (offk[None, :] < K),
31 | other=0,
32 | )
33 | b = tl.load(
34 | b_ptr + offk[:, None] * stride_bk + offn[None, :] * stride_bn,
35 | mask=(offk[:, None] < K) & (offn[None, :] < N),
36 | other=0,
37 | )
38 |
39 | a_fp32 = tl.cast(a, tl.float32) * scale_a
40 | b_fp32 = tl.cast(b, tl.float32) * scale_b
41 |
42 | acc += tl.dot(a_fp32, b_fp32)
43 |
44 | c_fp8 = tl.round(acc / scale_c)
 45 |     c_fp8 = tl.maximum(tl.minimum(c_fp8, 127.0), -128.0)
46 |
47 | tl.store(
48 | c_ptr + offm[:, None] * stride_cm + offn[None, :] * stride_cn,
49 | c_fp8.to(tl.int8),
50 | mask=(offm[:, None] < M) & (offn[None, :] < N)
51 | )
52 |
53 | def fp8_gemm(a: torch.Tensor, b: torch.Tensor,
54 | scale_a: float, scale_b: float, scale_c: float,
55 | BLOCK_M: int = 64, BLOCK_N: int = 64, BLOCK_K: int = 32) -> torch.Tensor:
56 | assert a.dtype == torch.int8 and b.dtype == torch.int8
57 | M, K = a.shape
58 | K2, N = b.shape
59 | assert K == K2
60 |
61 | c = torch.empty((M, N), device=a.device, dtype=torch.int8)
62 |
63 | grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N))
64 |
65 | fp8_gemm_kernel[grid](
66 | a, b, c,
67 | M, N, K,
68 | a.stride(0), a.stride(1),
69 | b.stride(0), b.stride(1),
70 | c.stride(0), c.stride(1),
71 | scale_a, scale_b, scale_c,
72 | BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K
73 | )
74 | return c
75 |
76 | if __name__ == "__main__":
77 | torch.manual_seed(0)
78 | M, K, N = 128, 256, 64
79 |
80 | a_fp8 = torch.randint(-128, 127, (M, K), device='cuda', dtype=torch.int8)
81 | b_fp8 = torch.randint(-128, 127, (K, N), device='cuda', dtype=torch.int8)
82 |
83 | scale_a, scale_b, scale_c = 0.1, 0.1, 0.05
84 |
85 | c_fp8 = fp8_gemm(a_fp8, b_fp8, scale_a, scale_b, scale_c)
86 | print("GEMM result (FP8 stored as int8):", c_fp8)
87 |
--------------------------------------------------------------------------------
/day85/TensorMatMul.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void tensorMatrixMultKernel(
5 | const float* A,
6 | const float* B,
7 | float* C,
8 | size_t B_dim,
9 | size_t I_dim,
10 | size_t J_dim,
11 | size_t L_dim,
12 | size_t K_dim
13 | ) {
14 |
15 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
16 |
17 |
18 | int total_elements = B_dim * I_dim * J_dim * K_dim;
19 | if (idx < total_elements) {
20 | int k = idx % K_dim;
21 | int j = (idx / K_dim) % J_dim;
22 | int i = (idx / (K_dim * J_dim)) % I_dim;
23 | int b = idx / (K_dim * J_dim * I_dim);
24 |
25 |
26 | size_t c_idx = ((b * I_dim + i) * J_dim + j) * K_dim + k;
27 |
28 |
29 | float sum = 0.0f;
30 |
31 |
32 | size_t a_base = ((b * I_dim + i) * J_dim + j) * L_dim;
33 |
34 | for (int l = 0; l < L_dim; l++) {
35 | sum += A[a_base + l] * B[l * K_dim + k];
36 | }
37 |
38 | C[c_idx] = sum;
39 | }
40 | }
41 |
42 | extern "C" void solution(const float* A, const float* B, float* C, size_t b, size_t i, size_t j, size_t l, size_t k) {
43 |
44 | size_t total_elements = b * i * j * k;
45 |
46 |
47 | int threadsPerBlock = 256;
48 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
49 |
50 |
 51 |     tensorMatrixMultKernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, b, i, j, l, k);
52 | }
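
The file has no test harness. One way to exercise solution() from Python is through ctypes, assuming the file has been built as a shared object (e.g. nvcc -O2 -shared -Xcompiler -fPIC TensorMatMul.cu -o tensormatmul.so; the library name is my assumption):

import ctypes
import torch

lib = ctypes.CDLL("./tensormatmul.so")  # assumed build artifact
lib.solution.argtypes = [ctypes.c_void_p] * 3 + [ctypes.c_size_t] * 5

b, i, j, l, k = 2, 3, 4, 5, 6
A = torch.randn(b, i, j, l, device='cuda', dtype=torch.float32)
B = torch.randn(l, k, device='cuda', dtype=torch.float32)
C = torch.empty(b, i, j, k, device='cuda', dtype=torch.float32)

lib.solution(A.data_ptr(), B.data_ptr(), C.data_ptr(), b, i, j, l, k)
torch.cuda.synchronize()
print("max abs error:", (C - torch.einsum('bijl,lk->bijk', A, B)).abs().max().item())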
--------------------------------------------------------------------------------
/day86/hard_sigmoid.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cstddef>
3 |
4 |
5 | __global__ void hard_sigmoid_kernel(const float* input, float* output, size_t total_elements) {
6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
7 | if (idx >= total_elements) return;
8 |
9 | float x = input[idx];
10 | if (x <= -3.0f)
11 | output[idx] = 0.0f;
12 | else if (x >= 3.0f)
13 | output[idx] = 1.0f;
14 | else
15 | output[idx] = (x + 3.0f) / 6.0f;
16 | }
17 |
18 |
19 | extern "C" void solution(const float* input, float* output, size_t n, size_t m) {
20 |
21 | size_t total_elements = n * m;
22 |
23 | const int threadsPerBlock = 256;
24 | int blocksPerGrid = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
25 |
 26 |     hard_sigmoid_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, total_elements);
27 |
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/day87/SymMatMul.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
2 |
3 | #define BLOCK_SIZE 16
4 |
5 | __global__ void matrixMulKernel(const float* A, const float* B, float* C, size_t n) {
6 | size_t row = blockIdx.y * blockDim.y + threadIdx.y;
7 | size_t col = blockIdx.x * blockDim.x + threadIdx.x;
8 |
9 | if (row < n && col < n) {
10 | float sum = 0.0f;
11 | for (size_t k = 0; k < n; k++) {
12 | sum += A[row * n + k] * B[k * n + col];
13 | }
14 | C[row * n + col] = sum;
15 | }
16 | }
17 |
18 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) {
19 | dim3 block(BLOCK_SIZE, BLOCK_SIZE);
20 | dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
21 |
 22 |     matrixMulKernel<<<grid, block>>>(input_a, input_b, output_c, n);
23 |
24 |
25 | cudaDeviceSynchronize();
26 | }
27 |
--------------------------------------------------------------------------------
/day88/MSE.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cstddef>
3 |
4 | __global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) {
5 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
6 | if (idx < numElements) {
7 | float diff = predictions[idx] - targets[idx];
8 | float sq_diff = diff * diff;
9 |
10 | atomicAdd(sum, sq_diff);
11 | }
12 | }
13 |
14 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t* shape, size_t ndim) {
15 |
16 | size_t* hostShape = new size_t[ndim];
17 | cudaMemcpy(hostShape, shape, ndim * sizeof(size_t), cudaMemcpyDeviceToHost);
18 |
19 | size_t numElements = 1;
20 | for (size_t i = 0; i < ndim; i++) {
21 | numElements *= hostShape[i];
22 | }
23 | delete[] hostShape;
24 |
25 |
26 | float init = 0.0f;
27 | cudaMemcpy(output, &init, sizeof(float), cudaMemcpyHostToDevice);
28 |
29 |
30 | int threadsPerBlock = 256;
31 | int blocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 32 |     mseKernel<<<blocks, threadsPerBlock>>>(predictions, targets, numElements, output);
33 | cudaDeviceSynchronize();
34 |
35 | float hostSum = 0.0f;
36 | cudaMemcpy(&hostSum, output, sizeof(float), cudaMemcpyDeviceToHost);
37 |
38 | float mse = hostSum / numElements;
39 |
40 | cudaMemcpy(output, &mse, sizeof(float), cudaMemcpyHostToDevice);
41 | }
42 |
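Note that solution() copies shape back with cudaMemcpyDeviceToHost, so the shape array itself has to live on the GPU. A ctypes sketch of a caller (the mse.so name is an assumption; 64-bit size_t matches torch.int64):

import ctypes
import torch

lib = ctypes.CDLL("./mse.so")  # assumed build artifact
lib.solution.argtypes = [ctypes.c_void_p] * 4 + [ctypes.c_size_t]

pred = torch.randn(64, 128, device='cuda', dtype=torch.float32)
target = torch.randn(64, 128, device='cuda', dtype=torch.float32)
out = torch.zeros(1, device='cuda', dtype=torch.float32)
shape = torch.tensor(list(pred.shape), device='cuda', dtype=torch.int64)  # device-side shape array

lib.solution(pred.data_ptr(), target.data_ptr(), out.data_ptr(), shape.data_ptr(), pred.dim())
print("CUDA MSE:", out.item(), "| torch MSE:", torch.mean((pred - target) ** 2).item())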
--------------------------------------------------------------------------------
/day89/LTMM.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cstddef>
3 |
4 | #define BLOCK_SIZE 16
5 |
6 |
7 | __global__
8 | void lowerTriangularMultiplyKernel(const float* A, const float* B, float* C, size_t n) {
9 | int row = blockIdx.y * blockDim.y + threadIdx.y;
10 | int col = blockIdx.x * blockDim.x + threadIdx.x;
11 |
12 | if (row < n && col < n) {
13 | if (col > row) {
14 | C[row * n + col] = 0.0f;
15 | } else {
16 | float sum = 0.0f;
17 | for (int k = col; k <= row; k++) {
18 | sum += A[row * n + k] * B[k * n + col];
19 | }
20 | C[row * n + col] = sum;
21 | }
22 | }
23 | }
24 |
25 | extern "C" void solution(const float* input_a, const float* input_b, float* output_c, size_t n) {
26 | dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
27 | dim3 gridDim((n + blockDim.x - 1) / blockDim.x,
28 | (n + blockDim.y - 1) / blockDim.y);
29 |
 30 |     lowerTriangularMultiplyKernel<<<gridDim, blockDim>>>(input_a, input_b, output_c, n);
31 |
32 | cudaDeviceSynchronize();
33 | }
34 |
--------------------------------------------------------------------------------
/day90/FrobeniusNorm.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void calculateSumOfSquares(const float* X, float* partialSums, size_t size) {
5 | extern __shared__ float sharedData[];
6 |
7 |
8 | unsigned int tid = threadIdx.x;
9 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
10 |
11 |
12 | sharedData[tid] = 0.0f;
13 |
14 |
15 | while (i < size) {
16 | sharedData[tid] += X[i] * X[i];
17 | i += blockDim.x * gridDim.x;
18 | }
19 |
20 |
21 | __syncthreads();
22 |
23 |
24 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
25 | if (tid < s) {
26 | sharedData[tid] += sharedData[tid + s];
27 | }
28 | __syncthreads();
29 | }
30 |
31 |
32 | if (tid == 0) {
33 | partialSums[blockIdx.x] = sharedData[0];
34 | }
35 | }
36 |
37 |
38 | __global__ void normalizeByFrobeniusNorm(const float* X, float* Y, size_t size, float frobeniusNorm) {
39 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
40 |
41 | if (i < size) {
42 | Y[i] = X[i] / frobeniusNorm;
43 | }
44 | }
45 |
46 | extern "C" void solution(const float* X, float* Y, size_t size) {
47 |
48 | int blockSize = 256;
49 | int gridSize = (size + blockSize - 1) / blockSize;
50 | int maxBlocks = 1024;
51 |
52 | if (gridSize > maxBlocks) {
53 | gridSize = maxBlocks;
54 | }
55 |
56 | float* d_partialSums;
57 | cudaMalloc(&d_partialSums, gridSize * sizeof(float));
58 |
 59 |     calculateSumOfSquares<<<gridSize, blockSize, blockSize * sizeof(float)>>>(X, d_partialSums, size);
60 |
61 | float* h_partialSums = new float[gridSize];
62 | cudaMemcpy(h_partialSums, d_partialSums, gridSize * sizeof(float), cudaMemcpyDeviceToHost);
63 |
64 | float sumOfSquares = 0.0f;
65 | for (int i = 0; i < gridSize; i++) {
66 | sumOfSquares += h_partialSums[i];
67 | }
68 |
69 | float frobeniusNorm = sqrt(sumOfSquares);
70 |
71 | if (frobeniusNorm < 1e-10) {
72 | frobeniusNorm = 1.0f;
73 | }
74 |
75 | normalizeByFrobeniusNorm<<<(size + blockSize - 1) / blockSize, blockSize>>>(X, Y, size, frobeniusNorm);
76 |
77 | delete[] h_partialSums;
78 | cudaFree(d_partialSums);
79 | }
--------------------------------------------------------------------------------
/day91/Hinge_Loss.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
2 |
3 |
4 | __global__ void hingeKernel(const float* predictions, const float* targets, float* output, size_t n) {
5 |
6 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if (idx < n) {
9 | float prod = predictions[idx] * targets[idx];
10 | output[idx] = fmaxf(0.0f, 1.0f - prod);
11 | }
12 | }
13 |
14 |
15 | extern "C" void solution(const float* predictions, const float* targets, float* output, size_t n) {
16 | // I found this to be the best configuration for the kernel (h100)
17 | const int blockSize = 256;
18 | const int gridSize = (n + blockSize - 1) / blockSize;
19 |
 20 |     hingeKernel<<<gridSize, blockSize>>>(predictions, targets, output, n);
21 |
22 |
23 | }
--------------------------------------------------------------------------------
/day92/1D_Convolution.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
2 |
3 | __global__
4 | void conv1d(const float* A,
5 | const float* B,
6 | float* C,
7 | size_t N,
8 | size_t K)
9 | {
10 | size_t i = blockIdx.x * blockDim.x + threadIdx.x;
11 | int radius = int(K/2);
12 |
13 | if (i < N) {
14 | float sum = 0.0f;
15 | for (int j = 0; j < int(K); ++j) {
16 | int idx = int(i) + j - radius;
17 | if (idx >= 0 && idx < int(N)) {
18 | sum += A[idx] * B[j];
19 | }
20 | }
21 | C[i] = sum;
22 | }
23 | }
24 |
25 | extern "C"
26 | void solution(const float* A,
27 | const float* B,
28 | float* C,
29 | size_t N,
30 | size_t K)
31 | {
32 | int threads = 1024;
33 | int blocks = int((N + threads - 1) / threads);
34 |
 35 |     conv1d<<<blocks, threads>>>(A, B, C, N, K);
36 | }
37 |
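For reference, the kernel computes a zero-padded "same" cross-correlation (no kernel flip), which for odd K matches torch.nn.functional.conv1d with padding=K//2:

import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 10)
w = torch.randn(1, 1, 5)
ref = F.conv1d(x, w, padding=5 // 2)  # same semantics: C[i] = sum_j A[i + j - K//2] * B[j]
print(ref.shape)  # torch.Size([1, 1, 10])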
--------------------------------------------------------------------------------
/day93/RMS_Normalization.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cstddef>
3 |
4 | #define EPSILON 1e-5f
5 |
6 | __global__ void compute_rms(const float* X, float* rms, size_t B, size_t N) {
7 | extern __shared__ float sdata[];
8 | size_t row = blockIdx.x;
9 | size_t tid = threadIdx.x;
10 | const float* row_ptr = X + row * N;
11 |
12 | float sum = 0.0f;
13 | for (size_t i = tid; i < N; i += blockDim.x) {
14 | float v = row_ptr[i];
15 | sum += v * v;
16 | }
17 | sdata[tid] = sum;
18 | __syncthreads();
19 |
20 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
21 | if (tid < s) {
22 | sdata[tid] += sdata[tid + s];
23 | }
24 | __syncthreads();
25 | }
26 |
27 | if (tid == 0) {
 28 |         float mean_sq = sdata[0] / static_cast<float>(N);
29 | rms[row] = sqrtf(mean_sq + EPSILON);
30 | }
31 | }
32 |
33 | __global__ void normalize_rms(const float* X, float* Y, const float* rms, size_t B, size_t N) {
34 | size_t row = blockIdx.x;
35 | size_t tid = threadIdx.x;
36 | float r = rms[row];
37 | const float* row_in = X + row * N;
38 | float* row_out = Y + row * N;
39 |
40 | for (size_t i = tid; i < N; i += blockDim.x) {
41 | row_out[i] = row_in[i] / r;
42 | }
43 | }
44 |
45 | extern "C" void solution(const float* X, float* Y, size_t B, size_t N) {
46 | int threads = (N < 256) ? int(N) : 256;
47 | size_t shared_mem_size = threads * sizeof(float);
48 |
49 | float* d_rms = nullptr;
50 | cudaMalloc(&d_rms, B * sizeof(float));
51 |
 52 |     compute_rms<<<B, threads, shared_mem_size>>>(X, d_rms, B, N);
53 |
 54 |     normalize_rms<<<B, threads>>>(X, Y, d_rms, B, N);
55 |
56 | cudaFree(d_rms);
57 | }
58 |
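For reference, the two kernels together compute plain RMS normalization with the epsilon inside the square root and no learned gain; in PyTorch terms:

import torch

def rms_norm_ref(x, eps=1e-5):
    # Matches compute_rms + normalize_rms: y = x / sqrt(mean(x**2) + eps), row-wise.
    return x / torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)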
--------------------------------------------------------------------------------
/day94/ELU.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1y33/100Days/1d8f606efc4f48d15be6a607faae1440ad37ab85/day94/ELU.cu
--------------------------------------------------------------------------------
/day95/2D_Max_Pooling.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cfloat>   // for FLT_MAX
  3 | #include <cstddef>  // for size_t
4 |
5 |
6 | __global__
7 | void maxpool2d_kernel(const float* __restrict__ input,
8 | int H, int W,
9 | int kernel_size, int stride, int padding, int dilation,
10 | int H_out, int W_out,
11 | float* __restrict__ output)
12 | {
13 |
14 | int out_y = blockIdx.y * blockDim.y + threadIdx.y;
15 | int out_x = blockIdx.x * blockDim.x + threadIdx.x;
16 | if (out_y >= H_out || out_x >= W_out) return;
17 |
18 |
19 | float max_val = -FLT_MAX;
20 | for (int m = 0; m < kernel_size; ++m) {
21 | int in_y = out_y * stride + m * dilation - padding;
22 | for (int n = 0; n < kernel_size; ++n) {
23 | int in_x = out_x * stride + n * dilation - padding;
24 |
25 | if (in_y >= 0 && in_y < H && in_x >= 0 && in_x < W) {
26 | float v = input[in_y * W + in_x];
27 | if (v > max_val) max_val = v;
28 | }
29 | }
30 | }
31 | output[out_y * W_out + out_x] = max_val;
32 | }
33 |
34 |
35 | extern "C"
36 | void solution(const float* input,
37 | int kernel_size,
38 | int stride,
39 | int padding,
40 | int dilation,
41 | float* output,
42 | size_t H,
43 | size_t W)
44 | {
45 |
46 | int H_out = (int)(( (int)H + 2*padding
47 | - dilation*(kernel_size-1)
48 | - 1 ) / stride) + 1;
49 | int W_out = (int)(( (int)W + 2*padding
50 | - dilation*(kernel_size-1)
51 | - 1 ) / stride) + 1;
52 |
53 |
54 | const int Bx = 16, By = 16;
55 | dim3 block(Bx, By);
56 | dim3 grid( (W_out + Bx - 1) / Bx,
57 | (H_out + By - 1) / By );
58 |
 59 |     maxpool2d_kernel<<<grid, block>>>(
60 | input,
61 | (int)H, (int)W,
62 | kernel_size, stride, padding, dilation,
63 | H_out, W_out,
64 | output
65 | );
66 |
67 |
68 | }
69 |
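A quick check of the output-size arithmetic used in solution() (the numbers are illustrative):

def pool_out(size, kernel_size, stride, padding, dilation):
    # Same formula as in solution(): floor((size + 2p - d*(k-1) - 1) / s) + 1
    return (size + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1

print(pool_out(32, 3, 2, 1, 1))  # a 32x32 input with k=3, s=2, p=1, d=1 -> 16x16 output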
--------------------------------------------------------------------------------
/day96/Product_Over_Dimension.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>