├── add.cu ├── add_bench.cu ├── coalescing_offset_bench.cu ├── coalescing_stride_bench.cu ├── const_bench.cu ├── custom_config.yml ├── download_mnist.sh ├── icons ├── like.svg ├── share.svg └── subscribe.svg ├── image.png ├── manim_scripts ├── 0_Introduction.py ├── 10_Memory_Coalescing.py ├── 11_Occupancy.py ├── 1_CPU_vs_GPU.py ├── 2_Grid_Blocks_Threads.py ├── 3_Neural_Network.py ├── 4_Backward_Pass.py ├── 5_PerformanceCharacteristics.py ├── 6_Memory_Hierarchy.py ├── 7_Tiling.py ├── 8_GPU_Architecture.py ├── 9_Constant_Memory.py ├── EndScreen_CE.py ├── FastSoftmax.py ├── HierarchicalTiling.py ├── HierarchicalTiling_CE.py ├── MoE.py ├── NN.py ├── Parallelism.py ├── Presentation.py ├── Quantization.py ├── TensorCores.py ├── TensorCores_CE.py ├── how_to_keep_gpu_happy.py ├── shaders │ └── one_sided │ │ ├── frag.glsl │ │ └── vert.glsl └── voicover_gl.py ├── matmul.cu ├── matmul_bench.cu ├── matvec.cu ├── mnist.cu ├── mnist_optimized.cu ├── utils ├── present.py └── split_video.py └── vectorized_bench.cu /add.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void add(int n , float* a, float* b, float* c) 4 | { 5 | int i = blockIdx.x * blockDim.x + threadIdx.x; 6 | if (i < n) 7 | { 8 | c[i] = a[i] + b[i]; 9 | } 10 | } 11 | 12 | int main() 13 | { 14 | int N = 4096; 15 | int BLOCK_SIZE=256; 16 | float* a = new float[N]; 17 | float* b = new float[N]; 18 | float* c = new float[N]; 19 | for (int i = 0; i>>(N, a_d, b_d, c_d); 36 | 37 | cudaMemcpy(c, c_d, N*sizeof(float), cudaMemcpyDeviceToHost); 38 | 39 | for (int i = 0; i<10; i++) 40 | { 41 | std::cout< 2 | #include 3 | #include 4 | 5 | #define BENCH_STEPS 400 6 | 7 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 8 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 9 | { 10 | if (code != cudaSuccess) 11 | { 12 | fprintf(stderr,"CUDA error %d: %s %s %d\n", code, cudaGetErrorString(code), file, line); 13 | if 
(abort) exit(code); 14 | } 15 | } 16 | 17 | __global__ void add(int n , float* a, float* b, float* c) 18 | { 19 | int i = blockIdx.x * blockDim.x + threadIdx.x; 20 | if (i < n) 21 | { 22 | c[i] = a[i] + b[i]; 23 | } 24 | } 25 | 26 | int main() 27 | { 28 | for (int p = 0; p<25; p++) 29 | { 30 | int N = std::pow(2, p); 31 | int BLOCK_SIZE=1024; 32 | float* a = new float[N]; 33 | float* b = new float[N]; 34 | float* c = new float[N]; 35 | float* c2 = new float[N]; 36 | for (int i = 0; i>>(N, a_d, b_d, c_d); 57 | gpuErrchk(cudaPeekAtLastError()); 58 | gpuErrchk(cudaDeviceSynchronize()); 59 | double final_time = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time).count(); 60 | gpu_time += final_time; 61 | } 62 | 63 | double cpu_time=0.0; 64 | for (int i = 0; i(std::chrono::system_clock::now() - start_time).count(); 72 | cpu_time += final_time; 73 | } 74 | 75 | std::cout<<"p = "< 2 | #include 3 | #include 4 | 5 | #define BLOCK_SIZE 32 6 | #define BENCH_STEPS 4000 7 | #define MAX_OFFSET 129 8 | 9 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 10 | #define ASSERT(cond, msg, args...) 
assert((cond) || !fprintf(stderr, (msg "\n"), args)) 11 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 12 | { 13 | if (code != cudaSuccess) 14 | { 15 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 16 | if (abort) exit(code); 17 | } 18 | } 19 | 20 | void clear_l2() { 21 | // Get actual L2 size via CUDA on first call of this function 22 | static int l2_clear_size = 0; 23 | static unsigned char* gpu_scratch_l2_clear = NULL; 24 | if (!gpu_scratch_l2_clear) { 25 | cudaDeviceGetAttribute(&l2_clear_size, cudaDevAttrL2CacheSize, 0); 26 | l2_clear_size *= 2; // just to be extra safe (cache is not necessarily strict LRU) 27 | gpuErrchk(cudaMalloc(&gpu_scratch_l2_clear, l2_clear_size)); 28 | } 29 | // Clear L2 cache (this is run on every call unlike the above code) 30 | gpuErrchk(cudaMemset(gpu_scratch_l2_clear, 0, l2_clear_size)); 31 | } 32 | 33 | __global__ void copy(int n , float* in, float* out, int offset) 34 | { 35 | unsigned long i = blockIdx.x * blockDim.x + threadIdx.x; 36 | if (i < n) 37 | { 38 | out[i + offset] = in[i + offset]; 39 | } 40 | } 41 | 42 | int main() 43 | { 44 | double timings[MAX_OFFSET]; 45 | float* in_d; 46 | float* out_d; 47 | 48 | long N = std::pow(2, 20); 49 | 50 | for (int o = -1; o>>(N, in_d, out_d, offset); 71 | gpuErrchk(cudaEventRecord(stop)); 72 | gpuErrchk(cudaEventSynchronize(stop)); 73 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 74 | gpuErrchk(cudaPeekAtLastError()); 75 | gpuErrchk(cudaDeviceSynchronize()); 76 | if (i != -1) // one warmup run 77 | { 78 | run_time += time / BENCH_STEPS; 79 | } 80 | } 81 | 82 | timings[offset] = run_time; 83 | gpuErrchk(cudaEventDestroy(start)); 84 | gpuErrchk(cudaEventDestroy(stop)); 85 | } 86 | std::cout<<"timings"<<" = ["; 87 | for (int i = 0; i 2 | #include 3 | #include 4 | 5 | #define BLOCK_SIZE 32 6 | #define BENCH_STEPS 100 7 | #define MAX_STRIDE 15 8 | #define BLOCKS 84*10 9 | 10 | #define gpuErrchk(ans) { 
gpuAssert((ans), __FILE__, __LINE__); } 11 | #define ASSERT(cond, msg, args...) assert((cond) || !fprintf(stderr, (msg "\n"), args)) 12 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 13 | { 14 | if (code != cudaSuccess) 15 | { 16 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 17 | if (abort) exit(code); 18 | } 19 | } 20 | 21 | void clear_l2() { 22 | // Get actual L2 size via CUDA on first call of this function 23 | static int l2_clear_size = 0; 24 | static unsigned char* gpu_scratch_l2_clear = NULL; 25 | if (!gpu_scratch_l2_clear) { 26 | cudaDeviceGetAttribute(&l2_clear_size, cudaDevAttrL2CacheSize, 0); 27 | l2_clear_size *= 2; // just to be extra safe (cache is not necessarily strict LRU) 28 | gpuErrchk(cudaMalloc(&gpu_scratch_l2_clear, l2_clear_size)); 29 | } 30 | // Clear L2 cache (this is run on every call unlike the above code) 31 | gpuErrchk(cudaMemset(gpu_scratch_l2_clear, 0, l2_clear_size)); 32 | } 33 | 34 | __global__ void copy(long n , float* in, float* out, int stride) // strided copy: each thread moves one float at index global_thread_id*stride; n is long (same type as the caller's N) because 2^31 does not fit in a 32-bit int 35 | { 36 | unsigned long i = ((unsigned long)blockIdx.x*blockDim.x + threadIdx.x)*stride; // widen before multiplying so the index is computed in unsigned long rather than 32-bit unsigned 37 | if (i < (unsigned long)n) // bounds check against the element count passed by the caller 38 | { 39 | out[i] = in[i]; 40 | } 41 | else 42 | { 43 | printf("skip load \n"); 44 | } 45 | } 46 | 47 | int main() 48 | { 49 | double timings[MAX_STRIDE+1]; 50 | float* in_d; 51 | float* out_d; 52 | 53 | long N = std::pow(2, 31); 54 | 55 | float* out_h = new float[N]; 56 | float* in_h = new float[N]; 57 | gpuErrchk(cudaMalloc((void**) &out_d, N*sizeof(float))); 58 | gpuErrchk(cudaMalloc((void**) &in_d, N*sizeof(float))); 59 | for (int s = -1; s<=MAX_STRIDE; s++) 60 | { 61 | int stride = std::pow(2, std::max(0, s)); 62 | cudaEvent_t start, stop; 63 | gpuErrchk(cudaEventCreate(&start)); 64 | gpuErrchk(cudaEventCreate(&stop)); 65 | 66 | 67 | dim3 dimGrid(BLOCKS, 1, 1); 68 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 69 | 70 | float time = 0.f; 71 | double run_time = 0.0; 72 | for (int i = -1; i>>(N, out_d, in_d, stride); 80 |
gpuErrchk(cudaEventRecord(stop)); 81 | gpuErrchk(cudaEventSynchronize(stop)); 82 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 83 | gpuErrchk(cudaPeekAtLastError()); 84 | gpuErrchk(cudaDeviceSynchronize()); 85 | if (i != -1) // one warmup run 86 | { 87 | run_time += time / BENCH_STEPS; 88 | } 89 | } 90 | 91 | std::cout<= 0) 93 | { 94 | timings[s] = run_time; 95 | } 96 | gpuErrchk(cudaEventDestroy(start)); 97 | gpuErrchk(cudaEventDestroy(stop)); 98 | } 99 | std::cout<<"timings"<<" = ["; 100 | for (int i = 0; i<=MAX_STRIDE; i++) 101 | { 102 | std::cout< 2 | #include 3 | #include 4 | 5 | #define BLOCK_SIZE 1024 6 | #define CONST_SIZE 16384 7 | #define BENCH_STEPS 1000 8 | #define TIMINGS 14 9 | #define START 10 10 | #define ACCESSES 10 11 | 12 | #define access (threadIdx.x * dist) % CONST_SIZE 13 | 14 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 15 | #define ASSERT(cond, msg, args...) assert((cond) || !fprintf(stderr, (msg "\n"), args)) 16 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 17 | { 18 | if (code != cudaSuccess) 19 | { 20 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 21 | if (abort) exit(code); 22 | } 23 | } 24 | 25 | void clear_l2() { 26 | // Get actual L2 size via CUDA on first call of this function 27 | static int l2_clear_size = 0; 28 | static unsigned char* gpu_scratch_l2_clear = NULL; 29 | if (!gpu_scratch_l2_clear) { 30 | cudaDeviceGetAttribute(&l2_clear_size, cudaDevAttrL2CacheSize, 0); 31 | l2_clear_size *= 2; // just to be extra safe (cache is not necessarily strict LRU) 32 | gpuErrchk(cudaMalloc(&gpu_scratch_l2_clear, l2_clear_size)); 33 | } 34 | // Clear L2 cache (this is run on every call unlike the above code) 35 | gpuErrchk(cudaMemset(gpu_scratch_l2_clear, 0, l2_clear_size)); 36 | } 37 | 38 | __constant__ float c_mem[CONST_SIZE]; 39 | 40 | __global__ void add(int n , float* a, float* b, float* c, int dist) 41 | { 42 | int i = blockIdx.x * 
blockDim.x + threadIdx.x; 43 | int y = access; 44 | if (i < n-ACCESSES) 45 | { 46 | for(int x = 0; x(2, START+TIMINGS-1); 76 | cudaMalloc((void**) &a_d, max_N*sizeof(float)); 77 | cudaMalloc((void**) &b_d, CONST_SIZE*sizeof(float)); 78 | cudaMalloc((void**) &c_d, max_N*sizeof(float)); 79 | cudaMalloc((void**) &d_d, max_N*sizeof(float)); 80 | 81 | float* cmemset = new float[max_N]; 82 | cudaMemset(a_d, 1, max_N*sizeof(float)); 83 | cudaMemset(b_d, 1, CONST_SIZE*sizeof(float)); 84 | memset(cmemset, 1, CONST_SIZE*sizeof(float)); 85 | cudaMemcpyToSymbol(c_mem, cmemset, CONST_SIZE*sizeof(float)); 86 | cudaMemset(d_d, 1, max_N*sizeof(float)); 87 | 88 | for (int distance = 1; distance<17; distance++) 89 | { 90 | for (int p = START; p(2, p); 97 | 98 | dim3 dimGrid(ceil(N/(float)BLOCK_SIZE), 1, 1); 99 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 100 | 101 | double add_time=0.0; 102 | for (int i = -1; i>>(N, a_d, b_d, c_d, distance); 108 | gpuErrchk(cudaEventRecord(stop)); 109 | gpuErrchk(cudaEventSynchronize(stop)); 110 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 111 | gpuErrchk(cudaPeekAtLastError()); 112 | gpuErrchk(cudaDeviceSynchronize()); 113 | if (i != -1) // one warmup run 114 | { 115 | add_time += time / BENCH_STEPS; 116 | } 117 | } 118 | 119 | double const_time=0.0; 120 | for (int i = -1; i>>(N, a_d, d_d, distance); 126 | gpuErrchk(cudaEventRecord(stop)); 127 | gpuErrchk(cudaEventSynchronize(stop)); 128 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 129 | gpuErrchk(cudaPeekAtLastError()); 130 | gpuErrchk(cudaDeviceSynchronize()); 131 | if (i != -1) // one warmup run 132 | { 133 | const_time += time / BENCH_STEPS; 134 | } 135 | } 136 | 137 | mt[p-START] = add_time; 138 | tt[p-START] = const_time; 139 | gpuErrchk(cudaEventDestroy(start)); 140 | gpuErrchk(cudaEventDestroy(stop)); 141 | } 142 | float* c_h = new float[max_N]; 143 | float* d_h = new float[max_N]; 144 | cudaMemcpy(c_h, c_d, max_N*sizeof(float), cudaMemcpyDeviceToHost); 145 | cudaMemcpy(d_h, d_d, 
max_N*sizeof(float), cudaMemcpyDeviceToHost); 146 | float tolerance = 1e-6; 147 | for (int i = 0; i < max_N; i++) 148 | { 149 | ASSERT(abs(c_h[i] - d_h[i]) < tolerance, "failed at %d, %f, %f\n", i, c_h[i], d_h[i]); 150 | } 151 | std::cout<<"ratio"< -------------------------------------------------------------------------------- /icons/share.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /icons/subscribe.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SzymonOzog/GPU_Programming/fd6bdf1b7fafaf9ea96f029648f62ba2766b67f1/image.png -------------------------------------------------------------------------------- /manim_scripts/2_Grid_Blocks_Threads.py: -------------------------------------------------------------------------------- 1 | from manim import * 2 | from manim.mobject.text.text_mobject import remove_invisible_chars 3 | from manim_voiceover import VoiceoverScene 4 | from manim_voiceover.services.recorder import RecorderService 5 | from manim_voiceover.services.gtts import GTTSService 6 | from math import radians, degrees 7 | import random 8 | 9 | class Thread(VGroup): 10 | def __init__( 11 | self, 12 | side_length: float = 2, 13 | fill_opacity: float = 0.75, 14 | fill_color: ParsableManimColor = BLUE, 15 | stroke_width: float = 0, 16 | thread_idx: tuple[int, int, int] = (0,1,2), 17 | font_size = 12, 18 | **kwargs, 19 | ) -> None: 20 | self.side_length = side_length 21 | self.thread_idx = thread_idx 22 | self.visible = False 23 | super().__init__( 24 | fill_color=fill_color, 25 | fill_opacity=fill_opacity, 26 | stroke_width=stroke_width, 27 | **kwargs, 28 | ) 29 | 30 | def generate_points(self) 
-> None: 31 | for vect in reversed([IN, OUT, LEFT, RIGHT, UP, DOWN]): 32 | face = Square( 33 | side_length=self.side_length, 34 | shade_in_3d=True, 35 | ) 36 | face.flip() 37 | face.shift(self.side_length * OUT / 2.0) 38 | face.apply_matrix(z_to_vector(vect)) 39 | 40 | self.add(face) 41 | init_points = generate_points 42 | 43 | texts = [] 44 | 45 | class Block: 46 | def __init__(self, n, center, show_tid=True): 47 | self.threads = [[[None] * n for j in range(n)] for i in range(n)] 48 | current_pos = center.copy() 49 | self.n = n 50 | for x in range(n): 51 | current_pos[1] = center[1] 52 | current_pos += 0.25 * RIGHT 53 | for y in range(n): 54 | current_pos[2] = center[2] 55 | current_pos += 0.25 * DOWN 56 | for z in range(n): 57 | current_pos += 0.25 * IN 58 | t = Thread(side_length=0.25, stroke_width=0.5,fill_opacity=1) 59 | if show_tid: 60 | if z == 0: 61 | texts.append(Text(str(x), font_size=15).move_to(t.get_corner(OUT))) 62 | t.add(texts[-1]) 63 | if y == 0: 64 | texts.append(Text(str(z), font_size=15).move_to(t.get_corner(UP)).rotate(radians(90),LEFT)) 65 | t.add(texts[-1]) 66 | if x == 0: 67 | texts.append(Text(str(y), font_size=15).move_to(t.get_corner(LEFT)).rotate(radians(90),DOWN)) 68 | t.add(texts[-1]) 69 | self.threads[x][y][z] = t.move_to(current_pos) 70 | 71 | def create(self, x_range=1, y_range=1, z_range=1, z_index=0): 72 | anims = [] 73 | for x in range(x_range): 74 | for y in range(y_range): 75 | for z in range(z_range): 76 | if self.threads[x][y][z] is not None and not self.threads[x][y][z].visible: 77 | self.threads[x][y][z].visible=True 78 | self.threads[x][y][z].z_index=z_index 79 | for so in self.threads[x][y][z].submobjects: 80 | so.z_index=z_index 81 | anims.append(Create(self.threads[x][y][z])) 82 | return anims 83 | 84 | class Block2: 85 | def __init__(self, x_r, y_r, z_r, center, show_tid=True): 86 | self.threads = [[[None] * x_r for j in range(y_r)] for i in range(z_r)] 87 | current_pos = center.copy() 88 | self.x_r = x_r 89 | self.y_r 
= y_r 90 | self.z_r = z_r 91 | for x in range(x_r): 92 | current_pos[1] = center[1] 93 | current_pos += 0.25 * RIGHT 94 | for y in range(y_r): 95 | current_pos[2] = center[2] 96 | current_pos += 0.25 * DOWN 97 | for z in range(z_r): 98 | current_pos += 0.25 * IN 99 | t = Thread(side_length=0.25, stroke_width=0.5,fill_opacity=1) 100 | self.threads[x][y][z] = t.move_to(current_pos) 101 | 102 | def create(self, x_range=1, y_range=1, z_range=1, z_index=0): 103 | anims = [] 104 | for x in range(x_range): 105 | for y in range(y_range): 106 | for z in range(z_range): 107 | if self.threads[x][y][z] is not None and not self.threads[x][y][z].visible: 108 | self.threads[x][y][z].visible=True 109 | self.threads[x][y][z].z_index=z_index 110 | for so in self.threads[x][y][z].submobjects: 111 | so.z_index=z_index 112 | anims.append(Create(self.threads[x][y][z])) 113 | return anims 114 | 115 | def rotate(self): 116 | anims = [] 117 | for x in range(self.x_r): 118 | for y in range(self.y_r): 119 | for z in range(self.z_r): 120 | anims.append(Rotate(self.threads[x][y][z], angle = PI/4, axis = UP+RIGHT+OUT, about_point=ORIGIN)) 121 | return anims 122 | 123 | def get_entries(self): 124 | entries = [] 125 | for z in range(self.z_r): 126 | for y in range(self.y_r): 127 | for x in range(self.x_r): 128 | entries.append(self.threads[x][y][z]) 129 | return entries 130 | 131 | class KernelGrid(VoiceoverScene, ThreeDScene): 132 | def construct(self): 133 | self.set_speech_service( 134 | # GTTSService() 135 | RecorderService(trim_buffer_end=50, trim_silence_threshold=-80, transcription_model=None) 136 | ) 137 | 138 | title = Text("GPU programming", font_size=72).shift(2*UP) 139 | with self.voiceover(text="Hello and welcome to episode 2 in the series on GPU programming") as trk: 140 | self.play(Write(title)) 141 | 142 | subtitle = Text("Kernel Grid", font_size=48).next_to(title, DOWN) 143 | 144 | with self.voiceover(text="In this episode, we are going to talk about the kernel grid") as trk: 145 
| self.play(Write(subtitle)) 146 | 147 | self.play(Unwrite(title), Unwrite(subtitle)) 148 | 149 | n = 6 150 | v2 = Matrix([*[[f"b_{i}"] for i in range(n-2)], ["\\vdots"], ["b_n"]], element_alignment_corner=ORIGIN).shift(DOWN) 151 | plus = Tex("+").next_to(v2, LEFT) 152 | v1 = Matrix([*[[f"a_{i}"] for i in range(n-2)], ["\\vdots"], ["a_n"]], element_alignment_corner=ORIGIN).next_to(plus, LEFT) 153 | eq = Tex("=").next_to(v2, RIGHT) 154 | v3 = Matrix([*[["?"] for i in range(n-2)], ["\\vdots"], ["?"]], element_alignment_corner=ORIGIN).next_to(eq, RIGHT) 155 | 156 | fs = 32 157 | block1 = SurroundingRectangle(VGroup(*v1.get_entries()[:2])).shift(1.5*LEFT) 158 | t1 = [] 159 | t1.append(Tex("$T_0$", font_size=fs).move_to(block1.get_corner(UP)+DOWN*0.2, aligned_edge=UP)) 160 | t1.append(Tex("$T_1$", font_size=fs).move_to(block1.get_corner(DOWN)-DOWN*0.2, aligned_edge=DOWN)) 161 | t1.append(Tex("$B_0$", font_size=fs).next_to(block1, LEFT)) 162 | block2 = SurroundingRectangle(VGroup(*v1.get_entries()[2:4])).shift(1.5*LEFT) 163 | t2 = [] 164 | t2.append(Tex("$T_0$", font_size=fs).move_to(block2.get_corner(UP)+DOWN*0.2, aligned_edge=UP)) 165 | t2.append(Tex("$T_1$", font_size=fs).move_to(block2.get_corner(DOWN)-DOWN*0.2, aligned_edge=DOWN)) 166 | t2.append(Tex("$B_1$", font_size=fs).next_to(block2, LEFT)) 167 | 168 | block3 = SurroundingRectangle(VGroup(*v1.get_entries()[4:])).shift(1.5*LEFT) 169 | t3 = [] 170 | t3.append(Tex("$T_0$", font_size=fs).move_to(block3.get_corner(UP)+DOWN*0.2, aligned_edge=UP)) 171 | t3.append(Tex("$T_1$", font_size=fs).move_to(block3.get_corner(DOWN)-DOWN*0.2, aligned_edge=DOWN)) 172 | t3.append(Tex("$B_{\\frac{n}{2}}$", font_size=fs).next_to(block3, LEFT)) 173 | 174 | with self.voiceover(text="""During the last episode we've presented a vector addition kernel, where we launched blocks of 2 threads""") as trk: 175 | self.play(*[Create(x) for x in [v1, v2, v3, plus, eq]]) 176 | self.play(Create(block1), *[Write(t) for t in t1]) 177 | 
self.play(Create(block2), *[Write(t) for t in t2]) 178 | self.play(Create(block3), *[Write(t) for t in t3]) 179 | 180 | m = 3 181 | n = 3 182 | blocks = [[[None] * m for j in range(m)] for i in range(m)] 183 | start_pos = ORIGIN.copy() + 2*(LEFT + UP + OUT) 184 | current_pos = ORIGIN.copy() + 2*(LEFT + UP + OUT) 185 | for x in range(m): 186 | current_pos[1] = start_pos[1] 187 | for y in range(m): 188 | current_pos[2] = start_pos[2] 189 | for z in range(m): 190 | blocks[x][y][z] = Block(n, current_pos, show_tid=z==0 and y>0) 191 | current_pos += 2 * IN 192 | current_pos += 2 * DOWN 193 | current_pos += 2 * RIGHT 194 | 195 | self.play(*[Uncreate(x) for x in [v1, v2, v3, plus, eq]]) 196 | self.play(Uncreate(block1), *[Unwrite(t) for t in t1], Uncreate(block2), *[Unwrite(t) for t in t2], Uncreate(block3), *[Unwrite(t) for t in t3]) 197 | 198 | code = """int N=6; 199 | int BLOCK_SIZE=2; 200 | add<<>>(N, a_d, b_d, c_d); """ 201 | 202 | code_obj = Code(code=code, tab_width=2, language="c", font_size=14, line_no_buff=0.1, corner_radius=0.1).shift(2*UP) 203 | 204 | gpu_code = """__global__ void add(int n , float* a, float* b, float* c) 205 | { 206 | int i = blockIdx.x * blockDim.x + threadIdx.x; 207 | if (i < n) 208 | { 209 | c[i] = a[i] + b[i]; 210 | } 211 | }""" 212 | gpu_code_obj = Code(code=gpu_code, tab_width=2, language="c", font_size=14, line_no_buff=0.1, corner_radius=0.1).shift(2*DOWN) 213 | def transform_code(tidx, bidx): 214 | new = f"""__global__ void add(int n , float* a, float* b, float* c) 215 | {{ 216 | //int i = blockIdx.x * blockDim.x + threadIdx.x; 217 | int i = {bidx} * 2 + {tidx}; 218 | if (i < n) 219 | {{ 220 | c[i] = a[i] + b[i]; 221 | }} 222 | }}""" 223 | c = Code(code=new, tab_width=2, language="c", font_size=14, line_no_buff=0.1, corner_radius=0.1).shift(2*DOWN) 224 | c.code = remove_invisible_chars(c.code) 225 | c.code[2].set_color(GREEN_E) 226 | return c 227 | 228 | 229 | with self.voiceover(text="""If we launched the kernel with just 6 elements 
and 2 threads per block""") as trk: 230 | self.play(Create(gpu_code_obj)) 231 | self.play(Create(code_obj)) 232 | 233 | with self.voiceover(text="""the resulting kernel grid would look like this""") as trk: 234 | self.play(LaggedStart(blocks[0][1][0].create(x_range=2, z_index=1))) 235 | self.play(LaggedStart(blocks[1][1][0].create(x_range=2, z_index=1))) 236 | self.play(LaggedStart(blocks[2][1][0].create(x_range=2, z_index=1))) 237 | 238 | l1 = Line(blocks[0][1][0].threads[0][0][0].get_corner(UP+LEFT), blocks[0][1][0].threads[1][0][0].get_corner(UP+RIGHT)) 239 | b1 = Brace(l1, direction=UP) 240 | t1 = b1.get_text("threadIdx.x").scale(0.6) 241 | 242 | l2 = Line(blocks[0][1][0].threads[0][0][0].get_corner(UP+LEFT), blocks[2][1][0].threads[1][0][0].get_corner(UP+RIGHT)) 243 | b2 = Brace(l2, direction=UP, buff=0.4) 244 | t2 = b2.get_text("blockIdx.x").scale(0.6) 245 | 246 | with self.voiceover(text="""Where the code gets assigned a thread index""") as trk: 247 | self.play(Create(b1), Write(t1)) 248 | 249 | with self.voiceover(text="""And a block index""") as trk: 250 | self.play(Create(b2), Write(t2)) 251 | 252 | with self.voiceover(text="""When running each thread in our block, it will run a copy of our code with 253 | values of blockIdx and threadIdx set to match the curerntly executed thread. 
254 | The blockDim variable represents our block dimension and is constant across all threads 255 | in our case - we set the block size to 2""") as trk: 256 | for b in range(3): 257 | for t in range(2): 258 | blocks[b][1][0].threads[t][0][0].save_state() 259 | self.play(blocks[b][1][0].threads[t][0][0].animate.set_color(GREEN), Transform(gpu_code_obj, transform_code(t, b))) 260 | self.wait(0.5) 261 | self.play(Restore(blocks[b][1][0].threads[t][0][0])) 262 | self.wait(0.5) 263 | 264 | with self.voiceover(text="""Some of the more alert viewers might have noticed that we keep using threadIdx and blockIdx x values, 265 | and that might imply that there are more dimensions""") as trk: 266 | self.play(Uncreate(b2), Unwrite(t2), Uncreate(b1), Unwrite(t1)) 267 | self.wait(4) 268 | self.play(Uncreate(gpu_code_obj)) 269 | 270 | def transform_run(dim_grid, dim_block): 271 | code = f"""dim3 dimGrid({','.join(map(str,dim_grid))}); 272 | dim3 dimBlock({','.join(map(str,dim_block))}); 273 | add<<>>(N, a_d, b_d, c_d); """ 274 | return Code(code=code, tab_width=2, language="c", font_size=14, line_no_buff=0.1, corner_radius=0.1).shift(2*UP) 275 | 276 | with self.voiceover(text="""And that is indeed true, we can run up to 3 dimensions by passing in a dim3 variable 277 | as our kernel parameters""") as trk: 278 | self.wait(2) 279 | self.play(Transform(code_obj, transform_run([3,1,1], [2,1,1]))) 280 | 281 | 282 | 283 | with self.voiceover(text="""so a 2 dimensional kernel grid would look like this""") as trk: 284 | self.play(Transform(code_obj, transform_run([3,2,1], [2,2,1])), 285 | LaggedStart(blocks[0][1][0].create(x_range=2, y_range=2, z_index=1)), 286 | LaggedStart(blocks[1][1][0].create(x_range=2, y_range=2, z_index=1)), 287 | LaggedStart(blocks[2][1][0].create(x_range=2, y_range=2, z_index=1)), 288 | LaggedStart(blocks[0][2][0].create(x_range=2, y_range=2, z_index=1)), 289 | LaggedStart(blocks[1][2][0].create(x_range=2, y_range=2, z_index=1)), 290 | 
LaggedStart(blocks[2][2][0].create(x_range=2, y_range=2, z_index=1))) 291 | 292 | self.wait(1) 293 | creations = [] 294 | for x in range(m): 295 | for y in range(m): 296 | for z in range(m): 297 | creations.extend(blocks[x][y][z].create(x_range=n, y_range=n, z_range=n, z_index=0)) 298 | 299 | self.add_fixed_in_frame_mobjects(code_obj) 300 | self.add_fixed_orientation_mobjects(code_obj) 301 | 302 | with self.voiceover(text="""While a 3 dimensional grid might look like this""") as trk: 303 | self.move_camera(theta=-radians(25), gamma=radians(85), phi=-radians(45), 304 | added_anims=[LaggedStart(*creations, lag_ratio=0.001), Transform(code_obj, transform_run([m,m,m], [n,n,n]))]) 305 | 306 | self.wait(2) 307 | self.begin_ambient_camera_rotation(-0.1, about="phi") 308 | with self.voiceover(text="You might wonder what is the purpose of multiple dimensions") as trk: 309 | self.play(*[Unwrite(x) for x in texts]) 310 | 311 | self.wait(0.5) 312 | with self.voiceover(text="""and it's mostly just syntactic sugar - some algorithms operate on multidimensional data 313 | and checking boundary conditions for them might be easier in those""") as trk: 314 | pass 315 | 316 | self.wait(0.5) 317 | with self.voiceover(text="""also, they might be more readable when you express them in a row/column form""") as trk: 318 | pass 319 | 320 | self.wait(0.5) 321 | with self.voiceover(text="""as a side note - there might be some edge cases where using a multidimensional grid instead of a single dimensional grid 322 | results in a bit smaller register usage but that is rarely of big importance""") as trk: 323 | pass 324 | 325 | self.wait(0.5) 326 | with self.voiceover(text="""As an example we can look into a square matrix multiplication kernel""") as trk: 327 | self.play(*[FadeOut(x) for x in self.mobjects]) 328 | self.stop_ambient_camera_rotation("phi") 329 | self.move_camera(theta=-radians(90), gamma=radians(0), phi=radians(0)) 330 | 331 | 332 | with self.voiceover(text="""As a remainder, 
matrix multiplication is a function that takes 2 matrices as the input 333 | and returns another matrix whose entries are dot products beteen rows of the first matrix and columns of the second one""") as trk: 334 | mul = Tex("$\\cdot$").shift(2*LEFT + UP) 335 | m1 = Matrix([[f"a_{{0,0}}", f"a_{{0,1}}"], [f"a_{{1,0}}", f"a_{{1,1}}"]]).next_to(mul, LEFT) 336 | m2 = Matrix([[f"b_{{0,0}}", f"b_{{0,1}}"], [f"b_{{1,0}}", f"b_{{1,1}}"]]).next_to(mul, RIGHT) 337 | eq = Tex("$=$").next_to(m2, RIGHT) 338 | m3 = Matrix([[f"c_{{0,0}}", f"c_{{0,1}}"], [f"c_{{1,0}}", f"c_{{1,1}}"]]).next_to(eq, RIGHT) 339 | m = [[f"$c_{{{j},{i}}} = a_{{{j},0}}*b_{{0,{i}}}+a_{{{j},0}}*b_{{1,{i}}}$" for i in range(2)] for j in range(2)] 340 | fs = 48 341 | t1 = Tex(m[0][0], font_size = fs).next_to(m1, DOWN, aligned_edge=LEFT) 342 | t2 = Tex(m[0][1], font_size = fs).next_to(t1, DOWN, aligned_edge=LEFT) 343 | t3 = Tex(m[1][0], font_size = fs).next_to(t2, DOWN, aligned_edge=LEFT) 344 | t4 = Tex(m[1][1], font_size = fs).next_to(t3, DOWN, aligned_edge=LEFT) 345 | 346 | 347 | self.add(m1) 348 | self.add(mul) 349 | self.add(m2) 350 | self.add(eq) 351 | self.add(m3) 352 | 353 | i1 = SurroundingRectangle(m1.get_entries()[:2], color=BLUE) 354 | i2 = SurroundingRectangle(VGroup(m2.get_entries()[0], m2.get_entries()[2]), color=BLUE) 355 | g1 = VGroup(i1.copy(), i2.copy(), 356 | m1.get_entries()[0].copy(), m1.get_entries()[1].copy(), 357 | m2.get_entries()[0].copy(), m2.get_entries()[2].copy()) 358 | self.play(Create(i1), Create(i2)) 359 | self.play(Transform(g1, t1, replace_mobject_with_target_in_scene=True)) 360 | self.wait(1) 361 | 362 | dd = m1.get_entries()[0].get_y() - m1.get_entries()[2].get_y() 363 | dr = m2.get_entries()[0].get_x() - m2.get_entries()[1].get_x() 364 | self.play(i2.animate.shift(LEFT * dr)) 365 | g1 = VGroup(i1.copy(), i2.copy(), 366 | m1.get_entries()[0].copy(), m1.get_entries()[1].copy(), 367 | m2.get_entries()[1].copy(), m2.get_entries()[3].copy()) 368 | self.play(Transform(g1, t2, 
replace_mobject_with_target_in_scene=True)) 369 | self.wait(1) 370 | self.play(i1.animate.shift(DOWN * dd), i2.animate.shift(RIGHT * dr)) 371 | g1 = VGroup(i1.copy(), i2.copy(), 372 | m1.get_entries()[2].copy(), m1.get_entries()[3].copy(), 373 | m2.get_entries()[0].copy(), m2.get_entries()[2].copy()) 374 | self.play(Transform(g1, t3, replace_mobject_with_target_in_scene=True)) 375 | self.wait(1) 376 | self.play(i2.animate.shift(LEFT * dr)) 377 | g1 = VGroup(i1.copy(), i2.copy(), 378 | m1.get_entries()[2].copy(), m1.get_entries()[3].copy(), 379 | m2.get_entries()[1].copy(), m2.get_entries()[3].copy()) 380 | self.play(Transform(g1, t4, replace_mobject_with_target_in_scene=True)) 381 | self.wait(1) 382 | 383 | with self.voiceover(text="""I do realize that the description was very brief so I'm going to leave some more links in the description 384 | for those that are unfamilliar with the operation""") as trk: 385 | pass 386 | 387 | m4 = Matrix([[f"a_{{{j},{i}}}" for i in range(3)] for j in range(3)]) 388 | 389 | with self.voiceover(text="""Before we jump into the code, there is one thing that you have to know about memory layout""") as trk: 390 | self.play(*[Uncreate(x) for x in [m2, m3, i1, i2]], 391 | *[Unwrite(x) for x in [t1, t2, t3, t4, eq, mul]]) 392 | 393 | with self.voiceover(text="""When we create a 2D array in our code, the computer still stores it in 1 Dimension - the 2D access is just an 394 | abstraction that is easier for us to read""") as trk: 395 | self.play(Transform(m1, m4)) 396 | 397 | t1 = Tex("Row * Width + Column").set_color(BLUE).next_to(m1, DOWN) 398 | v = Matrix([[f"a_{i}" for i in range(9)]]).set_color(GREEN).next_to(t1, DOWN) 399 | self.play(Create(v.get_brackets()[0])) 400 | for i in range(3): 401 | self.play(Transform(VGroup(m1.get_entries()[i*3:(i+1)*3]).copy(), VGroup(v.get_entries()[i*3:(i+1)*3]), replace_mobject_with_target_in_scene=True)) 402 | self.play(Create(v.get_brackets()[1])) 403 | 404 | 405 | with self.voiceover(text="""In cuda 
we get access to the raw pointer so we actually have to index into it ourselves - when we have our row and column index""") as trk: 406 | pass 407 | with self.voiceover(text="""we can do that by multiplying the row by our matrix width 408 | and adding the column index into it""") as trk: 409 | self.play(Write(t1)) 410 | self.wait(1) 411 | 412 | 413 | matmul = """__global__ void matmul_elem 414 | (int n, float* a, float* b, float* c) 415 | { 416 | int column = blockIdx.x*blockDim.x + threadIdx.x; 417 | int row = blockIdx.y*blockDim.y + threadIdx.y; 418 | if (row < n && column < n) 419 | { 420 | float dot_prod = 0.f; 421 | for(int i = 0; i < n; i++) 422 | { 423 | dot_prod += a[row*n + i] * b[i*n + column]; 424 | } 425 | c[row*n+column] = dot_prod; 426 | } 427 | }""" 428 | matmul_sd="""__global__ void matmul_elem_onedim 429 | (int n, float* a, float* b, float* c) 430 | { 431 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 432 | int row = idx/n; 433 | int column = idx%n; 434 | if (row < n && column < n) 435 | { 436 | float dot_prod = 0.f; 437 | for(int i = 0; i < n; i++) 438 | { 439 | dot_prod += a[row*n + i] * b[i*n + column]; 440 | } 441 | c[row*n+column] = dot_prod; 442 | } 443 | }""" 444 | 445 | matmul_obj = Code(code=matmul, tab_width=2, language="c", font_size=14, background="rectangle", line_no_buff=0.1, corner_radius=0.1).shift(3*LEFT) 446 | matmul_sd_obj = Code(code=matmul_sd, tab_width=2, language="c", font_size=14, background="rectangle", line_no_buff=0.1, corner_radius=0.1).next_to(matmul_obj, RIGHT) 447 | 448 | matmul_obj.code = remove_invisible_chars(matmul_obj.code) 449 | matmul_sd_obj.code = remove_invisible_chars(matmul_sd_obj.code) 450 | 451 | 452 | with self.voiceover(text="""To run out matrix multiplication kernel, we can assign each thread to 1 element in our output array""") as trk: 453 | self.play(*[FadeOut(x) for x in self.mobjects]) 454 | self.play(Create(matmul_obj)) 455 | 456 | hl = SurroundingRectangle(matmul_obj.code[3:5], buff=0.03, 
stroke_width=2, fill_opacity=0.3) 457 | with self.voiceover(text="""We first calculate our row and column indices based on the current thread and block""") as trk: 458 | self.play(Create(hl)) 459 | 460 | hl_t = SurroundingRectangle(matmul_obj.code[5], buff=0.03, stroke_width=2, fill_opacity=0.3) 461 | with self.voiceover(text="""Then we do the boundary check not to read and write outside our matrices""") as trk: 462 | self.play(Transform(hl, hl_t)) 463 | 464 | hl_t = SurroundingRectangle(matmul_obj.code[7], buff=0.03, stroke_width=2, fill_opacity=0.3) 465 | with self.voiceover(text="""Then we create an intermediate variable that will store our dot product""") as trk: 466 | self.play(Transform(hl, hl_t)) 467 | 468 | hl_t = SurroundingRectangle(matmul_obj.code[8:12], buff=0.03, stroke_width=2, fill_opacity=0.3) 469 | with self.voiceover(text="""And we iterate over the row vector of the first matrix, and the column vector of the second matrix 470 | calculating the dot product""") as trk: 471 | self.play(Transform(hl, hl_t)) 472 | 473 | hl_t = SurroundingRectangle(matmul_obj.code[12], buff=0.03, stroke_width=2, fill_opacity=0.3) 474 | with self.voiceover(text="""Finally, we save our result in the output matrix""") as trk: 475 | self.play(Transform(hl, hl_t)) 476 | 477 | self.wait(1) 478 | self.play(Uncreate(hl)) 479 | self.wait(1) 480 | hl = SurroundingRectangle(matmul_sd_obj.code[4:6], buff=0.03, stroke_width=2, fill_opacity=0.3) 481 | with self.voiceover(text="""And just as I mentioned before, we could also do the same thing with a single dimensional grid.""") as trk: 482 | self.play(Create(matmul_sd_obj)) 483 | self.wait(1) 484 | with self.voiceover(text="""We just have to parse the rows and columns from our x dimension, this adds a bit of an overhead but it's negligable 485 | compared to the rest of the work done by the kernel""") as trk: 486 | self.play(Create(hl)) 487 | 488 | 489 | self.wait(2) 490 | x_r, y_r, z_r = 3,2,3 491 | block = Block2(x_r, y_r, z_r, 
ORIGIN+UP, False) 492 | with self.voiceover(text="""And the simillar memory pattern happens when we extend out data to the third dimension""") as trk: 493 | self.play(*[FadeOut(x) for x in self.mobjects]) 494 | self.play(*block.create(x_r, y_r, z_r)) 495 | 496 | self.play(*block.rotate()) 497 | self.wait(1) 498 | 499 | v = Matrix([[f"a_{{{i}}}" for i in range(x_r*y_r*z_r)]]).scale(0.5) 500 | 501 | 502 | self.play(Create(v.get_brackets()[0])) 503 | with self.voiceover(text="""It just simply gets flattened out, across each data dimension we add""") as trk: 504 | for z in range(z_r): 505 | for y in range(y_r): 506 | i = z * x_r * y_r + y * x_r 507 | self.play(Transform(VGroup(*block.get_entries()[i:i+3]), VGroup(v.get_entries()[i:i+3]), replace_mobject_with_target_in_scene=True)) 508 | self.play(Create(v.get_brackets()[1])) 509 | 510 | with self.voiceover(text="""Can you come up with the formula for our 1 dimensional index 511 | when we know our x, y and z coordinates?""") as trk: 512 | pass 513 | 514 | self.wait(2) 515 | t1 = Tex("Z * Width * Height + Y * Width + X", font_size=36).set_color(BLUE).next_to(v, UP) 516 | with self.voiceover(text="""If you guessed the following - you were right!""") as trk: 517 | self.play(Write(t1)) 518 | 519 | self.wait(2) 520 | with self.voiceover(text="""Now that we have the theory behind us, I'm going to leave an excercise for those that want to practice 521 | running a multidimensional kernel grid""") as trk: 522 | pass 523 | self.wait(1) 524 | 525 | 526 | with self.voiceover(text="""And the excercise looks like this: take in 3 arrays as the input""") as trk: 527 | self.play(*[FadeOut(x) for x in self.mobjects]) 528 | 529 | a = Tex("$a \\in \\mathbb{R}^{x \\times y \\times z}$").shift(UP) 530 | b = Tex("$b \\in \\mathbb{R}^{x \\times y}$").next_to(a, DOWN, aligned_edge=LEFT) 531 | c = Tex("$c \\in \\mathbb{R}^{x}$").next_to(b, DOWN, aligned_edge=LEFT) 532 | out = Tex("$out[x][y][z] = a[x][y][z] + b[x][y] + c[x]$").next_to(c, DOWN) 
from manim import *
from manim.mobject.text.text_mobject import remove_invisible_chars
from manim_voiceover import VoiceoverScene
from manim_voiceover.services.recorder import RecorderService
from manim_voiceover.services.gtts import GTTSService
import numpy as np


class MemoryHierarchy(VoiceoverScene):
    """Episode 6: narrated overview of the CUDA memory spaces.

    The scene first shows a PCB photo with chip / memory overlays, then
    builds a two-block, two-threads-per-block diagram with registers, local,
    shared, constant and global memory, reveals each memory kind in turn and
    finishes with a summary table.
    """

    def construct(self):
        """Build and play the whole episode."""
        self.set_speech_service(
            GTTSService(transcription_model="base")
            # RecorderService(trim_buffer_end=50, trim_silence_threshold=-80, transcription_model=None)
        )

        # ------------------------------------------------------------------
        # Intro: title, PCB photo, on-chip vs off-chip overlays
        # ------------------------------------------------------------------
        title = Text("GPU programming", font_size=72)
        with self.voiceover(text="Hello and welcome to episode 6 in the series on GPU programming") as trk:
            self.play(Write(title))

        subtitle = Text("Memory Hierarchy", font_size=48).next_to(title, DOWN)
        with self.voiceover(text="""In this episode, we are going to briefly go over the memory hierarchy of our gpu as understanding
                            it will be crucial to getting the best performence our of our hardware""") as trk:
            self.play(Write(subtitle))

        with self.voiceover(text="""The purpose of this episode is to give you a quick overview of how memory in cuda works
                            befor we dive deeper into each kind of memory in future episodes""") as trk:
            pass

        pcb = ImageMobject("./PCB.jpg").scale(0.2)

        with self.voiceover(text="""When talking about memory we will ofter refer to some particular kind of memory as being on or off chip""") as trk:
            self.play(Unwrite(title), Unwrite(subtitle))

        with self.voiceover(text="""It might be confusing if you are not familliar with how the gpu internals look like""") as trk:
            self.play(FadeIn(pcb))

        with self.voiceover(text="""When you open up your gpu, you can see that it's actually like a small computer inside your computer""") as trk:
            pass

        chip = Rectangle(width=2, height=2, fill_color=GREEN, fill_opacity=0.5, color=GREEN).shift(0.5*UP)
        chip_text = Text("Chip", color=GREEN, font_size=36).next_to(chip, DOWN)
        with self.voiceover(text="""There is a chip that does the actuall computation""") as trk:
            self.play(Create(chip))
            self.play(Write(chip_text))

        offchip1 = Rectangle(width=0.7, height=2, fill_color=RED, fill_opacity=0.5, color=RED).shift(1.5*LEFT+0.5*UP)
        offchip2 = Rectangle(width=0.7, height=2, fill_color=RED, fill_opacity=0.5, color=RED).shift(1.3*RIGHT+0.5*UP)
        offchip3 = Rectangle(width=1.45, height=0.65, fill_color=RED, fill_opacity=0.5, color=RED).shift(1.95*UP)
        offchip_text = Text("Memory", color=RED, font_size=36).next_to(offchip3, UP)
        with self.voiceover(text="""And it's connected to VRAM that resides on the PCB""") as trk:
            self.play(Create(offchip1), Create(offchip2), Create(offchip3))
            self.play(Write(offchip_text))

        with self.voiceover(text="""but some of the memory resides in the actuall chip making it much faster to access""") as trk:
            pass

        self.play(*[Uncreate(x) for x in [chip, offchip1, offchip2, offchip3]], Unwrite(chip_text), Unwrite(offchip_text))

        # ------------------------------------------------------------------
        # Diagram construction
        # ------------------------------------------------------------------
        # Style shared by every connector arrow and every flash arrow.
        arrow_style = dict(buff=0, stroke_width=4, tip_length=0.12,
                           max_stroke_width_to_length_ratio=90,
                           max_tip_length_to_length_ratio=1)

        # Flat registries of every piece of the diagram. The animations below
        # select from them by colour (rects/texts) or by index (arrows), so
        # the append order inside the helpers must not change.
        thread_objs = []
        rects = []
        texts = []
        arrows = []

        def join(r1, r2, start, double=True):
            """Create a vertical connector from `start` to the near edge of `r2`.

            `r1` is not used by the computation; it only documents at the call
            site which object `start` belongs to. With `double=True` a
            DoubleArrow start<->edge is built, otherwise a single Arrow that
            points from the edge of `r2` back to `start`. The arrow is
            appended to `arrows` and returned.
            """
            # Pick the top edge of r2 when it sits below `start`, else the bottom edge.
            side = 1 if r2.get_y() < start[1] else -1
            end = np.array([start[0], r2.get_y() + side * r2.height/2, 0])
            if double:
                ret = DoubleArrow(start, end, **arrow_style)
            else:
                ret = Arrow(end, start, **arrow_style)
            arrows.append(ret)
            return ret

        def make_thread(idx=0):
            """Build one thread box with its register and local-memory boxes.

            Returns a VGroup of the three boxes plus the two connector arrows
            (registers<->thread, local<->thread).
            """
            thread = Rectangle(height=0.5, width=2.2, color=BLUE)
            texts.append(Text(f"Thread {idx}", font_size=15, color=BLUE))
            rects.append(thread)
            thread.add(texts[-1])

            registers = Rectangle(height=0.5, width=1.0, color=GREEN).next_to(thread, UP, aligned_edge=LEFT, buff=0.5)
            texts.append(Text("Registers", font_size=15, color=GREEN).move_to(registers.get_center()))
            registers.add(texts[-1])
            rects.append(registers)

            local = Rectangle(height=0.5, width=1.0, color=RED_A).next_to(thread, UP, aligned_edge=RIGHT, buff=0.5)
            l = Text("Local", font_size=15, color=RED_A)
            m = Text("Memory", font_size=15, color=RED_A)
            VGroup(l, m).arrange(DOWN, buff=0.05).move_to(local.get_center())
            texts.append(l)
            texts.append(m)
            rects.append(local)
            local.add(l)
            local.add(m)

            t_group = VGroup(thread, registers, local)
            t_group.add(join(registers, thread, start=registers.get_corner(DOWN)))
            t_group.add(join(local, thread, start=local.get_corner(DOWN)))

            thread_objs.append(thread)
            return t_group

        def make_block(idx=0):
            """Build block `idx`: outline, two threads, shared memory and labels."""
            block = Rectangle(height=3.5, width=5.0, color=PURPLE)
            rects.append(block)

            threads = VGroup(make_thread(0), make_thread(1)).arrange(RIGHT).shift(0.8*DOWN)
            block.add(threads)

            shared_mem = Rectangle(width=4.0, height=0.5, color=YELLOW).next_to(threads, UP)
            rects.append(shared_mem)
            block.add(shared_mem)

            texts.append(Text(f"Shared Memory", font_size=15, color=YELLOW).move_to(shared_mem.get_center()))
            shared_mem.add(texts[-1])
            # Each block contributes two threads, so this block's threads are
            # the ones from index idx*2 onwards at this point.
            for t in thread_objs[idx*2:]:
                block.add(join(t, shared_mem, t.get_corner(UP)))
            texts.append(Text(f"Block {idx}", color=PURPLE).next_to(shared_mem, UP))
            shared_mem.add(texts[-1])

            return block

        blocks = VGroup(make_block(0), make_block(1)).arrange(RIGHT).shift(UP)

        constant = Rectangle(width=blocks.width, height=1, color=RED_B).next_to(blocks, DOWN)
        texts.append(Text("Constant Memory", font_size=30, color=RED_B).move_to(constant.get_center()))
        rects.append(constant)

        gmem = Rectangle(width=blocks.width, height=1, color=RED).next_to(constant, DOWN)
        rects.append(gmem)
        texts.append(Text("Global Memory", font_size=30, color=RED).move_to(gmem.get_center()))

        # The nesting was only needed so arrange()/shift() moved everything
        # together. Now detach every registered piece from its parent so each
        # one can be created / removed on its own later.
        subobjects = []
        queue = [blocks]
        while queue:
            o = queue.pop()
            subobjects.append(o)
            queue.extend(o.submobjects)

        for mo in subobjects:
            for so in mo.submobjects.copy():
                if any(so in x for x in [rects, texts, arrows, thread_objs]):
                    mo.remove(so)

        # Constant (single-headed, read-only) and global (double-headed)
        # connectors; left block anchors on the left corner, right block on
        # the right corner.
        for t in thread_objs[:2]:
            join(t, constant, t.get_corner(DOWN+LEFT)+RIGHT*0.2, False)
            join(t, gmem, t.get_corner(DOWN+LEFT))

        for t in thread_objs[2:]:
            join(t, constant, t.get_corner(DOWN+RIGHT)+LEFT*0.2, False)
            join(t, gmem, t.get_corner(DOWN+RIGHT))

        # Resulting indices into `arrows` (follows from the call order above).
        register_arrows = [0, 2, 6, 8]
        local_arrows = [1, 3, 7, 9]
        shared_arrows = [4, 5, 10, 11]
        constant_arrows = [12, 14, 16, 18]
        global_arrows = [13, 15, 17, 19]

        def flash(i, reverse=False):
            """Blue passing flash along arrows[i]; `reverse` runs it end -> start."""
            a, b = arrows[i].get_start(), arrows[i].get_end()
            if reverse:
                a, b = b, a
            return ShowPassingFlash(Arrow(start=a, end=b, color=BLUE, **arrow_style).set_z_index(1), time_width=1)

        # Pre-built access animations, one list per memory kind and direction.
        # NOTE(review): none of these lists is played anywhere in this scene;
        # they are kept so the set of constructed objects stays the same.
        local_store = [flash(i, reverse=True) for i in local_arrows]
        local_load = [flash(i) for i in local_arrows]
        register_store = [flash(i, reverse=True) for i in register_arrows]
        register_load = [flash(i) for i in register_arrows]
        shared_store = [flash(i) for i in shared_arrows]
        shared_load = [flash(i, reverse=True) for i in shared_arrows]
        global_store = [flash(i) for i in global_arrows]
        global_load = [flash(i, reverse=True) for i in global_arrows]
        constant_load = [flash(i) for i in constant_arrows]

        access_anims = [shared_store, shared_load, register_store, register_load, local_store, local_load, global_store, global_load, constant_load]

        # ------------------------------------------------------------------
        # Walkthrough
        # ------------------------------------------------------------------
        # wait_until_bookmark(...) needs a matching <bookmark mark='…'/> tag in
        # the narration text. Tag positions here are chosen to line up with
        # the animation that follows each wait; move them if the timing is off.
        with self.voiceover(text="""When speaking about memory hierarchy we have to take each unit into consideration, that is our <bookmark mark='1'/>blocks
                            and the <bookmark mark='2'/>threads that run inside our blocks.""") as trk:
            self.play(FadeOut(pcb))
            self.wait_until_bookmark("1")
            self.play(*[Create(r) for r in rects if r.color == PURPLE], *[Write(t) for t in texts if t.color == PURPLE])
            self.wait_until_bookmark("2")
            self.play(*[Create(r) for r in rects if r.color == BLUE], *[Write(t) for t in texts if t.color == BLUE])

        with self.voiceover(text="""The first type of memory that we should be familliar with is global memory""") as trk:
            self.play(*[Create(r) for r in rects if r.color == RED], *[Write(t) for t in texts if t.color == RED])

        with self.voiceover(text="""Each thread can read and write to global memory""") as trk:
            self.play(*[Create(arrows[i]) for i in global_arrows])

        with self.voiceover(text="""Global memory is our largest but also slowest memory space - it is the VRAM of our GPU and it resides off chip.""") as trk:
            pass

        malloc = Code(code="cudaMalloc((void**) &pointer, size);", tab_width=2, language="c", font_size=16, line_no_buff=0.1, corner_radius=0.1)
        global_var = Code(code="__device__ int GlobalVariable = 0;", tab_width=2, language="c", font_size=16, line_no_buff=0.1, corner_radius=0.1).next_to(malloc, DOWN)

        with self.voiceover(text="""Every time that we call a <bookmark mark='1'/>malloc function or create a
                            <bookmark mark='2'/>global variable, it gets stored inside global memory""") as trk:
            self.wait_until_bookmark("1")
            self.play(Create(malloc))
            self.wait_until_bookmark("2")
            self.play(Create(global_var))

        # Morph the snippets into a copy of the global-memory box, then drop
        # the copy so only the original box remains on screen.
        target = VGroup(*[r.copy() for r in rects if r.color == RED])
        self.play(Transform(VGroup(malloc, global_var), target, replace_mobject_with_target_in_scene=True))
        self.remove(target)

        with self.voiceover(text="""The next type of memory that we have been using so far are registers""") as trk:
            self.play(*[Create(r) for r in rects if r.color == GREEN], *[Write(t) for t in texts if t.color == GREEN])
            self.play(*[Create(arrows[i]) for i in register_arrows])

        with self.voiceover(text="""They are local to each thread, and extremely fast as they reside on chip""") as trk:
            pass

        reg = Code(code="float reg = pointer[i];", tab_width=2, language="c", font_size=16, line_no_buff=0.1, corner_radius=0.1)
        with self.voiceover(text="""Every time that we create a local variable inside our kernel, it gets stored inside our registers""") as trk:
            self.play(Create(reg))

        target = VGroup(*[r.copy() for r in rects if r.color == GREEN])
        self.play(Transform(reg, target, replace_mobject_with_target_in_scene=True))
        self.remove(target)

        with self.voiceover(text="""We can check how much registers we are using by adding a compilation flag
                            for increased verbosity in ptxas""") as trk:
            pass

        with self.voiceover(text="""We can also use cuobjdump to check how are our registers accessed in PTX, and SASS assembly""") as trk:
            pass

        with self.voiceover(text="""Don't worry if those look like black magic - we will go over what PTX and SASS are in later episodes""") as trk:
            pass

        with self.voiceover(text="""There are some performance considerations when using our registers""") as trk:
            pass

        with self.voiceover(text="""First would be that using too much registers can cause reduced occupancy. We will go over occupancy in later
                            episodes as it deserves some more explanation, but for now just think about it as not having enough resources to run new thread
                            groups""") as trk:
            pass

        with self.voiceover(text="""The second one occurs when we use too much registers and the compiler determines that there is no more register
                            space to hold our variables""") as trk:
            pass

        with self.voiceover(text="""in this case, our variables get spilled into another kind of memory, which is local memory""") as trk:
            self.play(*[Create(r) for r in rects if r.color == RED_A], *[Write(t) for t in texts if t.color == RED_A])
            self.play(*[Create(arrows[i]) for i in local_arrows])

        with self.voiceover(text="""And the name might be a bit confusing - it's called local not because of it's physical location but because it's local to a thread""") as trk:
            pass

        with self.voiceover(text="""it lives off chip - therefore accessing it is very slow and we want to avoid doing it""") as trk:
            pass

        with self.voiceover(text="""When compiling with increased verbosity we can also look into how much of our memory access is to local memory""") as trk:
            pass

        with self.voiceover(text="""I've made a kernel using a lot of variables and as you can see, after using 255 registers they started spilling into local memory,
                            resulting in 2040 bytes read and written to local memory""") as trk:
            pass

        self.wait(1)

        with self.voiceover(text="""Another kind of memory that we can use is Constant Memory""") as trk:
            self.play(*[Create(r) for r in rects if r.color == RED_B], *[Write(t) for t in texts if t.color == RED_B])
            self.play(*[Create(arrows[i]) for i in constant_arrows])

        with self.voiceover(text="""It is a special kind of memory, it resides off chip as global and local memory but it's cached and read-only""") as trk:
            pass

        with self.voiceover(text="""It is limited to only 64KB""") as trk:
            pass

        with self.voiceover(text="""and accesses to different addresses by threads within a warp are serialized - that means that if we access the same memory
                            address by multiple threads we can get better performance than when using global memory""") as trk:
            pass

        const_mem = """__constant__ float const_mem[size];
cudaMemcpyToSymbol(const_mem, const_mem_h, size*sizeof(float));"""
        const_mem_code = Code(code=const_mem, tab_width=2, language="c", font_size=16, line_no_buff=0.1, corner_radius=0.1)

        with self.voiceover(text="""To use constnt memory we have to use the __constant__ derivative when declaring our array,
                            we then have to use cudaMemcpyToSymbol to move our data from the cpu to const memory""") as trk:
            self.play(Create(const_mem_code))

        self.play(Uncreate(const_mem_code))
        with self.voiceover(text="""The final type of memory is shared memory""") as trk:
            self.play(*[Create(r) for r in rects if r.color == YELLOW], *[Write(t) for t in texts if t.color == YELLOW])
            self.play(*[Create(arrows[i]) for i in shared_arrows])

        with self.voiceover(text="""As the name suggests, it's shared between the threads in a block""") as trk:
            pass

        with self.voiceover(text="""And what that means is that if one thread in a block <bookmark mark='1'/>writes to shared memory, all the other threads in a block can <bookmark mark='2'/>read
                            the value written by that thread""") as trk:
            self.wait_until_bookmark("1")
            # Thread 0 of block 0 stores to shared memory ...
            self.play(flash(4))
            self.wait_until_bookmark("2")
            # ... and thread 1 of the same block loads it back.
            self.play(flash(5, reverse=True))

        with self.voiceover(text="""moreover, shared memory lives on chip - meaning that accessing it is much faster than accessing global memory.
                            That is why it is very often used in order to increase performence when multiple threads access the same memory address""") as trk:
            pass

        shared_mem = "__shared__ float shared_mem[size];"
        shared_mem_code = Code(code=shared_mem, tab_width=2, language="c", font_size=16, line_no_buff=0.1, corner_radius=0.1)

        with self.voiceover(text="""To allocate an array in shared memory we just have to add a __shared__ keyword when declaring our variable""") as trk:
            self.play(Create(shared_mem_code))

        self.play(Uncreate(shared_mem_code))

        # ------------------------------------------------------------------
        # Recap table
        # ------------------------------------------------------------------
        with self.voiceover(text="""So to recap everything we've learned so far""") as trk:
            self.play(*[Uncreate(r) for r in rects + arrows], *[Unwrite(t) for t in texts])

        summary = Table([
            ["On", "R/W", "Thread", "Thread"],
            ["On", "R/W", "Block", "Block"],
            ["Off", "R/W", "Thread", "Thread"],
            ["Off", "R/W", "Global", "Host Controlled"],
            ["Off", "R", "Global", "Host Controlled"]],
            row_labels=[Text(t) for t in ["Registers", "Shared", "Local", "Global", "Constant"]],
            col_labels=[Text(t) for t in ["On/Off Chip", "Access", "Scope", "Lifetime"]]).scale(0.5)

        with self.voiceover(text="""We have five kinds of memory that we can use in our CUDA code""") as trk:
            self.play(*[Create(x) for x in summary.get_vertical_lines()])
            self.play(*[Write(x) for x in summary.get_col_labels()])

        def create_row(i):
            """Draw the i-th horizontal rule, then write row i (label + 4 cells)."""
            self.play(Create(summary.get_horizontal_lines()[i]))
            self.play(LaggedStart(Write(summary.get_row_labels()[i]), *[Write(x) for x in summary.get_entries_without_labels()[i*4:(i+1)*4]]))

        with self.voiceover(text="""Register memory that lives on chip, can be read and written to and has a scope and a lifetime of our thread""") as trk:
            create_row(0)

        with self.voiceover(text="""Shared memory that also lives on chip, can be read and written to and has a scope and a lifetime of one block""") as trk:
            create_row(1)

        with self.voiceover(text="""Local memory that resides off chip, can be read and written to and has a scope and a lifetime of a thread""") as trk:
            create_row(2)

        with self.voiceover(text="""Global memory that resides off chip, can be read and written that can be accessed anywhere in our code and it's lifetime is controlled
                            by the host that decides when to deallocate it""") as trk:
            create_row(3)

        with self.voiceover(text="""And constant memory that also resides off chip, is read only, globally accessed and it's lifetime is also controlled by the host""") as trk:
            create_row(4)

        self.wait(1)

        with self.voiceover(text="""This will be it for our introduction to memory in CUDA, in the upcoming episodes we will dive deeper into
                            how we can use each kind of memory to improve the performance of our code""") as trk:
            pass

        with self.voiceover(text="""Subscribe not to miss it, leave a like, comment your feedback and do anything that helps the algorithm.
                            And I'll see you in the next episode - bye.""") as trk:
            pass

        self.play(*[FadeOut(obj) for obj in self.mobjects])
        self.wait(3)
from manim import *
from manim_voiceover import VoiceoverScene
from manim_voiceover.services.recorder import RecorderService
from manim_voiceover.services.gtts import GTTSService


class EndScreen (VoiceoverScene, ZoomedScene):
    """Closing card: donation link, donor names and subscribe/like/share icons."""

    def construct(self):
        """Lay out the card, zoom the camera to fit it and play the narration."""
        self.set_speech_service(
            GTTSService(transcription_model="base")
        )
        bmac = Text("https://buymeacoffee.com/simonoz", font_size=48, color=YELLOW)

        # Donor names in display order, stacked below the link.
        donor_names = [
            "Alex",
            "Udit Ransaria",
            "stuartmcvicar.bsky.social",
            "Ilgwon Ha",
            "maneesh29s",
            "Gaussian Pombo",
            "Marc Uecker",
            "drunkyoda",
            "danikhan632",
            "SowmithK",
            "Anonymous x5",
        ]
        donors = [Text(name, font_size=50) for name in donor_names]
        donor_column = VGroup(*donors).arrange(DOWN).next_to(bmac, DOWN)

        subscribe = SVGMobject("icons/subscribe.svg")
        like = SVGMobject("icons/like.svg")
        share = SVGMobject("icons/share.svg")
        VGroup(subscribe, like, share).arrange(RIGHT).next_to(donor_column, DOWN).scale(0.7)

        # Frame everything from the link at the top to the icons at the bottom.
        self.camera.auto_zoom(VGroup(bmac, share, like, subscribe), margin=4, animate=False)
        with self.voiceover(text="""This channel is ad free and it's because I get ocasional donations from people who enjoy this work
                            that are sufficient enough for me to buy all the essential gear for making those videos""") as trk:
            self.play(Write(bmac))
            for donor in donors:
                self.play(Write(donor))

        with self.voiceover(text="""Huge thanks for them for doing so. If you want do become one of them - you can visit my buymeacoffe page""") as trk:
            pass

        # wait_until_bookmark(...) needs a matching <bookmark mark='…'/> tag in
        # the narration text. Tag positions are chosen so each icon lights up
        # as its action is spoken; move them if the timing is off.
        with self.voiceover(text="""And you can always support me for fre by <bookmark mark='1'/>subscribing, <bookmark mark='2'/>leaving a like, commenting and <bookmark mark='3'/>sharing this video with your friends""") as trk:
            self.play(Create(like), Create(subscribe), Create(share))
            self.wait_until_bookmark("1")
            self.play(subscribe.animate.set_color(RED))
            self.wait_until_bookmark("2")
            self.play(like.animate.set_color(RED))
            self.wait_until_bookmark("3")
            self.play(share.animate.set_color(RED))

        with self.voiceover(text="""I'll see you in the next episode, bye""") as trk:
            pass

        self.play(*[FadeOut(x) for x in self.mobjects])
        self.wait(2)
n) 28 | tiled_flops = to_flops(tiled_times, n) 29 | cublas_flops = to_flops(cublas_times, n) 30 | 31 | ax = Axes( 32 | x_range=[ps[0] , ps[-1], 2], 33 | y_range=[0, 380000, 50000], 34 | x_axis_config={"scaling": LogBase(2)}, 35 | axis_config={"include_numbers": True}).shift(0.4*RIGHT) 36 | 37 | labels = ax.get_axis_labels(x_label="n", y_label="Throughput[GFLOPS]") 38 | 39 | normal_graph = ax.plot_line_graph( 40 | x_values=n, 41 | y_values=normal_flops, 42 | line_color=RED, 43 | add_vertex_dots=False 44 | ) 45 | 46 | tiled_graph = ax.plot_line_graph( 47 | x_values=n, 48 | y_values=tiled_flops, 49 | line_color=BLUE, 50 | add_vertex_dots=False 51 | ) 52 | 53 | cublas_graph = ax.plot_line_graph( 54 | x_values=n, 55 | y_values=cublas_flops, 56 | line_color=GREEN, 57 | add_vertex_dots=False 58 | ) 59 | normal_label = Text("Tensor Cores", font_size=32, color=RED).next_to(labels[1], DOWN, aligned_edge=LEFT) 60 | normal_label.shift(0.1*DOWN).align_to(labels[0], RIGHT) 61 | tiled_label = Text("Hierarchical\nTiling", font_size=32, color=BLUE).next_to(normal_label, DOWN, aligned_edge=LEFT) 62 | cublas_label = Text("cuBLAS", font_size=32, color=GREEN).next_to(tiled_label, DOWN, aligned_edge=LEFT) 63 | with self.voiceover(text="""Last time we measured how much FLOPs we were gettin with our tensor cores kernel""") as trk: 64 | self.play(Create(ax), Write(labels)) 65 | self.play(Create(normal_graph)) 66 | self.play(Write(normal_label)) 67 | 68 | 69 | theoretical_max_tc = ax.plot_line_graph( 70 | x_values=n, 71 | y_values=([330000] * len(cublas_flops)), 72 | line_color=GOLD, 73 | add_vertex_dots=False 74 | ) 75 | 76 | theoretical_max_tc_t = Text("Tensor Core theoretical maximum (330 TFLOPS)", color=GOLD, font_size=24).next_to(theoretical_max_tc, DOWN) 77 | 78 | with self.voiceover(text="""and compared it to what is the theoretical maximum of our GPU""") as trk: 79 | self.play(Create(theoretical_max_tc), Write(theoretical_max_tc_t)) 80 | 81 | with self.voiceover(text="""By utilizing 
hierarchical tiling, we can get much higher throughput, even up to 4.5x higher than 82 | we were getting with our original kernel""") as trk: 83 | self.play(Create(tiled_graph)) 84 | self.play(Write(tiled_label)) 85 | self.wait(1) 86 | 87 | with self.voiceover(text="""But that's still off from our theoretical maximum, and this is something that we will never achieve""") as trk: 88 | pass 89 | 90 | with self.voiceover(text="""To see what is the state of the art, we can run cublas kernels from nvidia, this will be our reference point from now on""") as trk: 91 | self.play(Create(cublas_graph)) 92 | self.play(Write(cublas_label)) 93 | self.wait(1) 94 | 95 | with self.voiceover(text="""And in the next episode, we'll once again move closer to this line""") as trk: 96 | pass 97 | 98 | self.play(*[FadeOut(x) for x in self.mobjects]) 99 | 100 | 101 | # template 102 | # __global__ void tensor_core_matmul_reg_smem(int n_elem, half* a, half* b, half* c) 103 | # { 104 | # const int32_t warpM = (blockIdx.x*blockDim.x+threadIdx.x)/32; 105 | # const int32_t warpN = blockIdx.y*blockDim.y+threadIdx.y; 106 | # const int32_t laneM = threadIdx.x/32; 107 | # const int32_t laneN = threadIdx.y; 108 | # 109 | # extern __shared__ char smem[]; 110 | # 111 | # half (*a_smem)[WMMA_MKN*WMMA_MKN] 112 | # = reinterpret_cast(smem); 113 | # half (*b_smem)[WMMA_MKN*WMMA_MKN] 114 | # = reinterpret_cast( 115 | # smem + SM_TILES*WMMA_MKN*WMMA_MKN*sizeof(half)); 116 | # 117 | # nvcuda::wmma::fragment a_frag[OUT_TILES]; 118 | # nvcuda::wmma::fragment b_frag; 119 | # nvcuda::wmma::fragment acc[OUT_TILES][OUT_TILES]; 120 | # 121 | # for(int32_t i = 0; i(&a_smem[i/(WMMA_MKN*WMMA_MKN)][i%(WMMA_MKN*WMMA_MKN)])[0] 139 | # = reinterpret_cast(&a_curr[(i/WMMA_MKN)*n_elem + i%WMMA_MKN])[0]; 140 | # reinterpret_cast(&b_smem[(i/WMMA_MKN)%SM_TILES][(i/(SM_TILES*WMMA_MKN))*WMMA_MKN + i%(WMMA_MKN)])[0] 141 | # = reinterpret_cast(&b_curr[(i/(SM_TILES*WMMA_MKN))*n_elem + i%(SM_TILES*WMMA_MKN)])[0]; 142 | # } 143 | # 144 
| # __syncthreads(); 145 | # for (int n = 0; n < OUT_TILES; n++) 146 | # { 147 | # int32_t a_row = matrix_a_row + n*WMMA_MKN; 148 | # int32_t a_col = tile + k*WMMA_MKN; 149 | # if(a_row < n_elem && a_col < n_elem) 150 | # { 151 | # nvcuda::wmma::load_matrix_sync(a_frag[n], a_smem[laneM*OUT_TILES + n], WMMA_MKN); 152 | # } 153 | # } 154 | # for (int n = 0; n < OUT_TILES; n++) 155 | # { 156 | # int32_t b_col = matrix_b_col + (n)*WMMA_MKN; 157 | # int32_t b_row = tile + k*WMMA_MKN; 158 | # if (b_row < n_elem && b_col < n_elem) 159 | # { 160 | # nvcuda::wmma::load_matrix_sync(b_frag, b_smem[laneN*OUT_TILES + n], WMMA_MKN); 161 | # for (int m = 0; m < OUT_TILES; m++) 162 | # { 163 | # nvcuda::wmma::mma_sync(acc[m][n], a_frag[m], b_frag, acc[m][n]); 164 | # } 165 | # } 166 | # } 167 | # __syncthreads(); 168 | # } 169 | # } 170 | # 171 | # for(int32_t i = 0; i acc[OUT_TILES][OUT_TILES]; 196 | for(int32_t i = 0; i(a_smem_curr)[0] 228 | = reinterpret_cast(a_gmem_curr)[0]; 229 | 230 | half* b_smem_curr = &b_smem[(i/WMMA_MKN)%SM_TILES][(i/(SM_TILES*WMMA_MKN))*WMMA_MKN + i%(WMMA_MKN)]; 231 | half* b_gmem_curr = &b_curr[(i/(SM_TILES*WMMA_MKN))*n_elem + i%(SM_TILES*WMMA_MKN)]; 232 | reinterpret_cast(b_smem_curr)[0] 233 | = reinterpret_cast(b_gmem_curr)[0]; 234 | }""" 235 | self.play(Transform(code_obj, create_code(code))) 236 | wait_timestamp() 237 | 238 | code = """for (int32_t tile = 0; tile < n_elem; tile+=OUT_TILES*WMMA_MKN) 239 | { 240 | for (int k = 0; k < OUT_TILES; k++) 241 | { 242 | half* a_curr = a + blockIdx.x*SM_TILES*WMMA_MKN*n_elem + tile + k*WMMA_MKN; 243 | half* b_curr = b + (k*WMMA_MKN+tile)*n_elem + blockIdx.y*SM_TILES*WMMA_MKN; 244 | load_tiles(); 245 | __syncthreads(); 246 | for (int n = 0; n < OUT_TILES; n++) 247 | { 248 | nvcuda::wmma::load_matrix_sync(a_frag[n], a_smem[laneM*OUT_TILES + n], WMMA_MKN); 249 | } 250 | for (int n = 0; n < OUT_TILES; n++) 251 | { 252 | nvcuda::wmma::load_matrix_sync(b_frag, b_smem[laneN*OUT_TILES + n], WMMA_MKN); 253 | for (int 
m = 0; m < OUT_TILES; m++) 254 | { 255 | nvcuda::wmma::mma_sync(acc[m][n], a_frag[m], b_frag, acc[m][n]); 256 | } 257 | } 258 | __syncthreads(); 259 | } 260 | }""" 261 | self.play(Transform(code_obj, create_code(code))) 262 | code_obj = create_code(code) 263 | hl = SurroundingRectangle(code_obj.code[10], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 264 | self.play(Create(hl)) 265 | wait_timestamp() 266 | 267 | hl_t = SurroundingRectangle(code_obj.code[14], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 268 | self.play(Transform(hl, hl_t)) 269 | wait_timestamp() 270 | 271 | hl_t = SurroundingRectangle(code_obj.code[17], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 272 | self.play(Transform(hl, hl_t)) 273 | wait_timestamp() 274 | 275 | hl_t = SurroundingRectangle(code_obj.code[15], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 276 | self.play(Transform(hl, hl_t)) 277 | 278 | self.wait(0.2) 279 | 280 | hl_t = SurroundingRectangle(code_obj.code[17], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 281 | self.play(Transform(hl, hl_t)) 282 | wait_timestamp() 283 | 284 | hl_t = SurroundingRectangle(code_obj.code[14], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 285 | self.play(Transform(hl, hl_t)) 286 | wait_timestamp() 287 | 288 | hl_t = SurroundingRectangle(code_obj.code[15:19], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 289 | self.play(Transform(hl, hl_t)) 290 | wait_timestamp() 291 | 292 | hl_t = SurroundingRectangle(code_obj.code[2], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 293 | self.play(Transform(hl, hl_t)) 294 | wait_timestamp() 295 | 296 | hl_t = SurroundingRectangle(code_obj.code[4:7], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 297 | self.play(Transform(hl, hl_t)) 298 | wait_timestamp() 299 | 300 | hl_t = SurroundingRectangle(code_obj.code[8:12], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 301 | self.play(Transform(hl, 
hl_t)) 302 | wait_timestamp() 303 | 304 | hl_t = SurroundingRectangle(code_obj.code[12:20], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 305 | self.play(Transform(hl, hl_t)) 306 | wait_timestamp() 307 | for tile in range(1, 4): 308 | for c in range(2): 309 | hl_t = SurroundingRectangle(code_obj.code[4:7], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 310 | self.play(Transform(hl, hl_t)) 311 | wait_timestamp() 312 | 313 | hl_t = SurroundingRectangle(code_obj.code[8:12], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 314 | self.play(Transform(hl, hl_t)) 315 | wait_timestamp() 316 | 317 | hl_t = SurroundingRectangle(code_obj.code[12:20], buff=0.03, stroke_width=2, fill_opacity=0.3, color=YELLOW) 318 | self.play(Transform(hl, hl_t)) 319 | wait_timestamp() 320 | self.play(*[FadeOut(x) for x in self.mobjects]) 321 | -------------------------------------------------------------------------------- /manim_scripts/MoE.py: -------------------------------------------------------------------------------- 1 | import os 2 | from manimlib import * 3 | from math import radians 4 | 5 | class MoE(Scene): 6 | def construct(self): 7 | 8 | vector2 = TexMatrix([["x_1"], ["x_2"], ["x_3"], ["x_4"], ["\\vdots"], ["x_n"]]).to_edge(LEFT) 9 | 10 | dot = Tex("\\cdot").next_to(vector2) 11 | 12 | mat = TexMatrix([["w_{0,0}", "w_{0,1}", "\\cdots", "w_{0,n}"], 13 | ["w_{1,0}", "w_{1,1}", "\\cdots", "w_{1,n}"], 14 | ["\\vdots", "\\vdots", "\\ddots", "\\vdots"], 15 | ["w_{m,0}", "w_{m,1}", "\\cdots", "w_{m,n}"]]).next_to(dot) 16 | eq = Tex("=").next_to(mat) 17 | vector3 = TexMatrix([["x_1"], ["x_2"], ["x_3"], ["x_4"], ["\\vdots"], ["x_m"]]).next_to(eq) 18 | 19 | self.play(ShowCreation(vector2)) 20 | self.play(ShowCreation(mat), ShowCreation(dot)) 21 | self.play(ShowCreation(vector3), ShowCreation(eq)) 22 | self.wait() 23 | 24 | #fade out signs 25 | self.play(FadeOut(eq), FadeOut(dot)) 26 | 27 | 28 | #experts 29 | mats = [mat.copy().shift(2*x*IN + 16*OUT) for x 
in range(16)] 30 | bgs = [] 31 | for i, m in enumerate(list(reversed(mats))): 32 | m.set_z_index(i*2 + 1) 33 | bg = Rectangle(m.get_width(), m.get_height(), color=BLACK, fill_color=BLACK, fill_opacity=1).move_to(m).shift(0.1*IN).set_z_index(i*2) 34 | bgs.append(bg) 35 | 36 | self.play(self.frame.animate.move_to([-0.56036127, 0.8495176, 1.0232906]).set_euler_angles(-3.14159265, 0.26179939, 3.11379317).set_shape(53.023605, 29.802404)) 37 | self.play(vector2.animate.shift(4*LEFT), FadeOut(vector3)) 38 | self.play(*[ShowCreation(m) for m in mats + bgs]) 39 | self.wait() 40 | 41 | vector3.shift(4*RIGHT) 42 | 43 | #show mapping 44 | active = [0, 4, 8, 15] 45 | lines1 = [] 46 | lines2 = [] 47 | outputs = [] 48 | anims = [] 49 | for a in active: 50 | lines1.append(Line(vector2.get_corner(RIGHT), mats[a].get_corner(LEFT), z_index=100)) 51 | outputs.append(vector3.copy().set_z(mats[a].get_z())) 52 | lines2.append(Line(mats[a].get_corner(RIGHT), outputs[-1].get_corner(LEFT))) 53 | anims.append(mats[a].animate.set_color(YELLOW)) 54 | self.play(*[ShowCreation(x) for x in lines1], *anims) 55 | self.play(*[ShowCreation(x) for x in lines2 + outputs]) 56 | self.wait() 57 | 58 | # more tokens 59 | toks = [vector2.copy().shift(2*x*IN + 16*OUT) for x in range(16)] 60 | bgs2 = [] 61 | for i, m in enumerate(list(reversed(toks))): 62 | m.set_z_index(i*2 + 1) 63 | bg = Rectangle(m.get_width(), m.get_height(), color=BLACK, fill_color=BLACK, fill_opacity=1).move_to(m).shift(0.1*IN).set_z_index(i*2) 64 | bgs2.append(bg) 65 | 66 | self.play(*[ShowCreation(x) for x in toks + bgs2]) 67 | self.wait() 68 | 69 | #create more mappings 70 | lines3 = [] 71 | for t in toks: 72 | for _ in range(4): 73 | a = random.randint(0, 15) 74 | lines3.append(Line(t.get_corner(RIGHT), mats[a].get_corner(LEFT), z_index=100)) 75 | self.play(*[ShowCreation(x) for x in lines3]) 76 | self.wait() 77 | 78 | 79 | # show isolated kernel 80 | self.play(*[FadeOut(x) for x in lines3 + lines1 + lines2 + outputs]) 81 | 82 | 
# A customizable Sequential Neural Network
# Copied and edited based on https://www.youtube.com/watch?v=HnIeAP--vWc
class NeuralNetworkMobject(VGroup):
    """Drawing of a fully connected network: one column of circles per layer,
    with an edge between every pair of neurons in consecutive layers.

    ``neural_network`` is a list of layer sizes. Layers larger than their
    entry in ``max_shown_neurons`` are truncated and get a ``\\vdots`` marker
    (and optionally a brace labelled with the real size).

    Every key of ``CONFIG`` is copied onto the instance as an attribute in
    ``__init__``; the keys are therefore part of the runtime interface and are
    kept verbatim (including their spelling).
    """

    CONFIG = {
        "neuron_radius": 0.1,
        "neuron_to_neuron_buff": SMALL_BUFF,
        "layer_to_layer_buff": LARGE_BUFF*2,
        "output_neuron_color": WHITE,
        "input_neuron_color": WHITE,
        "hidden_layer_neuron_color": WHITE,
        "neuron_stroke_width": 2,
        "neuron_fill_color": GREEN,
        "edge_color": LIGHT_GREY,
        "edge_stroke_width": 2,
        "edge_propogation_color": YELLOW,
        "edge_propogation_time": 1,
        "max_shown_neurons": [20,16,12,10],
        "brace_for_large_layers": True,
        "average_shown_activation_of_large_layer": True,
        "include_output_labels": False,
        "arrow": False,
        "arrow_tip_size": 0.1,
        "left_size": 1,
        "neuron_fill_opacity": 1
    }

    def __init__(self, neural_network, *args, **kwargs):
        """Build neurons and edges for the given list of layer sizes.

        Args:
            neural_network: sequence of ints, one neuron count per layer.
            *args, **kwargs: forwarded unchanged to ``VGroup.__init__``.
        """
        VGroup.__init__(self, *args, **kwargs)
        self.layer_sizes = neural_network
        self.__dict__.update(self.CONFIG)
        self.neurons = []   # per layer: list of Circle (plus the dots marker if truncated)
        self.edges = []     # per layer pair: list of edge mobjects
        self.braces = []    # (brace, brace_label) for every truncated layer
        self.neuron_to_input = defaultdict(list)  # neuron -> edges arriving at it
        self.add_neurons()
        self.add_edges()
        self.add_to_back(self.layers)

    def add_neurons(self):
        """Create one layer group per entry of ``layer_sizes`` and lay them out left to right."""
        caps = self.max_shown_neurons
        layer_mobs = []
        for index, size in enumerate(self.layer_sizes):
            if index < len(caps):
                layer_mobs.append(self.get_layer(size, index, caps[index]))
            else:
                # More layers than configured caps: fall back to get_layer's
                # own default instead of raising IndexError.
                layer_mobs.append(self.get_layer(size, index))
        layers = VGroup(*layer_mobs)
        layers.arrange_submobjects(RIGHT, buff=self.layer_to_layer_buff)
        self.layers = layers
        if self.include_output_labels:
            # No label texts are available here, so the method's index-based
            # fallback is used (previously this call raised TypeError).
            self.label_outputs_text()

    def get_nn_fill_color(self, index):
        """Return the stroke colour for layer ``index`` (output, input or hidden)."""
        if index == -1 or index == len(self.layer_sizes) - 1:
            return self.output_neuron_color
        if index == 0:
            return self.input_neuron_color
        else:
            return self.hidden_layer_neuron_color

    def get_layer(self, size, index=-1, max_shown_neurons=16):
        """Build the VGroup for one layer.

        Args:
            size: real number of neurons in the layer.
            index: position of the layer, used only to pick the stroke colour.
            max_shown_neurons: at most this many circles are drawn.

        Returns:
            A VGroup with a ``neurons`` attribute and, for truncated layers,
            ``dots`` (and ``brace`` / ``brace_label`` when braces are enabled).
        """
        layer = VGroup()
        n_neurons = size
        self.neurons.append([])
        if n_neurons > max_shown_neurons:
            n_neurons = max_shown_neurons
        for x in range(n_neurons):
            self.neurons[-1].append(Circle(
                radius=self.neuron_radius,
                stroke_color=self.get_nn_fill_color(index),
                stroke_width=self.neuron_stroke_width,
                fill_color=BLACK,
                fill_opacity=self.neuron_fill_opacity,
            ))
        neurons = VGroup(*self.neurons[-1])
        neurons.arrange_submobjects(
            DOWN, buff=self.neuron_to_neuron_buff
        )
        for neuron in neurons:
            neuron.edges_in = VGroup()
            neuron.edges_out = VGroup()
        layer.neurons = neurons
        layer.add(neurons)

        if size > n_neurons:
            # Truncated layer: open a gap in the middle of the column and put
            # a vertical-dots marker there.
            dots = Tex("\\vdots")
            dots.move_to(neurons)
            self.neurons[-1].insert(len(neurons)//2, dots)
            VGroup(*neurons[:len(neurons) // 2]).next_to(
                dots, UP, MED_SMALL_BUFF
            )
            VGroup(*neurons[len(neurons) // 2:]).next_to(
                dots, DOWN, MED_SMALL_BUFF
            )
            layer.dots = dots
            layer.add(dots)
            if self.brace_for_large_layers:
                brace = Brace(layer, UP, buff=0.1)
                brace_label = brace.get_tex(str(size)).scale(0.8)
                layer.brace = brace
                layer.brace_label = brace_label
                # Braces are collected but not added to the layer itself.
                self.braces.append((brace, brace_label))

        return layer

    def add_edges(self):
        """Connect every neuron of each layer to every neuron of the next one."""
        self.edge_groups = VGroup()
        for l1, l2 in zip(self.layers[:-1], self.layers[1:]):
            self.edges.append([])
            edge_group = VGroup()
            for n1, n2 in it.product(l1.neurons, l2.neurons):
                edge = self.get_edge(n1, n2)
                edge_group.add(edge)
                n1.edges_out.add(edge)
                n2.edges_in.add(edge)
            self.edge_groups.add(edge_group)
        self.add_to_back(self.edge_groups)

    def get_edge(self, neuron1, neuron2):
        """Create the edge from ``neuron1`` to ``neuron2`` and record it.

        The edge is an ``Arrow`` when ``self.arrow`` is set, otherwise a
        ``Line``. In both cases it is appended to the current entry of
        ``self.edges`` and to ``self.neuron_to_input[neuron2]`` (arrow edges
        used to be returned without being recorded).
        """
        if self.arrow:
            edge = Arrow(
                neuron1.get_center(),
                neuron2.get_center(),
                buff=self.neuron_radius,
                stroke_color=self.edge_color,
                stroke_width=self.edge_stroke_width,
                tip_length=self.arrow_tip_size
            )
        else:
            edge = Line(
                neuron1.get_center(),
                neuron2.get_center(),
                buff=self.neuron_radius,
                stroke_color=self.edge_color,
                stroke_width=self.edge_stroke_width,
            )
        self.edges[-1].append(edge)
        self.neuron_to_input[neuron2].append(edge)
        return edge

    def label_inputs(self, l):
        """Label each input neuron ``l_1 .. l_n``; ``l`` is a char or LaTeX string.

        Note: the labels are stored in ``self.output_labels``, replacing any
        group stored there by another ``label_*`` call.
        """
        self.output_labels = VGroup()
        for n, neuron in enumerate(self.layers[0].neurons):
            label = Tex(f"{l}_"+"{"+f"{n + 1}"+"}")
            label.set_height(0.3 * neuron.get_height())
            label.move_to(neuron)
            self.output_labels.add(label)
        self.add(self.output_labels)

    def label_outputs(self, l):
        """Label each output neuron ``l_1 .. l_n``; ``l`` is a char or LaTeX string."""
        self.output_labels = VGroup()
        for n, neuron in enumerate(self.layers[-1].neurons):
            label = Tex(f"{l}_"+"{"+f"{n + 1}"+"}")
            label.set_height(0.4 * neuron.get_height())
            label.move_to(neuron)
            self.output_labels.add(label)
        self.add(self.output_labels)

    def label_outputs_text(self, outputs=None):
        """Put a text label to the right of each output neuron.

        Args:
            outputs: sequence of label strings indexed by neuron position.
                When omitted, each neuron is labelled with its zero-based
                index.
        """
        self.output_labels = VGroup()
        for n, neuron in enumerate(self.layers[-1].neurons):
            text = str(n) if outputs is None else outputs[n]
            label = Tex(text)
            label.set_height(0.75*neuron.get_height())
            label.move_to(neuron)
            label.shift((neuron.get_width() + label.get_width()/2)*RIGHT)
            self.output_labels.add(label)
        self.add(self.output_labels)

    def label_hidden_layers(self, l):
        """Label the neurons of every hidden layer ``l_1 .. l_n``.

        Note: the labels are stored in ``self.output_labels``, replacing any
        group stored there by another ``label_*`` call.
        """
        self.output_labels = VGroup()
        for layer in self.layers[1:-1]:
            for n, neuron in enumerate(layer.neurons):
                # Braces keep multi-digit indices inside the subscript, as in
                # label_inputs / label_outputs.
                label = Tex(f"{l}_"+"{"+f"{n + 1}"+"}")
                label.set_height(0.4 * neuron.get_height())
                label.move_to(neuron)
                self.output_labels.add(label)
        self.add(self.output_labels)
class Quantization(Scene):
    """Animation explaining block quantization: the FP16 range, a 16-bucket
    Q4 grid that is scaled and shifted onto the values, and the bit budget of
    a quantized block."""

    def construct(self):
        def marching(top, bottom, step, count):
            """Advance ``top`` and ``bottom`` to the right by ``step``, ``count`` times.

            Both arrays are moved in place and yielded after every step, so
            each pair has to be consumed before the next one is requested.
            """
            for _ in range(count):
                top[0] += step
                bottom[0] += step
                yield top, bottom

        # Title card
        title = Text("QUANTIZATION").scale(3)
        self.play(Write(title))
        self.wait()
        self.play(*[FadeOut(mob) for mob in self.mobjects])

        # Axes the ranges are drawn on
        ax = Axes(
            x_range=(-70000, 70000, 10000),
            y_range=(-0.1, 1.1, 0.1),
            height=6,
            width=10,
        )
        self.play(ShowCreation(ax))

        # FP16 range rectangle
        w = (ax.c2p(65504, 0) - ax.c2p(-65504, 0))[0]
        h = (ax.c2p(0, 1.1) - ax.c2p(0, 0))[1]
        fp16_box = Rectangle(w, h, color=GREEN)
        print(w,h)
        fp16_text = Text("FP16 range (-65504, 65504)").next_to(fp16_box, UP)
        self.play(ShowCreation(fp16_box), Write(fp16_text))
        self.wait()

        # Q4 box: same width, lower height, split into 16 buckets by 15 dividers
        w = (ax.c2p(65504, 0) - ax.c2p(-65504, 0))[0]
        h = (ax.c2p(0, 0.6) - ax.c2p(0, 0))[1]
        q4_box = Rectangle(w, h, color=RED).shift(DOWN)
        lines = [
            Line(top, bottom, color=RED, stroke_width=2)
            for top, bottom in marching(
                q4_box.get_corner(UL), q4_box.get_corner(DL), w/16, 15
            )
        ]

        q4_text = Text("Q4 representation").next_to(q4_box, UP)
        self.play(ShowCreation(q4_box), Write(q4_text))
        self.play(LaggedStart(*[ShowCreation(divider) for divider in lines]))
        q4 = VGroup(q4_box, *lines)
        self.wait()

        # Sample values: 24 ticks, of which the first 16 are shown right away
        rng = 20000
        tick_top = ax.c2p(-rng, 0.3)
        tick_bottom = ax.c2p(-rng, 0)
        w = (ax.c2p(rng, 0) - ax.c2p(-rng, 0))[0]
        values = [
            Line(top, bottom, color=ORANGE, stroke_width=4, z_index=5)
            for top, bottom in marching(tick_top, tick_bottom, w/16, 24)
        ]
        self.play(*[ShowCreation(tick) for tick in values[:16]])
        self.wait()

        # Scale: squeeze the Q4 grid onto the value range
        self.play(q4.animate.stretch_to_fit_width(w))
        self.wait()

        # Values that are not centred around 0: drop the first 8 ...
        self.play(*[Uncreate(tick) for tick in values[:8]])
        self.wait()

        # ... and reveal the remaining ones
        self.play(*[ShowCreation(tick) for tick in values[16:]])
        self.wait()

        # Zero point: shift the grid
        self.play(q4.animate.shift(w/2 * RIGHT))
        self.wait()

        # Quantized block layout
        self.play(*[FadeOut(mob) for mob in self.mobjects])
        block = Rectangle(width=8, height=2, color=BLUE)
        self.play(ShowCreation(block))
        scale_global = Text("FP16 scale & FP16 shift", color=BLUE).next_to(block, UP)
        self.play(Write(scale_global))
        self.wait()

        w = 8
        lines = []
        scale_blocks = []
        for top, bottom in marching(block.get_corner(UL), block.get_corner(DL), w/8, 8):
            lines.append(Line(top, bottom, color=BLUE))
            caption = Text("6Bit scale\n6Bit shift").scale(0.35)
            scale_blocks.append(caption.move_to(bottom + 0.5*LEFT + 0.5*DOWN))

        self.play(LaggedStart(*[ShowCreation(divider) for divider in lines]))
        self.wait()

        self.play(LaggedStart(*[Write(caption) for caption in scale_blocks]))
        self.wait()

        # Summary column of bit counts, all aligned on the x of the first entry
        x = Text("32 bits").next_to(scale_global).shift(2*RIGHT)
        self.play(Write(x))
        self.wait()
        center_x = x.get_center()[0]

        def aligned(label):
            """Move ``label`` horizontally onto ``center_x``, keeping y and z."""
            target = label.get_center().copy()
            target[0] = center_x
            label.move_to(target)
            return label

        y = aligned(Text("256x4bits").next_to(lines[-1], RIGHT))
        self.play(Write(y))
        self.wait()

        z = aligned(Text("8x12bits").next_to(scale_blocks[-1], RIGHT))
        self.play(Write(z))
        self.wait()

        equation = [
            Text(symbol).next_to(anchor, DOWN).shift(0.2*DOWN)
            for symbol, anchor in zip("++=", (x, y, z))
        ]
        self.play(*[Write(sign) for sign in equation])

        result = Text(f"{32+(4*256)+(12*8)} bits").next_to(equation[-1], DOWN)
        self.play(Write(result))
        self.wait()
axis_config={"include_numbers": True}).shift(0.4*RIGHT) 34 | 35 | labels = ax.get_axis_labels(x_label="n", y_label="Throughput[GFLOPS]") 36 | 37 | normal_graph = ax.plot_line_graph( 38 | x_values=n, 39 | y_values=normal_flops, 40 | line_color=RED, 41 | add_vertex_dots=False 42 | ) 43 | 44 | tiled_graph = ax.plot_line_graph( 45 | x_values=n, 46 | y_values=tiled_flops, 47 | line_color=BLUE, 48 | add_vertex_dots=False 49 | ) 50 | 51 | tc_graph = ax.plot_line_graph( 52 | x_values=n, 53 | y_values=tc_flops, 54 | line_color=GREEN, 55 | add_vertex_dots=False 56 | ) 57 | tc_label = Text("TensorCores", font_size=32, color=GREEN).next_to(labels[1], DOWN, aligned_edge=LEFT) 58 | tc_label.shift(0.1*DOWN).align_to(labels[0], RIGHT) 59 | tiled_label = Text("Tiled Matmul", font_size=32, color=BLUE).next_to(tc_label, DOWN, aligned_edge=LEFT) 60 | normal_label = Text("Naive Matmul", font_size=32, color=RED).next_to(tiled_label, DOWN, aligned_edge=LEFT) 61 | with self.voiceover(text="""If we were to graph the throughput that we are getting with tensor cores""") as trk: 62 | self.play(Create(ax), Write(labels)) 63 | 64 | with self.voiceover(text="""We can see that just by utilizing tensor cores, we are getting an algorithm 65 | that's 6 times faster than our tiled kernel that we wrote a while back""") as trk: 66 | self.play(Create(tc_graph), Create(tiled_graph), Create(normal_graph)) 67 | self.play(Write(tc_label), Write(tiled_label), Write(normal_label)) 68 | self.wait(1) 69 | 70 | ax_t = Axes( 71 | x_range=[ps[0] , ps[-1], 2], 72 | y_range=[0, 380000, 50000], 73 | x_axis_config={"scaling": LogBase(2)}, 74 | axis_config={"include_numbers": True}).shift(0.4*RIGHT) 75 | 76 | normal_graph_t = ax_t.plot_line_graph( 77 | x_values=n, 78 | y_values=normal_flops, 79 | line_color=RED, 80 | add_vertex_dots=False 81 | ) 82 | 83 | tiled_graph_t = ax_t.plot_line_graph( 84 | x_values=n, 85 | y_values=tiled_flops, 86 | line_color=BLUE, 87 | add_vertex_dots=False 88 | ) 89 | 90 | tc_graph_t = 
ax_t.plot_line_graph( 91 | x_values=n, 92 | y_values=tc_flops, 93 | line_color=GREEN, 94 | add_vertex_dots=False 95 | ) 96 | 97 | theoretical_max_tc = ax_t.plot_line_graph( 98 | x_values=n, 99 | y_values=([330000] * len(tc_flops)), 100 | line_color=GOLD, 101 | add_vertex_dots=False 102 | ) 103 | 104 | theoretical_max_tc_t = Text("Tensor Core theoretical maximum (330 TFLOPS)", color=GOLD, font_size=24).next_to(theoretical_max_tc, DOWN) 105 | 106 | theoretical_max = ax_t.plot_line_graph( 107 | x_values=n, 108 | y_values=([86000] * len(tc_flops)), 109 | line_color=YELLOW, 110 | add_vertex_dots=False 111 | ) 112 | 113 | theoretical_max_t = Text("Cuda Cores theoretical maximum (86 TFLOPS)", color=YELLOW, font_size=24).next_to(theoretical_max, DOWN) 114 | 115 | with self.voiceover(text="""But to be completly honest with you, if we zoom out our graph we can see 116 | that we are only slightly above 10% of what the hardware is capable of""") as trk: 117 | self.play(Transform(ax, ax_t), Transform(tc_graph, tc_graph_t), 118 | Transform(normal_graph, normal_graph_t), Transform(tiled_graph, tiled_graph_t)) 119 | 120 | self.play(Create(theoretical_max_tc), Write(theoretical_max_tc_t)) 121 | with self.voiceover(text="""In fact we have not even reached the theoretical maximum of what 122 | the GPU can do without using tensor cores. 
But filling this gap will be the topic of future videos""") as trk: 123 | self.play(Create(theoretical_max), Write(theoretical_max_t)) 124 | 125 | self.play(*[FadeOut(x) for x in self.mobjects]) 126 | self.wait(2) 127 | 128 | class TensorCoresCode2(Scene): 129 | def construct(self): 130 | code = """using layout = nvcuda::wmma::row_major; 131 | nvcuda::wmma::fragment a_frag; 132 | nvcuda::wmma::fragment b_frag; 133 | nvcuda::wmma::fragment acc;""" 134 | code_obj = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, insert_line_no=False, font_size=12, corner_radius=0.1) 135 | code_obj.code = remove_invisible_chars(code_obj.code) 136 | hl_a = SurroundingRectangle(code_obj.code[1], buff=0.03, stroke_width=2, fill_opacity=0.3, color=BLUE) 137 | hl_b = SurroundingRectangle(code_obj.code[2], buff=0.03, stroke_width=2, fill_opacity=0.3, color=ORANGE) 138 | hl_acc = SurroundingRectangle(code_obj.code[3], buff=0.03, stroke_width=2, fill_opacity=0.3, color=GREEN) 139 | self.play(Create(code_obj)) 140 | self.play(Create(hl_a), Create(hl_b), Create(hl_acc)) 141 | 142 | self.wait(1) 143 | 144 | code = """constexpr int M = 16; 145 | constexpr int K = 16; 146 | constexpr int N = 16; 147 | using layout = nvcuda::wmma::row_major; 148 | nvcuda::wmma::fragment a_frag; 149 | nvcuda::wmma::fragment b_frag; 150 | nvcuda::wmma::fragment acc;""" 151 | code_obj_t = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, insert_line_no=False, font_size=12, corner_radius=0.1) 152 | code_obj_t.code = remove_invisible_chars(code_obj_t.code) 153 | hl_a_t = SurroundingRectangle(code_obj_t.code[4], buff=0.03, stroke_width=2, fill_opacity=0.3, color=BLUE) 154 | hl_b_t = SurroundingRectangle(code_obj_t.code[5], buff=0.03, stroke_width=2, fill_opacity=0.3, color=ORANGE) 155 | hl_acc_t = SurroundingRectangle(code_obj_t.code[6], buff=0.03, stroke_width=2, fill_opacity=0.3, color=GREEN) 156 | self.play(Transform(code_obj, 
code_obj_t), Transform(hl_a, hl_a_t), Transform(hl_b, hl_b_t), Transform(hl_acc, hl_acc_t)) 157 | 158 | self.wait(1) 159 | 160 | code = """constexpr int M = 32; 161 | constexpr int K = 8; 162 | constexpr int N = 16; 163 | using layout = nvcuda::wmma::row_major; 164 | nvcuda::wmma::fragment a_frag; 165 | nvcuda::wmma::fragment b_frag; 166 | nvcuda::wmma::fragment acc;""" 167 | code_obj_t = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, insert_line_no=False, font_size=12, corner_radius=0.1) 168 | code_obj_t.code = remove_invisible_chars(code_obj_t.code) 169 | hl_a_t = SurroundingRectangle(code_obj_t.code[4], buff=0.03, stroke_width=2, fill_opacity=0.3, color=BLUE) 170 | hl_b_t = SurroundingRectangle(code_obj_t.code[5], buff=0.03, stroke_width=2, fill_opacity=0.3, color=ORANGE) 171 | hl_acc_t = SurroundingRectangle(code_obj_t.code[6], buff=0.03, stroke_width=2, fill_opacity=0.3, color=GREEN) 172 | self.play(Transform(code_obj, code_obj_t), Transform(hl_a, hl_a_t), Transform(hl_b, hl_b_t), Transform(hl_acc, hl_acc_t)) 173 | 174 | self.wait(1) 175 | 176 | code = """constexpr int M = 8; 177 | constexpr int K = 32; 178 | constexpr int N = 16; 179 | using layout = nvcuda::wmma::row_major; 180 | nvcuda::wmma::fragment a_frag; 181 | nvcuda::wmma::fragment b_frag; 182 | nvcuda::wmma::fragment acc;""" 183 | code_obj_t = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, insert_line_no=False, font_size=12, corner_radius=0.1) 184 | code_obj_t.code = remove_invisible_chars(code_obj_t.code) 185 | self.play(Transform(code_obj, code_obj_t)) 186 | 187 | 188 | self.wait(1) 189 | 190 | code = """constexpr int MKN = 16; 191 | using layout = nvcuda::wmma::row_major; 192 | nvcuda::wmma::fragment a_frag; 193 | nvcuda::wmma::fragment b_frag; 194 | nvcuda::wmma::fragment acc; 195 | """ 196 | code_obj_t = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, 
insert_line_no=False, font_size=12, corner_radius=0.1) 197 | code_obj_t.code = remove_invisible_chars(code_obj_t.code) 198 | hl_a_t = SurroundingRectangle(code_obj_t.code[2], buff=0.03, stroke_width=2, fill_opacity=0.3, color=BLUE) 199 | hl_b_t = SurroundingRectangle(code_obj_t.code[3], buff=0.03, stroke_width=2, fill_opacity=0.3, color=ORANGE) 200 | hl_acc_t = SurroundingRectangle(code_obj_t.code[4], buff=0.03, stroke_width=2, fill_opacity=0.3, color=GREEN) 201 | self.play(Transform(code_obj, code_obj_t), Transform(hl_a, hl_a_t), Transform(hl_b, hl_b_t), Transform(hl_acc, hl_acc_t)) 202 | 203 | self.wait(1) 204 | 205 | code = """constexpr int MKN = 16; 206 | using layout = nvcuda::wmma::row_major; 207 | nvcuda::wmma::fragment a_frag; 208 | nvcuda::wmma::fragment b_frag; 209 | nvcuda::wmma::fragment acc; 210 | 211 | nvcuda::wmma::fill_fragment(acc, 0); 212 | """ 213 | code_obj_t = Code(code=code, tab_width=2, language="c++", style='monokai', margin=0.1, line_spacing=0.7, insert_line_no=False, font_size=12, corner_radius=0.1) 214 | code_obj_t.code = remove_invisible_chars(code_obj_t.code) 215 | hl_a_t = SurroundingRectangle(code_obj_t.code[2], buff=0.03, stroke_width=2, fill_opacity=0.3, color=BLUE) 216 | hl_b_t = SurroundingRectangle(code_obj_t.code[3], buff=0.03, stroke_width=2, fill_opacity=0.3, color=ORANGE) 217 | hl_acc_t = SurroundingRectangle(code_obj_t.code[4], buff=0.03, stroke_width=2, fill_opacity=0.3, color=GREEN) 218 | self.play(Transform(code_obj, code_obj_t), Transform(hl_a, hl_a_t), Transform(hl_b, hl_b_t), Transform(hl_acc, hl_acc_t)) 219 | 220 | self.wait(1) 221 | 222 | 223 | code = """for (int32_t i = 0; i < n; i+= MKN) 224 | { 225 | const int32_t matrix_a_row = warpM * MKN; 226 | const int32_t matrix_b_col = warpN * MKN; 227 | 228 | if(matrix_a_row np.ndarray: 35 | try: 36 | return self.f(distance) 37 | except: 38 | logger.warning( 39 | "TimeInterpolator received weird input, there may be something wrong with the word boundaries." 
class VoiceoverTracker:
    """Class to track the progress of a voiceover in a scene."""

    # Splits the input text on bookmark tags while keeping the tags as parts.
    _BOOKMARK_SPLIT_RE = re.compile(r"(<bookmark\s*mark\s*=[\'\"]\w*[\"\']\s*/>)")
    # Matches a single bookmark tag and captures the value of its mark attribute.
    _BOOKMARK_MARK_RE = re.compile(r"<bookmark\s*mark\s*=[\'\"](.*)[\"\']\s*/>")

    def __init__(self, scene: "Scene", data: dict, cache_dir: str, dummy: bool = False):
        """Initializes a VoiceoverTracker object.

        Args:
            scene (Scene): The scene to which the voiceover belongs.
            data (dict): The voiceover data. ``final_audio`` is read unless
                ``dummy`` is set; ``word_boundaries`` enables bookmarks.
            cache_dir (str): Directory that ``data["final_audio"]`` is relative to.
            dummy (bool, optional): If True, the audio file is not read and a
                duration of 0.01 seconds is used. Defaults to False.
        """
        self.scene = scene
        self.data = data
        self.cache_dir = cache_dir
        self.duration = 0.01 if dummy else get_duration(Path(cache_dir) / self.data["final_audio"])
        last_t = scene.time
        if last_t is None:
            last_t = 0
        self.start_t = last_t
        self.end_t = last_t + self.duration

        if "word_boundaries" in self.data:
            self._process_bookmarks()

    def _get_fallback_word_boundaries(self):
        """
        Returns dummy word boundaries assuming a linear mapping between
        text and audio. Used when word boundaries are not available.
        """
        input_text = remove_bookmarks(self.data["input_text"])
        return [
            {
                "audio_offset": 0,
                "text_offset": 0,
                "word_length": len(input_text),
                "text": self.data["input_text"],
                "boundary_type": "Word",
            },
            {
                "audio_offset": self.duration * AUDIO_OFFSET_RESOLUTION,
                "text_offset": len(input_text),
                "word_length": 1,
                "text": ".",
                "boundary_type": "Word",
            },
        ]

    @classmethod
    def _split_bookmarks(cls, text: str):
        """Separates bookmark tags from the spoken text.

        Args:
            text (str): Input text, possibly containing ``<bookmark mark='X'/>`` tags.

        Returns:
            tuple: ``(content, distances)`` where ``content`` is the text with
            the tags removed and ``distances`` maps each mark to the number of
            content characters that precede its tag.
        """
        content = ""
        distances = {}
        for part in cls._BOOKMARK_SPLIT_RE.split(text):
            matched = cls._BOOKMARK_MARK_RE.match(part)
            if matched:
                distances[matched.group(1)] = len(content)
            else:
                content += part
        return content, distances

    def _process_bookmarks(self) -> None:
        """Computes ``bookmark_distances`` (in characters) and ``bookmark_times``
        (in scene time) for every bookmark of the input text."""
        self.bookmark_times = {}
        self.bookmark_distances = {}

        word_boundaries = self.data["word_boundaries"]
        if not word_boundaries or len(word_boundaries) < 2:
            logger.warning(
                f"Word boundaries for voiceover {self.data['input_text']} are not "
                "available or are insufficient. Using fallback word boundaries."
            )
            word_boundaries = self._get_fallback_word_boundaries()

        self.time_interpolator = TimeInterpolator(word_boundaries)

        net_text_len = len(remove_bookmarks(self.data["input_text"]))
        if "transcribed_text" in self.data:
            transcribed_text_len = len(self.data["transcribed_text"].strip())
        else:
            transcribed_text_len = net_text_len

        self.input_text = self.data["input_text"]

        # Mark bookmark distances
        self.content, self.bookmark_distances = self._split_bookmarks(self.input_text)

        for mark, dist in self.bookmark_distances.items():
            # Normalize text offset. A text made only of bookmarks has no
            # characters to scale by, so every bookmark sits at offset 0.
            if net_text_len:
                offset = dist * transcribed_text_len / net_text_len
            else:
                offset = 0
            elapsed = self.time_interpolator.interpolate(offset)
            self.bookmark_times[mark] = self.start_t + elapsed

    def get_remaining_duration(self, buff: float = 0.0) -> float:
        """Returns the remaining duration of the voiceover.

        Args:
            buff (float, optional): A buffer to add to the remaining duration. Defaults to 0.

        Returns:
            float: The remaining duration of the voiceover in seconds, never negative.
        """
        return max(self.end_t - self.scene.time + buff, 0)

    def _check_bookmarks(self):
        """Raises if bookmark times were never computed (no word boundaries in the data)."""
        if not hasattr(self, "bookmark_times"):
            raise Exception(
                "Word boundaries are required for timing with bookmarks. "
                "Manim Voiceover currently supports auto-transcription using OpenAI Whisper, "
                "but this is not enabled for each speech service by default. "
                "You can enable it by setting transcription_model='base' in your speech service initialization. "
                "If the performance of the base model is not satisfactory, you can use one of the larger models. "
                "See https://github.com/openai/whisper for a list of all the available models."
            )

    def time_until_bookmark(
        self, mark: str, buff: float = 0, limit: Optional[float] = None
    ) -> float:
        """Returns the time until a bookmark.

        Args:
            mark (str): The `mark` attribute of the bookmark to count up to.
            buff (float, optional): A buffer to add to the remaining duration, in seconds. Defaults to 0.
            limit (Optional[float], optional): A maximum value to return. Defaults to None.

        Returns:
            float: Seconds from the current scene time until the bookmark,
            never negative and capped at ``limit`` when given.

        Raises:
            Exception: If bookmarks are unavailable or ``mark`` is unknown.
        """
        self._check_bookmarks()
        if mark not in self.bookmark_times:
            raise Exception("There is no <bookmark mark='%s' />" % mark)
        result = max(self.bookmark_times[mark] - self.scene.time + buff, 0)
        if limit is not None:
            result = min(limit, result)
        return result
203 | """ 204 | self.speech_service = speech_service 205 | self.current_tracker = None 206 | # TODO not supported 207 | self.create_subcaption = False 208 | self.timestamps = [] 209 | 210 | def add_voiceover_text( 211 | self, 212 | text: str, 213 | subcaption: Optional[str] = None, 214 | max_subcaption_len: int = 70, 215 | subcaption_buff: float = 0.1, 216 | **kwargs, 217 | ) -> VoiceoverTracker: 218 | """Adds voiceover to the scene. 219 | 220 | Args: 221 | text (str): The text to be spoken. 222 | subcaption (Optional[str], optional): Alternative subcaption text. If not specified, `text` is chosen as the subcaption. Defaults to None. 223 | max_subcaption_len (int, optional): Maximum number of characters for a subcaption. Subcaptions that are longer are split into chunks that are smaller than `max_subcaption_len`. Defaults to 70. 224 | subcaption_buff (float, optional): The duration between split subcaption chunks in seconds. Defaults to 0.1. 225 | 226 | Returns: 227 | VoiceoverTracker: The tracker object for the voiceover. 228 | """ 229 | if not hasattr(self, "speech_service"): 230 | raise Exception( 231 | "You need to call init_voiceover() before adding a voiceover." 
232 | ) 233 | 234 | dict_ = self.speech_service._wrap_generate_from_text(text, **kwargs) 235 | tracker = VoiceoverTracker(self, dict_, self.speech_service.cache_dir) 236 | self.add_sound(str(Path(self.speech_service.cache_dir) / dict_["final_audio"])) 237 | self.current_tracker = tracker 238 | 239 | # if self.create_script: 240 | # self.save_to_script_file(text) 241 | 242 | if self.create_subcaption: 243 | if subcaption is None: 244 | subcaption = remove_bookmarks(text) 245 | 246 | self.add_wrapped_subcaption( 247 | subcaption, 248 | tracker.duration, 249 | subcaption_buff=subcaption_buff, 250 | max_subcaption_len=max_subcaption_len, 251 | ) 252 | 253 | return tracker 254 | 255 | def add_wrapped_subcaption( 256 | self, 257 | subcaption: str, 258 | duration: float, 259 | subcaption_buff: float = 0.1, 260 | max_subcaption_len: int = 70, 261 | ) -> None: 262 | """Adds a subcaption to the scene. If the subcaption is longer than `max_subcaption_len`, it is split into chunks that are smaller than `max_subcaption_len`. 263 | 264 | Args: 265 | subcaption (str): The subcaption text. 266 | duration (float): The duration of the subcaption in seconds. 267 | max_subcaption_len (int, optional): Maximum number of characters for a subcaption. Subcaptions that are longer are split into chunks that are smaller than `max_subcaption_len`. Defaults to 70. 268 | subcaption_buff (float, optional): The duration between split subcaption chunks in seconds. Defaults to 0.1. 
269 | """ 270 | subcaption = " ".join(subcaption.split()) 271 | n_chunk = ceil(len(subcaption) / max_subcaption_len) 272 | tokens = subcaption.split(" ") 273 | chunk_len = ceil(len(tokens) / n_chunk) 274 | chunks_ = list(chunks(tokens, chunk_len)) 275 | try: 276 | assert len(chunks_) == n_chunk or len(chunks_) == n_chunk - 1 277 | except AssertionError: 278 | import ipdb 279 | 280 | ipdb.set_trace() 281 | 282 | subcaptions = [" ".join(i) for i in chunks_] 283 | subcaption_weights = [ 284 | len(subcaption) / len("".join(subcaptions)) for subcaption in subcaptions 285 | ] 286 | 287 | current_offset = 0 288 | for idx, subcaption in enumerate(subcaptions): 289 | chunk_duration = duration * subcaption_weights[idx] 290 | self.add_subcaption( 291 | subcaption, 292 | duration=max(chunk_duration - subcaption_buff, 0), 293 | offset=current_offset, 294 | ) 295 | current_offset += chunk_duration 296 | 297 | def add_voiceover_ssml(self, ssml: str, **kwargs) -> None: 298 | raise NotImplementedError("SSML input not implemented yet.") 299 | 300 | # def save_to_script_file(self, text: str) -> None: 301 | # text = " ".join(text.split()) 302 | # # script_file_path = Path(config.get_dir("output_file")).with_suffix(".script.srt") 303 | # with open(SCRIPT_FILE_PATH, "a") as f: 304 | # f.write(text) 305 | # f.write("\n\n") 306 | 307 | def wait_for_voiceover(self) -> None: 308 | """Waits for the voiceover to finish.""" 309 | if not hasattr(self, "current_tracker"): 310 | return 311 | if self.current_tracker is None: 312 | return 313 | 314 | self.safe_wait(self.current_tracker.get_remaining_duration()) 315 | 316 | def safe_wait(self, duration: float) -> None: 317 | """Waits for a given duration. If the duration is less than one frame, it waits for one frame. 318 | 319 | Args: 320 | duration (float): The duration to wait for in seconds. 
321 | """ 322 | fps = manim_config["camera"]["fps"] 323 | if duration > 1 / 30: 324 | self.wait(duration) 325 | 326 | def wait_until_bookmark(self, mark: str) -> None: 327 | """Waits until a bookmark is reached. 328 | 329 | Args: 330 | mark (str): The `mark` attribute of the bookmark to wait for. 331 | """ 332 | if self.current_tracker is None or self.mock: 333 | return 334 | self.safe_wait(self.current_tracker.time_until_bookmark(mark)) 335 | 336 | @contextmanager 337 | def voiceover( 338 | self, text: t.Optional[str] = None, ssml: t.Optional[str] = None, **kwargs 339 | ) -> Generator[VoiceoverTracker, None, None]: 340 | """The main function to be used for adding voiceover to a scene. 341 | 342 | Args: 343 | text (str, optional): The text to be spoken. Defaults to None. 344 | ssml (str, optional): The SSML to be spoken. Defaults to None. 345 | 346 | Yields: 347 | Generator[VoiceoverTracker, None, None]: The voiceover tracker object. 348 | """ 349 | if self.mock: 350 | text = "pass" 351 | start_time = self.time 352 | if text is None and ssml is None: 353 | raise ValueError("Please specify either a voiceover text or SSML string.") 354 | 355 | try: 356 | if self.window is not None and not self.voiceovers_in_embed: 357 | yield VoiceoverTracker(self, "", None, True) 358 | elif text is not None: 359 | yield self.add_voiceover_text(text, **kwargs) 360 | elif ssml is not None: 361 | yield self.add_voiceover_ssml(ssml, **kwargs) 362 | finally: 363 | self.wait_for_voiceover() 364 | self.timestamps.append(f"{start_time},{self.time}") 365 | 366 | def print_timestamps(self): 367 | print(";".join(self.timestamps)) 368 | -------------------------------------------------------------------------------- /matmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 5 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 6 
| { 7 | if (code != cudaSuccess) 8 | { 9 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 10 | if (abort) exit(code); 11 | } 12 | } 13 | 14 | __global__ void matmul_elem(int n, float* a, float* b, float* c) 15 | { 16 | int column = blockIdx.x*blockDim.x + threadIdx.x; 17 | int row = blockIdx.y*blockDim.y + threadIdx.y; 18 | if (row < n && column < n) 19 | { 20 | float dot_prod = 0.f; 21 | for(int i = 0; i < n; i++) 22 | { 23 | dot_prod += a[row*n + i] * b[i*n + column]; 24 | } 25 | c[row*n+column] = dot_prod; 26 | } 27 | } 28 | 29 | __global__ void matmul_elem_onedim(int n, float* a, float* b, float* c) 30 | { 31 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 32 | int row = idx/n; 33 | int column = idx%n; 34 | if (row < n && column < n) 35 | { 36 | float dot_prod = 0.f; 37 | for(int i = 0; i < n; i++) 38 | { 39 | dot_prod += a[row*n + i] * b[i*n + column]; 40 | } 41 | c[row*n+column] = dot_prod; 42 | } 43 | } 44 | 45 | int main() 46 | { 47 | int N = 1024; 48 | int BLOCK_SIZE=32; 49 | float* a = new float[N*N]; 50 | float* b = new float[N*N]; 51 | float* c = new float[N*N]; 52 | float* d = new float[N*N]; 53 | float* e = new float[N*N]; 54 | for (int i = 0; i>>(N, a_d, b_d, c_d); 82 | gpuErrchk(cudaPeekAtLastError()); 83 | gpuErrchk(cudaDeviceSynchronize()); 84 | 85 | cudaMemcpy(c, c_d, N*N*sizeof(float), cudaMemcpyDeviceToHost); 86 | 87 | dim3 dimGrid_r(ceil(N*N/(float)BLOCK_SIZE), 1, 1); 88 | dim3 dimBlock_r(BLOCK_SIZE, 1, 1); 89 | 90 | matmul_elem_onedim<<>>(N, a_d, b_d, c_d); 91 | gpuErrchk(cudaPeekAtLastError()); 92 | gpuErrchk(cudaDeviceSynchronize()); 93 | 94 | cudaMemcpy(d, d_d, N*N*sizeof(float), cudaMemcpyDeviceToHost); 95 | 96 | for (int i = 0; i 2 | #include 3 | #include 4 | #include 5 | 6 | #define TILE_WIDTH 32 7 | #define BENCH_STEPS 3 8 | #define TIMINGS 8 9 | #define START 8 10 | 11 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 12 | #define ASSERT(cond, msg, args...) 
assert((cond) || !fprintf(stderr, (msg "\n"), args)) 13 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 14 | { 15 | if (code != cudaSuccess) 16 | { 17 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 18 | if (abort) exit(code); 19 | } 20 | } 21 | 22 | __global__ void matmul_elem(int n, float* a, float* b, float* c) 23 | { 24 | int column = blockIdx.x*blockDim.x + threadIdx.x; 25 | int row = blockIdx.y*blockDim.y + threadIdx.y; 26 | if (row < n && column < n) 27 | { 28 | float dot_prod = 0.f; 29 | for(int i = 0; i < n; i++) 30 | { 31 | dot_prod += a[row*n + i] * b[i*n + column]; 32 | } 33 | c[row*n+column] = dot_prod; 34 | } 35 | } 36 | 37 | __global__ void tiled_matmul(int n, float* a, float* b, float* c) 38 | { 39 | __shared__ float a_tile[TILE_WIDTH][TILE_WIDTH]; 40 | __shared__ float b_tile[TILE_WIDTH][TILE_WIDTH]; 41 | 42 | int column = blockIdx.x*TILE_WIDTH + threadIdx.x; 43 | int row = blockIdx.y*TILE_WIDTH + threadIdx.y; 44 | 45 | int tx = threadIdx.x; 46 | int ty = threadIdx.y; 47 | 48 | float dot_prod = 0.f; 49 | for (int tile_offset = 0; tile_offset dis(0, 1); // range [0, 1) 75 | return dis(e); 76 | } 77 | 78 | void cpu_matmul(int n, float* a, float* b, float*c) 79 | { 80 | for (int i = 0; i(2, START+TIMINGS-1); 105 | cudaMalloc((void**) &a_d, max_N*max_N*sizeof(float)); 106 | cudaMalloc((void**) &b_d, max_N*max_N*sizeof(float)); 107 | cudaMalloc((void**) &c_d, max_N*max_N*sizeof(float)); 108 | cudaMalloc((void**) &d_d, max_N*max_N*sizeof(float)); 109 | 110 | float* a = new float[max_N * max_N]; 111 | float* b = new float[max_N * max_N]; 112 | float* c = new float[max_N * max_N]; 113 | 114 | for (int p = START; p(2, p); 117 | int BLOCK_SIZE=32; 118 | 119 | dim3 dimGrid(ceil(N/(float)BLOCK_SIZE), ceil(N/(float)BLOCK_SIZE), 1); 120 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1); 121 | 122 | double matmul_time=0.0; 123 | for (int i = -1; i>>(N, a_d, b_d, c_d); 132 | 
gpuErrchk(cudaPeekAtLastError()); 133 | gpuErrchk(cudaDeviceSynchronize()); 134 | double final_time = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time).count(); 135 | if (i != -1) // one warmup run 136 | { 137 | matmul_time += final_time; 138 | } 139 | } 140 | 141 | dimGrid = dim3(ceil(N/(float)TILE_WIDTH), ceil(N/(float)TILE_WIDTH), 1); 142 | dimBlock = dim3(TILE_WIDTH, TILE_WIDTH, 1); 143 | 144 | double tiled_time=0.0; 145 | for (int i = -1; i>>(N, a_d, b_d, d_d); 154 | gpuErrchk(cudaPeekAtLastError()); 155 | gpuErrchk(cudaDeviceSynchronize()); 156 | double final_time = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time).count(); 157 | if (i != -1) // one warmup run 158 | { 159 | tiled_time += final_time; 160 | } 161 | } 162 | 163 | double cpu_time=0.0; 164 | for (int i = -1; i(std::chrono::system_clock::now() - start_time).count(); 173 | if (i != -1) // one warmup run 174 | { 175 | cpu_time += final_time; 176 | } 177 | } 178 | std::cout<<"n = "< 2 | #include 3 | #include 4 | 5 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 6 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 7 | { 8 | if (code != cudaSuccess) 9 | { 10 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 11 | if (abort) exit(code); 12 | } 13 | } 14 | 15 | __global__ void matvec(int n, float* a, float* b, float* c) 16 | { 17 | int col = blockIdx.x*blockDim.x + threadIdx.x; 18 | if (col < n) 19 | { 20 | for(int i = 0; i < n; i++) 21 | { 22 | c[col] += a[i*n + col] * b[col]; 23 | } 24 | } 25 | } 26 | 27 | 28 | int main() 29 | { 30 | int N = 1024; 31 | int BLOCK_SIZE=32; 32 | float* a = new float[N*N]; 33 | float* b = new float[N]; 34 | float* c = new float[N]; 35 | for (int i = 0; i>>(N, a_d, b_d, c_d); 55 | gpuErrchk(cudaPeekAtLastError()); 56 | gpuErrchk(cudaDeviceSynchronize()); 57 | 58 | cudaMemcpy(c, c_d, N*sizeof(float), cudaMemcpyDeviceToHost); 59 | 60 | 61 | for 
(int i = 0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define ASSERT(cond, msg, args...) assert((cond) || !fprintf(stderr, (msg "\n"), args)) 12 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 13 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 14 | { 15 | if (code != cudaSuccess) 16 | { 17 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 18 | if (abort) exit(code); 19 | } 20 | } 21 | 22 | class Timer 23 | { 24 | public: 25 | Timer(std::string in_name) : name(in_name) 26 | { 27 | start_time = std::chrono::system_clock::now(); 28 | } 29 | ~Timer() 30 | { 31 | std::cout<(std::chrono::system_clock::now() - start_time).count()<<" ms"< start_time; 35 | std::string name; 36 | }; 37 | 38 | __global__ void forward(int batch_size, int n, int out_w, float* input, float* weights, float* biases, float* output) 39 | { 40 | int column = blockIdx.x*blockDim.x + threadIdx.x; 41 | int row = blockIdx.y*blockDim.y + threadIdx.y; 42 | if (row < batch_size && column < out_w) 43 | { 44 | output[row*out_w+column] = biases[column]; 45 | for(int i = 0; i < n; i++) 46 | { 47 | output[row*out_w+column] += weights[i*out_w + column] * input[row*n + i]; 48 | } 49 | } 50 | } 51 | 52 | __global__ void backward(int batch_size, int n, int out_w, float* weights, float* biases, float* d_l, float* out_d_l) 53 | { 54 | int column = blockIdx.x*blockDim.x + threadIdx.x; 55 | int row = blockIdx.y*blockDim.y + threadIdx.y; 56 | if (row < batch_size && column < out_w) 57 | { 58 | float dl = 0.f; 59 | for(int i = 0; i < n; i++) 60 | { 61 | float w = weights[i*out_w + column]; 62 | dl += w*d_l[row*n + i]; 63 | } 64 | out_d_l[row*out_w + column] = dl; 65 | } 66 | } 67 | 68 | __global__ void update_layer(int w, int h, int batch_size, float lr, float* weights, float* biases, float* activations, float* d_l) 69 | { 70 | int column = blockIdx.x*blockDim.x 
+ threadIdx.x; 71 | int row = blockIdx.y*blockDim.y + threadIdx.y; 72 | if (row < h && column < w) 73 | { 74 | float dw = 0.f; 75 | float db = 0.f; 76 | for(int i = 0; i < batch_size; i++) 77 | { 78 | float act = activations[i*h + row]; 79 | float dl = d_l[i*w + column]; 80 | dw += act*dl; 81 | db += dl; 82 | } 83 | weights[row*w + column] -= lr * dw / batch_size; 84 | biases[column] -= lr * db / batch_size; 85 | } 86 | } 87 | 88 | __global__ void relu(int w, int h, float* a, float* b) 89 | { 90 | int column = blockIdx.x*blockDim.x + threadIdx.x; 91 | int row = blockIdx.y*blockDim.y + threadIdx.y; 92 | if (row < h && column < w) 93 | { 94 | float activation = a[row*w+column]; 95 | b[row*w+column] = activation > 0.f ? activation : 0.f; 96 | } 97 | } 98 | 99 | __global__ void relu_backwards(int w, int h, float* a, float* d_l, float* b) 100 | { 101 | int column = blockIdx.x*blockDim.x + threadIdx.x; 102 | int row = blockIdx.y*blockDim.y + threadIdx.y; 103 | if (row < h && column < w) 104 | { 105 | float activation = a[row*w+column]; 106 | b[row*w+column] = activation > 0.f ? 
d_l[row*w+column] : 0.f; 107 | } 108 | } 109 | 110 | __global__ void softmax(int w, int h, float* a, float* b) 111 | { 112 | int col = blockIdx.x*blockDim.x + threadIdx.x; 113 | int row = blockIdx.y*blockDim.y + threadIdx.y; 114 | if (row < h && col < w) 115 | { 116 | float maxval = a[row*w]; 117 | for (int i = 1; i>>(w, h, weights); 187 | gpuErrchk(cudaPeekAtLastError()); 188 | 189 | dimGrid = dim3(ceil(h/(float)BLOCK_SIZE), 1, 1); 190 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 191 | init_rand<<>>(1, h, biases); 192 | gpuErrchk(cudaPeekAtLastError()); 193 | } 194 | 195 | void read_mnist(const std::string filename, int length, float* x, float* y) 196 | { 197 | int input_size = 784; 198 | int labels = 10; 199 | 200 | std::fstream fin; 201 | fin.open(filename); 202 | std::string row; 203 | constexpr char delim = ','; 204 | for(int i = 0; i> row; 207 | int pos = row.find(delim); 208 | int label = std::stoi(row.substr(0, pos+1)); 209 | for(int j = 0; j>>(BATCH_SIZE, input_size, size1, input, weights1, biases1, x1); 333 | gpuErrchk(cudaPeekAtLastError()); 334 | 335 | relu<<>>(size1, BATCH_SIZE, x1, a1); 336 | gpuErrchk(cudaPeekAtLastError()); 337 | 338 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 339 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 340 | 341 | forward<<>>(BATCH_SIZE, size1, size2, a1, weights2, biases2, x2); 342 | gpuErrchk(cudaPeekAtLastError()); 343 | 344 | relu<<>>(size2, BATCH_SIZE, x2, a2); 345 | gpuErrchk(cudaPeekAtLastError()); 346 | 347 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 348 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 349 | 350 | forward<<>>(BATCH_SIZE, size2, size3, a2, weights3, biases3, x3); 351 | gpuErrchk(cudaPeekAtLastError()); 352 | 353 | softmax<<>>(size3, BATCH_SIZE, x3, a3); 354 | gpuErrchk(cudaPeekAtLastError()); 355 | 356 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), 1, 1); 357 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 358 | cross_entropy<<>>(size3, 
BATCH_SIZE, a3, labels, loss); 359 | 360 | gpuErrchk(cudaDeviceSynchronize()); 361 | 362 | gpuErrchk(cudaMemcpy(out_h, a3, BATCH_SIZE*size3*sizeof(float), cudaMemcpyDeviceToHost)); 363 | gpuErrchk(cudaMemcpy(loss_h, loss, BATCH_SIZE*sizeof(float), cudaMemcpyDeviceToHost)); 364 | 365 | for (int i = 0; i < BATCH_SIZE; i++) 366 | { 367 | float max_1 = 0.f; 368 | float max_2 = 0.f; 369 | int i1 = 0; 370 | int i2 = 0; 371 | for (int j = 0; j max_1) 374 | { 375 | max_1 = out_h[i*labels_size + j]; 376 | i1 = j; 377 | } 378 | 379 | if (mnist_train_y[batch*BATCH_SIZE*labels_size + i*labels_size + j] > max_2) 380 | { 381 | max_2 = mnist_train_y[batch*BATCH_SIZE*labels_size + i*labels_size + j]; 382 | i2 = j; 383 | } 384 | } 385 | correct += (i1 == i2); 386 | cum_loss += loss_h[i]; 387 | } 388 | 389 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 390 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 391 | 392 | cross_entropy_backwards<<>>(size3, BATCH_SIZE, a3, labels, d_l3); 393 | gpuErrchk(cudaPeekAtLastError()); 394 | 395 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 396 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 397 | 398 | backward<<>>(BATCH_SIZE, size3, size2, weights3, biases3, d_l3, d_l2); 399 | gpuErrchk(cudaPeekAtLastError()); 400 | 401 | relu_backwards<<>>(size2, BATCH_SIZE, a2, d_l2, d_l2); 402 | 403 | dimGrid = dim3(ceil(size1/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 404 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 405 | 406 | backward<<>>(BATCH_SIZE, size2, size1, weights2, biases2, d_l2, d_l1); 407 | gpuErrchk(cudaPeekAtLastError()); 408 | relu_backwards<<>>(size1, BATCH_SIZE, a1, d_l1, d_l1); 409 | 410 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(size2/(float)BLOCK_SIZE), 1); 411 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 412 | update_layer<<>>(size3, size2, BATCH_SIZE, LR, weights3, biases3, a2, d_l3); 413 | dimGrid = 
dim3(ceil(size2/(float)BLOCK_SIZE), ceil(size1/(float)BLOCK_SIZE), 1); 414 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 415 | update_layer<<>>(size2, size1, BATCH_SIZE, LR, weights2, biases2, a1, d_l2); 416 | dimGrid = dim3(ceil(size1/(float)BLOCK_SIZE), ceil(input_size/(float)BLOCK_SIZE), 1); 417 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 418 | update_layer<<>>(size1, input_size, BATCH_SIZE, LR, weights1, biases1, input, d_l1); 419 | 420 | } 421 | float val_loss = 0.f; 422 | int val_correct = 0; 423 | int val_total = 0; 424 | for(int batch = 0; batch>>(BATCH_SIZE, input_size, size1, input, weights1, biases1, x1); 434 | gpuErrchk(cudaPeekAtLastError()); 435 | 436 | relu<<>>(size1, BATCH_SIZE, x1, a1); 437 | gpuErrchk(cudaPeekAtLastError()); 438 | 439 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 440 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 441 | 442 | forward<<>>(BATCH_SIZE, size1, size2, a1, weights2, biases2, x2); 443 | gpuErrchk(cudaPeekAtLastError()); 444 | 445 | relu<<>>(size2, BATCH_SIZE, x2, a2); 446 | gpuErrchk(cudaPeekAtLastError()); 447 | 448 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 449 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 450 | 451 | forward<<>>(BATCH_SIZE, size2, size3, a2, weights3, biases3, x3); 452 | gpuErrchk(cudaPeekAtLastError()); 453 | 454 | softmax<<>>(size3, BATCH_SIZE, x3, a3); 455 | gpuErrchk(cudaPeekAtLastError()); 456 | 457 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), 1, 1); 458 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 459 | cross_entropy<<>>(size3, BATCH_SIZE, a3, labels, loss); 460 | 461 | gpuErrchk(cudaDeviceSynchronize()); 462 | gpuErrchk(cudaMemcpy(out_h, a3, BATCH_SIZE*size3*sizeof(float), cudaMemcpyDeviceToHost)); 463 | gpuErrchk(cudaMemcpy(loss_h, loss, BATCH_SIZE*sizeof(float), cudaMemcpyDeviceToHost)); 464 | 465 | for (int i = 0; i < BATCH_SIZE; i++) 466 | { 467 | float max_1 = 0.f; 468 | float max_2 = 0.f; 469 | int i1 = 0; 
470 | int i2 = 0; 471 | for (int j = 0; j max_1) 474 | { 475 | max_1 = out_h[i*labels_size + j]; 476 | i1 = j; 477 | } 478 | 479 | if (mnist_test_y[batch*BATCH_SIZE*labels_size + i*labels_size + j] > max_2) 480 | { 481 | max_2 = mnist_test_y[batch*BATCH_SIZE*labels_size + i*labels_size + j]; 482 | i2 = j; 483 | } 484 | } 485 | val_correct += (i1 == i2); 486 | val_loss += loss_h[i]; 487 | } 488 | } 489 | 490 | float epoch_time = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time).count(); 491 | total_time += epoch_time; 492 | std::cout<<"epoch "< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define ASSERT(cond, msg, args...) assert((cond) || !fprintf(stderr, (msg "\n"), args)) 14 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 15 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 16 | { 17 | if (code != cudaSuccess) 18 | { 19 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 20 | if (abort) exit(code); 21 | } 22 | } 23 | 24 | class Timer 25 | { 26 | public: 27 | Timer(std::string in_name) : name(in_name) 28 | { 29 | start_time = std::chrono::system_clock::now(); 30 | } 31 | ~Timer() 32 | { 33 | std::cout<(std::chrono::system_clock::now() - start_time).count()/(float)1e6<<" ms"< start_time; 37 | std::string name; 38 | }; 39 | 40 | __global__ void forward(int batch_size, int n, int out_w, float* input, float* weights, float* biases, float* output) 41 | { 42 | int column = blockIdx.x*blockDim.x + threadIdx.x; 43 | int row = blockIdx.y*blockDim.y + threadIdx.y; 44 | if (row < batch_size && column < out_w) 45 | { 46 | output[row*out_w+column] = biases[column]; 47 | for(int i = 0; i < n; i++) 48 | { 49 | output[row*out_w+column] += weights[i*out_w + column] * input[row*n + i]; 50 | } 51 | } 52 | } 53 | 54 | __global__ void forward_relu(int batch_size, int n, int out_w, 
float* input, float* weights, float* biases, float* output) 55 | { 56 | int column = blockIdx.x*blockDim.x + threadIdx.x; 57 | int row = blockIdx.y*blockDim.y + threadIdx.y; 58 | if (row < batch_size && column < out_w) 59 | { 60 | float out = biases[column]; 61 | for(int i = 0; i < n; i++) 62 | { 63 | out += weights[i*out_w + column] * input[row*n + i]; 64 | } 65 | output[row*out_w+column] = out > 0.f ? out : 0.f; 66 | } 67 | } 68 | 69 | __global__ void backward(int batch_size, int n, int out_w, float* weights, float* biases, float* d_l, float* out_d_l, float* activations) 70 | { 71 | int column = blockIdx.x*blockDim.x + threadIdx.x; 72 | int row = blockIdx.y*blockDim.y + threadIdx.y; 73 | if (row < batch_size && column < out_w) 74 | { 75 | float dl = 0.f; 76 | for(int i = 0; i < n; i++) 77 | { 78 | float w = weights[i*out_w + column]; 79 | dl += w*d_l[row*n + i]; 80 | } 81 | float activation = activations[row*out_w+column]; 82 | out_d_l[row*out_w + column] = activation > 0.f ? dl : 0.f; 83 | } 84 | } 85 | 86 | __global__ void update_layer(int w, int h, int batch_size, float lr, float* weights, float* biases, float* activations, float* d_l) 87 | { 88 | int column = blockIdx.x*blockDim.x + threadIdx.x; 89 | int row = blockIdx.y*blockDim.y + threadIdx.y; 90 | if (row < h && column < w) 91 | { 92 | float dw = 0.f; 93 | float db = 0.f; 94 | for(int i = 0; i < batch_size; i++) 95 | { 96 | float act = activations[i*h + row]; 97 | float dl = d_l[i*w + column]; 98 | dw += act*dl; 99 | db += dl; 100 | } 101 | weights[row*w + column] -= lr * dw / batch_size; 102 | biases[column] -= lr * db / batch_size; 103 | } 104 | } 105 | 106 | __global__ void softmax(int w, int h, float* a, float* b) 107 | { 108 | int col = blockIdx.x*blockDim.x + threadIdx.x; 109 | int row = blockIdx.y*blockDim.y + threadIdx.y; 110 | if (row < h && col < w) 111 | { 112 | float maxval = a[row*w]; 113 | for (int i = 1; i>>(w, h, weights); 183 | gpuErrchk(cudaPeekAtLastError()); 184 | 185 | dimGrid = 
dim3(ceil(h/(float)BLOCK_SIZE), 1, 1); 186 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 187 | init_rand<<>>(1, h, biases); 188 | gpuErrchk(cudaPeekAtLastError()); 189 | } 190 | 191 | void read_mnist(std::ifstream& fin, int start, int length, float* x, float* y) 192 | { 193 | constexpr int input_size = 784; 194 | constexpr int labels = 10; 195 | 196 | std::string line; 197 | std::vector buffer(4096); 198 | 199 | for (int i = start; i < start + length; ++i) 200 | { 201 | if (!std::getline(fin, line)) { 202 | throw std::runtime_error("Unexpected end of file"); 203 | } 204 | 205 | std::istringstream ss(line); 206 | 207 | int label; 208 | if (!(ss >> label)) { 209 | throw std::runtime_error("Failed to read label"); 210 | } 211 | 212 | std::memset(y + labels * i, 0, labels * sizeof(float)); 213 | y[labels * i + label] = 1.0f; 214 | 215 | float* x_row = x + i * input_size; 216 | for (int j = 0; j < input_size; ++j) 217 | { 218 | ASSERT(ss.getline(&buffer[0], buffer.size(), ','), "Failed to read pixel value for entry %d, pixel %d", i, j); 219 | x_row[j] = std::strtof(&buffer[0], nullptr) / 255.0f; 220 | } 221 | } 222 | } 223 | 224 | int main(int argc, char** argv) 225 | { 226 | Timer full("full training"); 227 | int test_length = 10000; 228 | int train_length = 60000; 229 | 230 | float* input; 231 | float* labels; 232 | int input_size = 784; 233 | int labels_size = 10; 234 | 235 | int BLOCK_SIZE = 16; 236 | int BATCH_SIZE = 64; 237 | int EPOCHS = 10; 238 | float LR = 0.03f; 239 | 240 | float* mnist_train_x = new float[input_size * train_length]; 241 | float* mnist_train_y = new float[labels_size * train_length]; 242 | 243 | float* mnist_test_x = new float[input_size * test_length]; 244 | float* mnist_test_y = new float[labels_size * test_length]; 245 | std::ifstream train_fin("./mnist_train.csv"); 246 | std::ifstream test_fin("./mnist_test.csv"); 247 | 248 | read_mnist(train_fin, 0, BATCH_SIZE, mnist_train_x, mnist_train_y); 249 | read_mnist(test_fin, 0, BATCH_SIZE, mnist_test_x, 
mnist_test_y); 250 | 251 | int size1 = 300; 252 | float* weights1; 253 | float* biases1; 254 | float* d_l1; 255 | 256 | int size2 = 100; 257 | float* weights2; 258 | float* biases2; 259 | float* d_l2; 260 | 261 | int size3 = 10; 262 | float* weights3; 263 | float* biases3; 264 | float* d_l3; 265 | 266 | 267 | dim3 dimGrid; 268 | dim3 dimBlock; 269 | 270 | float* out_h = new float[BATCH_SIZE*size3]; 271 | float* loss_h = new float[BATCH_SIZE]; 272 | 273 | float *x1; 274 | float *a1; 275 | float *x2; 276 | float *a2; 277 | float *x3; 278 | float *a3; 279 | float* loss; 280 | { 281 | Timer init("initialization"); 282 | gpuErrchk(cudaMalloc((void**) &input, input_size*BATCH_SIZE*sizeof(float))); 283 | gpuErrchk(cudaMalloc((void**) &labels, labels_size*BATCH_SIZE*sizeof(float))); 284 | 285 | gpuErrchk(cudaMalloc((void**) &weights1, size1*input_size*sizeof(float))); 286 | gpuErrchk(cudaMalloc((void**) &biases1, size1*sizeof(float))); 287 | gpuErrchk(cudaMalloc((void**) &d_l1, size1*BATCH_SIZE*sizeof(float))); 288 | initLayer(weights1, biases1, size1, input_size, BLOCK_SIZE); 289 | 290 | gpuErrchk(cudaMalloc((void**) &weights2, size2*size1*sizeof(float))); 291 | gpuErrchk(cudaMalloc((void**) &biases2, size2*sizeof(float))); 292 | gpuErrchk(cudaMalloc((void**) &d_l2, size2*BATCH_SIZE*sizeof(float))); 293 | initLayer(weights2, biases2, size2, size1, BLOCK_SIZE); 294 | 295 | 296 | gpuErrchk(cudaMalloc((void**) &weights3, size3*size2*sizeof(float))); 297 | gpuErrchk(cudaMalloc((void**) &biases3, size3*sizeof(float))); 298 | gpuErrchk(cudaMalloc((void**) &d_l3, size3*BATCH_SIZE*sizeof(float))); 299 | initLayer(weights3, biases3, size3, size2, BLOCK_SIZE); 300 | 301 | gpuErrchk(cudaMalloc((void**) &x1, size1*BATCH_SIZE*sizeof(float))); 302 | gpuErrchk(cudaMalloc((void**) &a1, size1*BATCH_SIZE*sizeof(float))); 303 | 304 | gpuErrchk(cudaMalloc((void**) &x2, size2*BATCH_SIZE*sizeof(float))); 305 | gpuErrchk(cudaMalloc((void**) &a2, size2*BATCH_SIZE*sizeof(float))); 306 | 307 | 
gpuErrchk(cudaMalloc((void**) &x3, size3*BATCH_SIZE*sizeof(float))); 308 | gpuErrchk(cudaMalloc((void**) &a3, size3*BATCH_SIZE*sizeof(float))); 309 | 310 | gpuErrchk(cudaMalloc((void**) &loss, BATCH_SIZE*sizeof(float))); 311 | } 312 | 313 | float total_time = 0.f; 314 | for(int epoch = 0; epoch>>(BATCH_SIZE, input_size, size1, input, weights1, biases1, a1); 330 | gpuErrchk(cudaPeekAtLastError()); 331 | 332 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 333 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 334 | 335 | forward_relu<<>>(BATCH_SIZE, size1, size2, a1, weights2, biases2, a2); 336 | gpuErrchk(cudaPeekAtLastError()); 337 | 338 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 339 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 340 | 341 | forward<<>>(BATCH_SIZE, size2, size3, a2, weights3, biases3, x3); 342 | gpuErrchk(cudaPeekAtLastError()); 343 | 344 | softmax<<>>(size3, BATCH_SIZE, x3, a3); 345 | gpuErrchk(cudaPeekAtLastError()); 346 | 347 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), 1, 1); 348 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 349 | cross_entropy<<>>(size3, BATCH_SIZE, a3, labels, loss); 350 | 351 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 352 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 353 | 354 | cross_entropy_backwards<<>>(size3, BATCH_SIZE, a3, labels, d_l3); 355 | gpuErrchk(cudaPeekAtLastError()); 356 | 357 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 358 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 359 | 360 | backward<<>>(BATCH_SIZE, size3, size2, weights3, biases3, d_l3, d_l2, a2); 361 | gpuErrchk(cudaPeekAtLastError()); 362 | 363 | dimGrid = dim3(ceil(size1/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 364 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 365 | 366 | backward<<>>(BATCH_SIZE, size2, size1, weights2, biases2, d_l2, d_l1, a1); 367 | 
gpuErrchk(cudaPeekAtLastError()); 368 | 369 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(size2/(float)BLOCK_SIZE), 1); 370 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 371 | update_layer<<>>(size3, size2, BATCH_SIZE, LR, weights3, biases3, a2, d_l3); 372 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(size1/(float)BLOCK_SIZE), 1); 373 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 374 | update_layer<<>>(size2, size1, BATCH_SIZE, LR, weights2, biases2, a1, d_l2); 375 | dimGrid = dim3(ceil(size1/(float)BLOCK_SIZE), ceil(input_size/(float)BLOCK_SIZE), 1); 376 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 377 | update_layer<<>>(size1, input_size, BATCH_SIZE, LR, weights1, biases1, input, d_l1); 378 | 379 | if (epoch == 0 && (batch+2)*BATCH_SIZE < train_length) 380 | { 381 | read_mnist(train_fin, (batch+1)*BATCH_SIZE, BATCH_SIZE, mnist_train_x, mnist_train_y); 382 | } 383 | 384 | gpuErrchk(cudaMemcpy(out_h, a3, BATCH_SIZE*size3*sizeof(float), cudaMemcpyDeviceToHost)); 385 | gpuErrchk(cudaMemcpy(loss_h, loss, BATCH_SIZE*sizeof(float), cudaMemcpyDeviceToHost)); 386 | 387 | for (int i = 0; i < BATCH_SIZE; i++) 388 | { 389 | float max_1 = 0.f; 390 | float max_2 = 0.f; 391 | int i1 = 0; 392 | int i2 = 0; 393 | for (int j = 0; j max_1) 396 | { 397 | max_1 = out_h[i*labels_size + j]; 398 | i1 = j; 399 | } 400 | 401 | if (mnist_train_y[batch*BATCH_SIZE*labels_size + i*labels_size + j] > max_2) 402 | { 403 | max_2 = mnist_train_y[batch*BATCH_SIZE*labels_size + i*labels_size + j]; 404 | i2 = j; 405 | } 406 | } 407 | correct += (i1 == i2); 408 | cum_loss += loss_h[i]; 409 | } 410 | 411 | } 412 | float val_loss = 0.f; 413 | int val_correct = 0; 414 | int val_total = 0; 415 | for(int batch = 0; batch>>(BATCH_SIZE, input_size, size1, input, weights1, biases1, a1); 425 | gpuErrchk(cudaPeekAtLastError()); 426 | 427 | dimGrid = dim3(ceil(size2/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 428 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 429 | 430 | 
forward_relu<<>>(BATCH_SIZE, size1, size2, a1, weights2, biases2, a2); 431 | gpuErrchk(cudaPeekAtLastError()); 432 | 433 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), ceil(BATCH_SIZE/(float)BLOCK_SIZE), 1); 434 | dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1); 435 | 436 | forward<<>>(BATCH_SIZE, size2, size3, a2, weights3, biases3, x3); 437 | gpuErrchk(cudaPeekAtLastError()); 438 | 439 | softmax<<>>(size3, BATCH_SIZE, x3, a3); 440 | gpuErrchk(cudaPeekAtLastError()); 441 | 442 | dimGrid = dim3(ceil(size3/(float)BLOCK_SIZE), 1, 1); 443 | dimBlock = dim3(BLOCK_SIZE, 1, 1); 444 | cross_entropy<<>>(size3, BATCH_SIZE, a3, labels, loss); 445 | 446 | if (epoch == 0 && (batch+2)*BATCH_SIZE < test_length) 447 | { 448 | read_mnist(test_fin, (batch+1)*BATCH_SIZE, BATCH_SIZE, mnist_test_x, mnist_test_y); 449 | } 450 | gpuErrchk(cudaDeviceSynchronize()); 451 | gpuErrchk(cudaMemcpy(out_h, a3, BATCH_SIZE*size3*sizeof(float), cudaMemcpyDeviceToHost)); 452 | gpuErrchk(cudaMemcpy(loss_h, loss, BATCH_SIZE*sizeof(float), cudaMemcpyDeviceToHost)); 453 | 454 | for (int i = 0; i < BATCH_SIZE; i++) 455 | { 456 | float max_1 = 0.f; 457 | float max_2 = 0.f; 458 | int i1 = 0; 459 | int i2 = 0; 460 | for (int j = 0; j max_1) 463 | { 464 | max_1 = out_h[i*labels_size + j]; 465 | i1 = j; 466 | } 467 | 468 | if (mnist_test_y[batch*BATCH_SIZE*labels_size + i*labels_size + j] > max_2) 469 | { 470 | max_2 = mnist_test_y[batch*BATCH_SIZE*labels_size + i*labels_size + j]; 471 | i2 = j; 472 | } 473 | } 474 | val_correct += (i1 == i2); 475 | val_loss += loss_h[i]; 476 | } 477 | } 478 | 479 | float epoch_time = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time).count(); 480 | total_time += epoch_time; 481 | std::cout<<"epoch "< 0 else 33 23 | 24 | else: 25 | img = cv2.imread(video_path) 26 | 27 | while cv2.waitKey(frame_delay) != 32: 28 | (x, y, w, h) = cv2.getWindowImageRect("Video Presentation") 29 | if is_video: 30 | ret, frame = cap.read() 31 | else: 32 | ret, frame = True, img 
def convert_videos_fps(video_paths, input_fps=60, output_fps=30, output_dir=None):
    """
    Re-encode each existing video at ``output_fps`` frames per second.

    The result is written next to the source file, or into ``output_dir`` when
    one is given, under the name ``<stem>_<output_fps>fps<ext>``. Paths that do
    not exist are reported and skipped; a failed conversion is reported and
    does not stop the remaining ones.

    Args:
        video_paths (list): Paths of the videos to convert.
        input_fps (int): Accepted for callers' convenience; the function body
            does not read it.
        output_fps (int): Target framerate passed to the ``fps`` filter.
        output_dir (str, optional): Destination directory, created on demand.
            When falsy, each output lands in its source's directory.

    Returns:
        list: Output paths of the videos that were converted successfully.

    Raises:
        RuntimeError: If the ``ffmpeg`` executable cannot be launched.
    """
    # Probe for the ffmpeg binary before touching any file.
    probe_cmd = ['ffmpeg', '-version']
    try:
        subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg before running this function.")

    finished = []

    for source in video_paths:
        if not os.path.exists(source):
            print(f"Warning: File not found - {source}")
            continue

        parent_dir, leaf = os.path.split(source)
        stem, extension = os.path.splitext(leaf)

        # Pick the destination directory, creating the explicit one if needed.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            destination_dir = output_dir
        else:
            destination_dir = parent_dir
        target = os.path.join(destination_dir, f"{stem}_{output_fps}fps{extension}")

        try:
            print(f"Converting {source} to {output_fps} fps...")

            # Build the ffmpeg-python pipeline step by step, then execute it.
            pipeline = ffmpeg.input(source)
            pipeline = pipeline.filter('fps', fps=output_fps)
            pipeline = pipeline.output(target, crf=18)
            pipeline.run(overwrite_output=True, quiet=True)

            print(f"Successfully converted! Output saved to: {target}")
            finished.append(target)

        except ffmpeg.Error as err:
            detail = err.stderr.decode() if err.stderr else str(err)
            print(f"Error converting {source}: {detail}")
        except Exception as err:
            print(f"Unexpected error converting {source}: {str(err)}")

    return finished
"./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_19_30fps.mp4", 131 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_20_30fps.mp4", 132 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_21_30fps.mp4", 133 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_22_30fps.mp4", 134 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_23_30fps.mp4", 135 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_24_30fps.mp4", 136 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_25_30fps.mp4", 137 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_26_30fps.mp4", 138 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_27_30fps.mp4", 139 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_28_30fps.mp4", 140 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_29_30fps.mp4", 141 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_30_30fps.mp4", 142 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_31_30fps.mp4", 143 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_32_30fps.mp4", 144 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_33_30fps.mp4", 145 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_34_30fps.mp4", 146 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_35_30fps.mp4", 147 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_36_30fps.mp4", 148 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_37_30fps.mp4", 149 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_38_30fps.mp4", 150 | "./media/videos/10_Memory_Coalescing/2160p30/Coalescing_segment_39_30fps.mp4", 151 | 152 | "/Users/szymon.ozog/Downloads/front.jpg", 153 | "/Users/szymon.ozog/Downloads/memory.jpg", 154 | 155 | "./media/videos/11_Occupancy/2160p30/Occupancy_segment_1_30fps.mp4", 156 | 
"./media/videos/11_Occupancy/2160p30/Occupancy_segment_2_30fps.mp4", 157 | "./media/videos/11_Occupancy/2160p30/Occupancy_segment_3_30fps.mp4", 158 | "./media/videos/11_Occupancy/2160p30/Occupancy_segment_4_30fps.mp4", 159 | 160 | "./videos/TensorCores_segment_4_30fps.mp4", 161 | 162 | "./videos/HierarchicalTiling_segment_9_30fps.mp4", 163 | "./videos/HierarchicalTiling_segment_10_30fps.mp4", 164 | "./videos/HierarchicalTiling_segment_11_30fps.mp4", 165 | "./videos/HierarchicalTiling_segment_12_30fps.mp4", 166 | "./videos/HierarchicalTiling_segment_13_30fps.mp4", 167 | "./videos/HierarchicalTiling_segment_14_30fps.mp4", 168 | "./videos/HierarchicalTiling_segment_15_30fps.mp4", 169 | "./videos/HierarchicalTiling_segment_16_30fps.mp4", 170 | "./videos/HierarchicalTiling_segment_17_30fps.mp4", 171 | "./videos/HierarchicalTiling_segment_18_30fps.mp4", 172 | 173 | 174 | "./videos/Quantization_segment_1_30fps.mp4", 175 | "./videos/Quantization_segment_2_30fps.mp4", 176 | "./videos/Quantization_segment_3_30fps.mp4", 177 | "./videos/Quantization_segment_4_30fps.mp4", 178 | "./videos/Quantization_segment_5_30fps.mp4", 179 | "./videos/Quantization_segment_6_30fps.mp4", 180 | "./videos/Quantization_segment_7_30fps.mp4", 181 | "./videos/Quantization_segment_8_30fps.mp4", 182 | "./videos/Quantization_segment_9_30fps.mp4", 183 | "./videos/Quantization_segment_10_30fps.mp4", 184 | "./videos/Quantization_segment_11_30fps.mp4", 185 | "./videos/Quantization_segment_12_30fps.mp4", 186 | "./videos/Quantization_segment_13_30fps.mp4", 187 | "./videos/Quantization_segment_14_30fps.mp4", 188 | 189 | "./videos/MoE_segment_2_30fps.mp4", 190 | "./videos/MoE_segment_3_30fps.mp4", 191 | "./videos/MoE_segment_4_30fps.mp4", 192 | 193 | ] 194 | # convert_videos_fps(video_paths) 195 | play_videos(video_paths) 196 | 197 | -------------------------------------------------------------------------------- /utils/split_video.py: 
import argparse
import os
from typing import List, Tuple
import subprocess

try:
    import ffmpeg
except ImportError:
    # Keep the module importable so main() can print its installation hint
    # instead of the script dying with a traceback at import time.
    ffmpeg = None


def extract_video_segment(video_path: str, start_time: float, end_time: float, output_path: str) -> None:
    """
    Extract a segment from a video file based on start and end timestamps using ffmpeg.

    The streams are copied (``c='copy'``), not re-encoded. Failures are printed
    rather than raised so that one bad segment does not abort the others.

    Args:
        video_path: Path to the input video file
        start_time: Start time in seconds
        end_time: End time in seconds
        output_path: Path to save the output video segment
    """
    if ffmpeg is None:
        print(f"Error creating segment {output_path}: ffmpeg-python package is not installed")
        return

    try:
        # ffmpeg takes a start offset plus a duration, not an end time.
        duration = end_time - start_time

        (
            ffmpeg
            .input(video_path, ss=start_time, t=duration)
            .output(output_path, c='copy')
            .run(capture_stdout=True, capture_stderr=True)
        )

        print(f"Successfully created segment: {output_path}")
    except ffmpeg.Error as e:
        # stderr can be missing on the exception; fall back to its text form.
        detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print(f"Error creating segment {output_path}: {detail}")
    except Exception as e:
        print(f"Error creating segment {output_path}: {str(e)}")


def split_video(video_path: str, timestamps: List[Tuple[float, float]]) -> None:
    """
    Split a video into multiple segments based on a list of timestamp tuples.

    Segment ``k`` (1-based) is written next to the input as
    ``<input stem>_segment_<k>.mp4``; an existing file of that name is removed
    first. Pairs whose start is not before their end are skipped with a warning.

    Args:
        video_path: Path to the input video file
        timestamps: Sequence of (start_time, end_time) tuples in seconds
    """
    if not os.path.exists(video_path):
        print(f"Error: Video file '{video_path}' not found")
        return

    # Outputs are named after the input and placed in its directory.
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    output_dir = os.path.dirname(os.path.abspath(video_path))

    for i, (start_time, end_time) in enumerate(timestamps):
        if start_time >= end_time:
            print(f"Warning: Skipping segment {i+1} - start time must be less than end time")
            continue

        output_filename = f"{base_name}_segment_{i+1}.mp4"
        output_path = os.path.join(output_dir, output_filename)
        if os.path.exists(output_path):
            print("removing ", output_path)
            os.unlink(output_path)

        extract_video_segment(video_path, start_time, end_time, output_path)


def parse_timestamp_tuple(timestamp_str: str) -> Tuple[Tuple[float, float], ...]:
    """
    Parse a ';'-separated list of "start,end" pairs into a tuple of float pairs.

    Args:
        timestamp_str: One or more "start,end" pairs in seconds, separated by
            ';' (e.g., "10.5,25.0" or "10,30;60,90"). Whitespace around the
            numbers is ignored.

    Returns:
        Tuple of (start_time, end_time) float pairs, in input order.

    Raises:
        argparse.ArgumentTypeError: If any pair does not consist of exactly two
            numbers. Raising this type lets argparse report the problem as a
            usage error when the function is used as a ``type=`` converter.
    """
    pairs = []
    try:
        for chunk in timestamp_str.split(";"):
            start, end = chunk.split(',')
            pairs.append((float(start.strip()), float(end.strip())))
    except ValueError as exc:
        raise argparse.ArgumentTypeError(
            f"Invalid timestamp format: {timestamp_str}. Use 'start,end' format (e.g., '10.5,25.0')"
        ) from exc
    return tuple(pairs)


def check_ffmpeg_installed():
    """Return True when the ``ffmpeg`` executable can be launched, else False."""
    try:
        subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except FileNotFoundError:
        return False


def main():
    """Command-line entry point: parse arguments, check prerequisites, split the video."""
    parser = argparse.ArgumentParser(description='Split a video into segments based on timestamps')
    parser.add_argument('video_path', type=str, help='Path to the input video file')
    parser.add_argument(
        'timestamps',
        # Converting here makes argparse turn malformed input into a normal
        # usage error (exit status 2) instead of an uncaught exception.
        type=parse_timestamp_tuple,
        help='Semicolon-separated start,end pairs in seconds, passed as one quoted argument. '
             'Example: "10,30;60,90"'
    )

    args = parser.parse_args()

    # Check if ffmpeg is installed
    if not check_ffmpeg_installed():
        print("Error: ffmpeg is not installed or not in the PATH.")
        print("Please install ffmpeg before running this script.")
        return

    # Check if ffmpeg-python is installed
    if ffmpeg is None:
        print("Error: ffmpeg-python package is not installed.")
        print("Please install it with: pip install ffmpeg-python")
        return

    split_video(args.video_path, args.timestamps)


if __name__ == "__main__":
    main()
/**
 * Report a failed CUDA runtime call and optionally terminate the process.
 *
 * @param code  Status returned by the CUDA call.
 * @param file  Source file of the call site (supplied by the gpuErrchk macro).
 * @param line  Source line of the call site.
 * @param abort When true, exit with `code` as the process status on failure.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**
 * Overwrite a scratch buffer twice the size of the L2 cache so that data left
 * there by a previous kernel is evicted before the next timed run.
 *
 * The buffer is allocated on the first call and reused (never freed) by later
 * calls; only the memset runs every time.
 */
void clear_l2() {
    // Get actual L2 size via CUDA on first call of this function
    static int l2_clear_size = 0;
    static unsigned char* gpu_scratch_l2_clear = NULL;
    if (!gpu_scratch_l2_clear) {
        // Query the device the benchmark actually runs on and check both
        // calls: a silent failure would leave the size at 0 and turn the
        // flush into a no-op.
        int device = 0;
        gpuErrchk(cudaGetDevice(&device));
        gpuErrchk(cudaDeviceGetAttribute(&l2_clear_size, cudaDevAttrL2CacheSize, device));
        l2_clear_size *= 2; // just to be extra safe (cache is not necessarily strict LRU)
        gpuErrchk(cudaMalloc(&gpu_scratch_l2_clear, l2_clear_size));
    }
    // Clear L2 cache (this is run on every call unlike the above code)
    gpuErrchk(cudaMemset(gpu_scratch_l2_clear, 0, l2_clear_size));
}

using datatype = float;
using datatype_vec = float4;

/**
 * One element per thread copy: out[i] = in[i] for i < n.
 * Launch: 1D grid with at least n threads in total; surplus threads do nothing.
 */
__global__ void copy(const int n , const datatype* __restrict__ in, datatype* __restrict__ out)
{
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    // A negative n must mean "nothing to copy", not a huge unsigned bound.
    if (n > 0 && i < (size_t)n)
    {
        out[i] = in[i];
    }
}

/**
 * One float4 per thread copy: out[i] = in[i] for i < n, where n counts float4
 * elements. Both pointers must satisfy float4 (16-byte) alignment.
 * Launch: 1D grid with at least n threads in total.
 */
__global__ void copyh2(const int n , const datatype_vec* __restrict__ in, datatype_vec* __restrict__ out)
{
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (n > 0 && i < (size_t)n)
    {
        out[i] = in[i];
    }
}

/**
 * Copy n elements with a loop that advances by the total thread count, so any
 * 1D launch configuration covers the whole range.
 */
__global__ void copy_loop(const unsigned int n , const datatype* __restrict__ in, datatype* __restrict__ out)
{
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    // 64-bit index: a signed 32-bit one wraps for n above 2^31 and the loop
    // would then index out of bounds or never terminate.
    for (size_t idx = first; idx < n; idx += stride)
    {
        out[idx] = in[idx];
    }
}

/**
 * Same copy as copy_loop, but each iteration handles four elements that are
 * one total-thread-count apart, advancing by four such strides per iteration.
 * Each of the four accesses is bounds-checked, so n does not have to be a
 * multiple of 4 * gridDim.x * blockDim.x.
 */
__global__ void copy_loop_unrolled(const unsigned int n , const datatype* __restrict__ in, datatype* __restrict__ out)
{
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = first; idx < n; idx += 4 * stride)
    {
        #pragma unroll
        for (int k = 0; k < 4; ++k)
        {
            const size_t j = idx + k * stride;
            // Only idx itself is guarded by the loop condition; the three
            // look-ahead elements can fall past the end on the last pass.
            if (j < n)
            {
                out[j] = in[j];
            }
        }
    }
}

/**
 * float4 variant of copy_loop; n counts float4 elements and both pointers
 * must satisfy float4 (16-byte) alignment.
 */
__global__ void copy_loop_float4(const unsigned int n , const datatype_vec* __restrict__ in, datatype_vec* __restrict__ out)
{
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = first; idx < n; idx += stride)
    {
        out[idx] = in[idx];
    }
}

/**
 * Per-thread partial sums: thread t adds in[t], in[t + T], in[t + 2T], ...
 * (T = total thread count) and stores the result in out[t].
 *
 * Precondition: `out` holds at least gridDim.x * blockDim.x elements, because
 * every launched thread writes its slot (0 when it has no input element).
 */
__global__ void reduce(const unsigned int n, const datatype* __restrict__ in, datatype* __restrict__ out)
{
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    float reduction = 0.f;
    #pragma unroll
    for (size_t idx = tid; idx < n; idx += stride)
    {
        reduction += in[idx];
    }
    out[tid] = reduction;
}

/**
 * float4 variant of reduce: each thread accumulates the four lanes
 * independently and stores one float4 in out[t]. n counts float4 elements.
 *
 * Precondition: `out` holds at least gridDim.x * blockDim.x float4 elements
 * and both pointers satisfy float4 (16-byte) alignment.
 */
__global__ void reduce_float4(const unsigned int n, const datatype_vec* __restrict__ in, datatype_vec* __restrict__ out)
{
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    float4 reduction = make_float4(0.f, 0.f, 0.f, 0.f);
    #pragma unroll
    for (size_t idx = tid; idx < n; idx += stride)
    {
        const float4 val = in[idx];
        reduction.x += val.x;
        reduction.y += val.y;
        reduction.z += val.z;
        reduction.w += val.w;
    }
    out[tid] = reduction;
}
&in_d, N*sizeof(datatype)); 135 | datatype* cp = new datatype[N]; 136 | for (int i = 0; i < N; i++) 137 | { 138 | cp[i] = (float)N; 139 | } 140 | cudaMemcpy(in_d, cp, N*sizeof(datatype), cudaMemcpyHostToDevice); 141 | 142 | cudaMalloc((void**) &out_d, N*sizeof(datatype)); 143 | cudaMemset(out_d, 0, N*sizeof(datatype)); 144 | 145 | cudaMalloc((void**) &out2_d, N*sizeof(datatype)); 146 | cudaMemset(out2_d, 0, N*sizeof(datatype)); 147 | 148 | cudaMalloc((void**) &out_red_d, N*sizeof(datatype)); 149 | cudaMemset(out_red_d, 0, N/reduction_factor*sizeof(datatype)); 150 | 151 | cudaMalloc((void**) &out_red2_d, N*sizeof(datatype)); 152 | cudaMemset(out_red2_d, 0, N/reduction_factor*sizeof(datatype)); 153 | float time = 0.f; 154 | double run_time = 0.0; 155 | for (int i = -WARMUP_STEPS; i>>(N, in_d, out_d); 161 | gpuErrchk(cudaEventRecord(stop)); 162 | gpuErrchk(cudaEventSynchronize(stop)); 163 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 164 | gpuErrchk(cudaPeekAtLastError()); 165 | gpuErrchk(cudaDeviceSynchronize()); 166 | if (i >= 0) // warmup 167 | { 168 | run_time += time / BENCH_STEPS; 169 | } 170 | } 171 | 172 | std::cout<<"regular time "<>>(N/VEC_RATIO, reinterpret_cast(in_d), reinterpret_cast(out_d)); 184 | gpuErrchk(cudaEventRecord(stop)); 185 | gpuErrchk(cudaEventSynchronize(stop)); 186 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 187 | gpuErrchk(cudaPeekAtLastError()); 188 | gpuErrchk(cudaDeviceSynchronize()); 189 | if (i >= 0) // warmup 190 | { 191 | run_time += time / BENCH_STEPS; 192 | } 193 | } 194 | 195 | std::cout<<"vectorized time "<>>(N, in_d, out2_d); 208 | gpuErrchk(cudaEventRecord(stop)); 209 | gpuErrchk(cudaEventSynchronize(stop)); 210 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 211 | gpuErrchk(cudaPeekAtLastError()); 212 | gpuErrchk(cudaDeviceSynchronize()); 213 | if (i >= 0) // warmup 214 | { 215 | run_time += time / BENCH_STEPS; 216 | } 217 | } 218 | 219 | datatype* out_h = new datatype[N]; 220 | datatype* out2_h = 
new datatype[N]; 221 | dimGrid.x/=4; 222 | cudaMemcpy(out_h, out_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 223 | cudaMemcpy(out2_h, out2_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 224 | for (int i = 0; i < N; i++) 225 | { 226 | ASSERT(out_h[i] == out2_h[i], "failed at copy loop %d, %f, %f\n", i, (float)out_h[i], (float)out2_h[i]); 227 | } 228 | std::cout<<"loop time "<>>(N, in_d, out2_d); 238 | gpuErrchk(cudaEventRecord(stop)); 239 | gpuErrchk(cudaEventSynchronize(stop)); 240 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 241 | gpuErrchk(cudaPeekAtLastError()); 242 | gpuErrchk(cudaDeviceSynchronize()); 243 | if (i >= 0) // warmup 244 | { 245 | run_time += time / BENCH_STEPS; 246 | } 247 | } 248 | 249 | cudaMemcpy(out_h, out_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 250 | cudaMemcpy(out2_h, out2_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 251 | for (int i = 0; i < N; i++) 252 | { 253 | ASSERT(out_h[i] == out2_h[i], "failed at copy loop unrolled %d, %f, %f\n", i, (float)out_h[i], (float)out2_h[i]); 254 | } 255 | 256 | std::cout<<"loop time unrolled "<>>(N/VEC_RATIO, reinterpret_cast(in_d), reinterpret_cast(out2_d)); 266 | gpuErrchk(cudaEventRecord(stop)); 267 | gpuErrchk(cudaEventSynchronize(stop)); 268 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 269 | gpuErrchk(cudaPeekAtLastError()); 270 | gpuErrchk(cudaDeviceSynchronize()); 271 | if (i >= 0) // warmup 272 | { 273 | run_time += time / BENCH_STEPS; 274 | } 275 | } 276 | std::cout<<"loop time vectorized "<>>(N, in_d, out_red_d); 295 | gpuErrchk(cudaEventRecord(stop)); 296 | gpuErrchk(cudaEventSynchronize(stop)); 297 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 298 | gpuErrchk(cudaPeekAtLastError()); 299 | gpuErrchk(cudaDeviceSynchronize()); 300 | if (i >= 0) // warmup 301 | { 302 | run_time += time / BENCH_STEPS; 303 | } 304 | } 305 | 306 | std::cout<<"reduce time "<>>(N/VEC_RATIO, reinterpret_cast(in_d), reinterpret_cast(out_red2_d)); 317 | 
gpuErrchk(cudaEventRecord(stop)); 318 | gpuErrchk(cudaEventSynchronize(stop)); 319 | gpuErrchk(cudaEventElapsedTime(&time, start, stop)); 320 | gpuErrchk(cudaPeekAtLastError()); 321 | gpuErrchk(cudaDeviceSynchronize()); 322 | if (i >= 0) // warmup 323 | { 324 | run_time += time / BENCH_STEPS; 325 | } 326 | } 327 | 328 | datatype* out_red_h = new datatype[N]; 329 | datatype* out_red2_h = new datatype[N]; 330 | cudaMemcpy(out_red_h, out_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 331 | cudaMemcpy(out_red2_h, out2_d, N*sizeof(datatype), cudaMemcpyDeviceToHost); 332 | for (int i = 0; i < N; i++) 333 | { 334 | ASSERT(out_red_h[i] == out_red2_h[i], "failed at reduce float4 %d, %f, %f\n", i, (float)out_red_h[i], (float)out_red2_h[i]); 335 | } 336 | 337 | std::cout<<"reduce time float4 "<