├── .gitignore
├── FastAtomicAdd
    ├── Makefile
    ├── README.md
    ├── atomic_add_half.cu
    ├── atomic_add_half_pack2.cu
    └── fast_atomic_add_half.cu
├── FasterTransformer
    ├── README_BERT.md
    └── README_GPT.md
├── README.md
├── UpsampleNearest2D
    ├── Makefile
    ├── README.md
    └── upsample_nearest_2d.cu
├── apex
    ├── README.md
    ├── layer_norm_cuda.cpp
    └── layer_norm_cuda_kernel.cu
├── cuda-mode
    ├── .DS_Store
    ├── CUDA-MODE 第一课课后实战（上）.md
    ├── CUDA-MODE 第一课课后实战（下）.md
    ├── GPU 计算与编程模型演进：异步计算编程中的吞吐与延迟平衡.md
    ├── GPU内存系统演进：最大化带宽利用与延迟隐藏技术路径.md
    ├── Lecture 1 How to profile CUDA kernels in PyTorch.md
    ├── Lecture 11 GPU Sparsity.md
    ├── Lecture 12 Flash Attention.md
    ├── Lecture 13 Ring Attention.md
    ├── Lecture 14 Triton 入门教程.md
    ├── Lecture 15 CUTLASS.md
    ├── Lecture 16 通过CUDA C++核心库把llm.c移植为llm.cpp.md
    ├── Lecture 17 GPU集合通信(NCCL).md
    ├── Lecture 2 Ch1-3 PMPP book.md
    ├── Lecture 20 Scan Algorithm.md
    ├── Lecture 28 LinkedIn Liger kernel.md
    ├── Lecture 29 Triton Internals.md
    ├── Lecture 4 Ch4-5 PMPP book.md
    ├── Lecture 40: CUDA Docs for Humans.md
    ├── Lecture 53 torch.compile Q&A.md
    ├── Lecture 6 Optimizing Optimizer.md
    ├── Lecture 7 Quantization Cuda vs Triton.md
    ├── Lecture 76: BackendBench 修复大模型kernel正确性问题.md
    ├── Lecture 77: Domain Specific Languages for GPU Kernels.md
    ├── Lecture 8 CUDA Performance Checklist.md
    ├── Lecture 9 Reductions.md
    ├── Lei Mao的blog
    │   ├── 【博客转载】Build and Develop CUTLASS CUDA Kernels.md
    │   ├── 【博客转载】C++ Data Alignment.md
    │   ├── 【博客转载】CPU Cache False Sharing.md
    │   ├── 【博客转载】CUDA Coalesced Memory Access.md
    │   ├── 【博客转载】CUDA Compatibility.md
    │   ├── 【博客转载】CUDA Constant Memory.md
    │   ├── 【博客转载】CUDA Cooperative Groups.md
    │   ├── 【博客转载】CUDA Data Alignment.md
    │   ├── 【博客转载】CUDA Default Stream.md
    │   ├── 【博客转载】CUDA Kernel Execution Overlap.md
    │   ├── 【博客转载】CUDA L2 Persistent Cache.md
    │   ├── 【博客转载】CUDA Local Memory.md
    │   ├── 【博客转载】CUDA Matrix Multiplication Optimization.md
    │   ├── 【博客转载】CUDA Occupancy Calculation.md
    │   ├── 【博客转载】CUDA Performance Hot VS Cold Measurement.md
    │   ├── 【博客转载】CUDA Reduction.md
    │   ├── 【博客转载】CUDA Shared Memory Bank.md
    │   ├── 【博客转载】CUDA Shared Memory Capacity.md
    │   ├── 【博客转载】CUDA Shared Memory Swizzling.md
    │   ├── 【博客转载】CUDA Vectorized Memory Access.md
    │   ├── 【博客转载】CUDA Zero Copy Mapped Memory.md
    │   ├── 【博客转载】CuTe Layout Algebra.md
    │   ├── 【博客转载】Load CUDA Kernel at Runtime Using CUDA Driver APIs.md
    │   ├── 【博客转载】NVIDIA Docker CUDA Compatibility && Nsight Compute In Docker.md
    │   ├── 【博客转载】NVIDIA GPU Compute Capability.md
    │   ├── 【博客转载】NVIDIA Tensor Core Programming.md
    │   ├── 【博客转载】Row-Major VS Column-Major.md
    │   └── 【博客转载】cuBLAS GEMM API Usages for Column-Major and Row-Major Matrices.md
    ├── README.md
    ├── cudabmk
    │   ├── Makefile
    │   ├── Makefile-cmem
    │   ├── Makefile-diverge
    │   ├── Makefile-global
    │   ├── Makefile-icache1
    │   ├── Makefile-icache2
    │   ├── Makefile-icache3
    │   ├── Makefile-icache4
    │   ├── Makefile-shared
    │   ├── Makefile-sync
    │   ├── Makefile-texture2
    │   ├── Makefile-texture4
    │   ├── build_cubin
    │   ├── clock.cu
    │   ├── cmem.cu
    │   ├── common.mk
    │   ├── defines.mk
    │   ├── diverge.cu
    │   ├── diverge2.cu
    │   ├── diverge3.cu
    │   ├── empty.cu
    │   ├── global.cu
    │   ├── icache.cu
    │   ├── icache2.cu
    │   ├── icache2_1.cu
    │   ├── icache2_2.cu
    │   ├── icache2_ibuffer.cu
    │   ├── icache3.cu
    │   ├── icache3_1.cu
    │   ├── icache3_2.cu
    │   ├── icache3_kernel.h
    │   ├── icache4.cu
    │   ├── icache4_L1.cu
    │   ├── icache_kernels.h
    │   ├── icache_kernels1.cu
    │   ├── icache_kernels2.cu
    │   ├── icache_kernels3.cu
    │   ├── icache_kernels4.cu
    │   ├── instructions.h
    │   ├── ksync_uint_dep128.cu
    │   ├── ksync_uint_dep128.real_cubin
    │   ├── ksync_uint_dep128.real_ptx
    │   ├── main.cpp
    │   ├── morerules.mk
    │   ├── path.cu
    │   ├── pipeline.cu
    │   ├── regfile.cu
    │   ├── regfile.real_cubin
    │   ├── repeat.h
    │   ├── shared.cu
    │   ├── sync.cu
    │   ├── sync2.cu
    │   ├── texture2.cu
    │   └── texture4.cu
    ├── ppt
    │   ├── .DS_Store
    │   ├── CUDA Docs for Humans.pdf
    │   ├── CUDA MODE_ Liger Kernel.pdf
    │   ├── CUDAMODE_2024_Optimizing_optimizers.pptx
    │   ├── Chapter_11_-_Prefix_Sum_Scan_-_Part_1 .pptx.pdf
    │   ├── Lecture 29 presentation.pdf
    │   ├── Lecture 8_ CUDA Performance.pptx
    │   ├── Lecture10.pdf
    │   ├── Lecture10.pptx
    │   ├── Quantization Cuda vs Triton.pdf
    │   ├── Speaking_Tensor_Cores_CUTLASS_2024.pdf
    │   ├── cuda_mode_lecture2.pptx
    │   ├── lecture_017
    │   │   ├── README.md
    │   │   ├── ddp_example.py
    │   │   ├── ddp_simple.py
    │   │   └── slides.pdf
    │   ├── llmcccl.pdf
    │   ├── llmcccl.pptx
    │   ├── ring_attention.pptx
    │   └── sparsity.pptx
    ├── 【CUDA博客】通过查看GPU Assembly分析CUDA程序.md
    ├── 【博客翻译】CUDA中的Indexing.md
    ├── 【博客翻译】Cutlass中的预测.md
    ├── 【博客翻译】TMA简介 & 让矩阵转置在Hopper GPUs上变得更快.md
    ├── 【博客翻译】使用PTX指令更高效的读写矩阵.md
    ├── 【博客翻译】关于TensorCore和Inline PTX Assembly的简短笔记.md
    ├── 【博客翻译】在SGLang中使用reasoning模型.md
    ├── 【博客翻译】让Prefix Sum变得更快.md
    ├── 【博客翻译】让RMSNorm变得更快.md
    ├── 【博客翻译】连接数学和代码：CuTeDSL中的 CuTe Layout代数.md
    ├── 如何正确理解NVIDIA GPU利用率的概念.md
    └── 简单了解下CUDA Green Context.md
├── cuda-paper
    ├── README.md
    └── 通过微基准测试(Microbenchmarking)和指令级分析(Instruction-level Analysis)揭秘英伟达Ampere架构.md
├── cutlass
    ├── CUTLASS 2.x & CUTLASS 3.x Intro 学习笔记.md
    ├── CUTLASS Tutorial: Mastering the NVIDIA® Tensor Memory Accelerator (TMA).md
    ├── README.md
    ├── TensorRT-LLM 中的 Hopper Mixed GEMM 的 CUTLASS 3.x 实现讲解.md
    ├── TensorRT-LLM中的 Quantization GEMM（Ampere Mixed GEMM）的 CUTLASS 2.x 实现讲解.md
    ├── Tutorial: Matrix Transpose in CUTLASS.md
    ├── Tutorial: Python bindings for CUDA libraries in PyTorch.md
    ├── Tutorial: 在Hopper GPU上使用WGMMA的快速矩阵乘.md
    ├── cfx-article-src
    │   ├── .gitmodules
    │   ├── README.md
    │   ├── cutlass_gemm
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── cutlass_gemm
    │   │   │   ├── cutlass_gemm.cu
    │   │   │   ├── cutlass_gemm.hpp
    │   │   │   └── setup.py
    │   │   └── gemm.py
    │   ├── evt
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── evt_gemm_cute.cu
    │   │   ├── node_types.md
    │   │   └── reference.h
    │   ├── include
    │   │   └── utils
    │   │   │   └── cuda_launch.hpp
    │   ├── pipeline-gemm
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── hopper-gemm-ws
    │   │   │   ├── convert_util.h
    │   │   │   ├── epilogue_sm90_tma_ws.hpp
    │   │   │   ├── hopper_gemm_kernel.h
    │   │   │   ├── hopper_gemm_kernel_launch.h
    │   │   │   ├── kernel_traits.h
    │   │   │   ├── mainloop_sm90_tma_gmma_ws.hpp
    │   │   │   └── tile_scheduler.hpp
    │   │   ├── sm90_gemm_multistage.cu
    │   │   └── sm90_gemm_ws.cu
    │   ├── streamk
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── cli_options.h
    │   │   ├── convert_util.h
    │   │   ├── cutlass_streamk
    │   │   │   ├── Makefile
    │   │   │   ├── benchmark.cu
    │   │   │   └── profile.sh
    │   │   ├── epilogue_sm90_tma_ws.hpp
    │   │   ├── hopper_gemm_kernel.h
    │   │   ├── hopper_gemm_kernel_launch.h
    │   │   ├── kernel_traits.h
    │   │   ├── mainloop_sm90_tma_gmma_ws.hpp
    │   │   ├── streamk.cu
    │   │   └── tile_scheduler.hpp
    │   ├── tma
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── main.cu
    │   │   ├── scale_tma_kernel.h
    │   │   ├── shared_storage.h
    │   │   ├── smem_helper.hpp
    │   │   ├── tma_copy.h
    │   │   └── tma_copy_multicast.h
    │   └── transpose-cute
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── include
    │   │       ├── copy.h
    │   │       ├── shared_storage.h
    │   │       ├── smem_helper.hpp
    │   │       ├── transpose_naive.h
    │   │       ├── transpose_smem.h
    │   │       ├── transpose_tmastore_vectorized.h
    │   │       └── util.h
    │   │   ├── main.cu
    │   │   ├── python
    │   │       ├── copy_cute.cu
    │   │       ├── setup.py
    │   │       └── transpose_cute.cu
    │   │   └── torch_benchmark.py
    ├── cuda的ldmatrix指令的详细解释.md
    ├── cute
    │   ├── README.md
    │   ├── gemm-simple.cu
    │   ├── s2r_copy.cu
    │   └── vector_add.cu
    ├── swizzle
    │   ├── .clang-format
    │   ├── .clangd
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README.md
    │   ├── assets
    │   │   └── 16x16res.png
    │   ├── doc
    │   │   ├── assets
    │   │   │   ├── addr_permute.png
    │   │   │   ├── addr_permute_16x64.png
    │   │   │   ├── blog2
    │   │   │   │   ├── eg2.png
    │   │   │   │   ├── sol1.1.png
    │   │   │   │   ├── sol1.2.png
    │   │   │   │   ├── sol2.1.1.png
    │   │   │   │   ├── sol2.1.png
    │   │   │   │   ├── swizzle_code.png
    │   │   │   │   ├── swizzle_xor.png
    │   │   │   │   └── xor_addr_trans.png
    │   │   │   ├── frag_a_cf.png
    │   │   │   ├── frag_a_layout.png
    │   │   │   ├── load_frag_a.png
    │   │   │   ├── ncu_global.png
    │   │   │   ├── ncu_shared.png
    │   │   │   ├── swizzle_16x32.png
    │   │   │   ├── swizzle_addr.png
    │   │   │   ├── swizzle_res.png
    │   │   │   ├── swizzle_shift.png
    │   │   │   └── swizzle_xor.png
    │   │   ├── blog1.md
    │   │   ├── blog2.md
    │   │   └── draw
    │   │   │   ├── example.excalidraw
    │   │   │   └── xor_swizzle.excalidraw
    │   ├── metrics.txt
    │   └── src
    │   │   ├── main.cu
    │   │   ├── mma.cuh
    │   │   ├── ptx.cuh
    │   │   ├── swizzle.cuh
    │   │   ├── test.cuh
    │   │   └── utils.cuh
    ├── 实用Swizzle教程.md
    ├── 详解awq cuda kernel.md
    └── 详解vLLM和SGLang awq dequantize kernel的魔法.md
├── elementwise
    ├── Makefile
    ├── README.md
    └── elementwise.cu
├── gemv
    ├── ComplexHalfGemv.cu
    ├── README.md
    ├── cuHalfComplex.cuh
    ├── sgemv_v0.cu
    ├── sgemv_v1.cu
    └── sgemv_v2.cu
├── how-to-complie-pytorch-from-source
    └── compile_pytorch_from_source.md
├── indexing
    ├── README.md
    ├── index_add_cuda_oneflow_impl.cu
    ├── index_add_cuda_pytorch_impl.cu
    └── indexing_pytorch_explain.cu
├── large-language-model-note
    ├── Cache-Dit 学习笔记.md
    ├── DeepEP：介绍与最佳实践.md
    ├── FlashInfer集成TensorRT-LLM cubin kernel技术分析.md
    ├── Fused AllGather_MatMul Triton工程实现.md
    ├── GTX 4090 的 Attention 诡异.md
    ├── GTX 4090 的 cuda graph 诡异.md
    ├── MoE之年的总结和MoE 推理优化的一些认识.md
    ├── NVIDIA技术沙龙 《大规模EP优化：PD分离MoE并行方式》课程笔记.md
    ├── README.md
    ├── RMSNorm的精度陷阱：记一次LLM推理精度调查.md
    ├── SGLang DP MLA 特性解读.md
    ├── SGLang MLA 实现解析.md
    ├── Zero.md
    ├── f0f25684d13445198120eba971d3f105.png
    ├── inference
    │   ├── hf_inference_cuda_graph.py
    │   ├── hf_inference_cuda_graph_reuse_kv_cache.py
    │   └── hf_prefill_decode_split_inference.py
    ├── kimi k2 thinking SGLang优化.md
    ├── moe align block size kernel的优化源码.md
    ├── moe_align_block_size_kernel优化详解.md
    ├── sglang&lightllm
    │   ├── DeepSeek-V3 + SGLang: 推理优化 (v0.4.3.post2+sgl-kernel:0.0.3.post6).md
    │   ├── Implement Flash Attention Backend in SGLang - Basics and KV Cache.md
    │   ├── LMCache + SGLang Feature 解读.md
    │   ├── SGLang 2025-8月性能优化技巧记录.md
    │   ├── SGLang 2025-9月性能优化技巧记录.md
    │   ├── SGLang v4.8.0 PD分离性能研究.md
    │   ├── SGLang 优化Triton FusedMoE 的一个新技巧​.md
    │   ├── SGLang 支持Flash Attention V3 Backend.md
    │   ├── SGLang 确定性推理实现技术细节笔记.md
    │   ├── SGLang支持Gpt-Oss性能优化博客.md
    │   ├── SGLang的Expert Parallel特性解读.md
    │   ├── SM100 NVFP4 Quant实现笔记.md
    │   ├── bench_one_batch.py
    │   ├── 【DistServe 博客翻译】通过PD分离实现LLM服务中的最大有效吞吐量.md
    │   ├── 一个Dispatch Dtype引起的fp8 quant kernel性能问题.md
    │   ├── 分享一个DeepSeek V3和R1中 Shared Experts和普通Experts融合的小技巧.md
    │   ├── 单机H200最快DeepSeek V3和R1推理系统优化秘籍.md
    │   ├── 图解DeepSeek V3 biased_grouped_topk cuda融合算子fused_moe_gate kernel.md
    │   └── 记录下SGLang 开发，编译和Profile的几个小技巧.md
    ├── vLLM PIECEWISE CUDA Graph 技术学习笔记.md
    ├── vllm-p2p_nccl_connector_zh.md
    ├── 从DeepSeek V3.2 DSA算子看TileLang编译器的细节.md
    ├── 使用NCU和Cursor Claude-sonnet-3.5写出高效cuda算子的正确姿势.md
    ├── 在 Cache-DiT 框架下实现原生 Serving功能.md
    ├── 强化学习
    │   ├── 20250519.pdf
    │   ├── NVIDIA技术沙龙《强化学习流水线优化：性能分析与 Rollout加速》演讲笔记.md
    │   ├── verl 源码解读 与 HybridFlow 编程范式讲解.md
    │   └── verl: Flexible and Efficient RL for LLMs.md
    ├── 梳理下Flash Attention的dispatch逻辑.md
    └── 面试.md
├── linear-attention
    ├── README.md
    └── causal_product_cuda.cu
├── meagtron-lm
    ├── descriptions.py
    ├── distributed_init.py
    ├── distributed_init_sw.py
    └── playground.py
├── ml-engineering
    ├── 【ml-engineering 翻译系列】AI系统中的网络 benchmark.md
    ├── 【ml-engineering 翻译系列】AI系统中的网络 debug.md
    ├── 【ml-engineering 翻译系列】AI系统中的网络概述.md
    ├── 【ml-engineering 翻译系列】gpu debug.md
    ├── 【ml-engineering 翻译系列】加速器 benchmarks.md
    ├── 【ml-engineering 翻译系列】大模型推理.md
    ├── 【ml-engineering 翻译系列】如何写测试.md
    ├── 【ml-engineering 翻译系列】计算加速器之cpu.md
    ├── 【ml-engineering 翻译系列】计算加速器之gpu.md
    ├── 【ml-engineering 翻译系列】训练之 performance.md
    └── 【ml-engineering 翻译系列】训练之模型并行综述.md
├── mlsys-paper
    └── README.md
├── nvidia-channel
    ├── NVIDIA AI 加速精讲堂-TensorRT-LLM 应用与部署.md
    └── NVIDIA AI 加速精讲堂-TensorRT-LLM量化原理、实现与优化.md
├── oneflow-cuda-optimize-skills
    ├── README.md
    ├── fused_attention
    │   └── FusedMultiHeadAttentionInferenceV2.cpp
    ├── fused_softmax
    │   ├── Makefile
    │   ├── README.md
    │   └── fused_scale_mask_softmax.cu
    └── symbol_explain
    │   ├── README.md
    │   └── symbol.h
├── ptx-isa
    ├── ptx_isa_8.5.pdf
    └── ptx_isa_8.5_notes.md
├── pytorch-blogs-codes
    ├──     _symmetric_memory.py
    ├── Accelerating Generative AI Part III: Diffusion, Fast.md
    ├── Deploying LLMs with TorchServe + vLLM.md
    ├── FP8 GEMM 的 Hopper TMA 单元深入探讨.md
    ├── FlashAttention-3：异步低精度快速准确注意力.md
    ├── FlashInfer API速览.md
    ├── Flex Attention API 应用 Notebook 代码速览.md
    ├── INT4 解码 GQA CUDA 优化以用于 LLM 推理.md
    ├── Optimized GPTQ INT4 Dequantization Triton Kernels o1-preview 解读.md
    ├── PyTorch 原生结构优化：torchao.md
    ├── README.md
    ├── Triton Kernel编译阶段.md
    ├── assets
    │   ├── flash2_a100_fwd_bwd_benchmark.png
    │   ├── flash2_h100_fwd_bwd_benchmark.png
    │   ├── flash3_fp16_fwd.png
    │   ├── flashattention_logo.png
    │   ├── flashattn_banner.jpg
    │   ├── flashattn_banner.pdf
    │   ├── flashattn_memory.jpg
    │   ├── flashattn_speedup.jpg
    │   ├── flashattn_speedup_3090.jpg
    │   ├── flashattn_speedup_a100_d128.jpg
    │   ├── flashattn_speedup_t4.jpg
    │   ├── flashattn_speedup_t4_fwd.jpg
    │   ├── gpt2_training_curve.jpg
    │   ├── gpt2_training_efficiency.jpg
    │   ├── gpt3_training_curve.jpg
    │   └── gpt3_training_efficiency.jpg
    ├── float8
    │   ├── README.md
    │   ├── __init__.py
    │   ├── config.py
    │   ├── distributed_utils.py
    │   ├── float8_linear.py
    │   ├── float8_linear_utils.py
    │   ├── float8_ops.py
    │   ├── float8_python_api.py
    │   ├── float8_scaling_utils.py
    │   ├── float8_tensor.py
    │   ├── float8_tensor_parallel.py
    │   ├── float8_utils.py
    │   ├── fsdp_utils.py
    │   ├── inference.py
    │   └── roofline_utils.py
    ├── triton_all_gather_matmul.py
    ├── 【PyTorch 奇技淫巧】介绍 depyf：轻松掌握 torch.compile.md
    ├── 【PyTorch 奇淫技巧】Async Checkpoint Save.md
    ├── 【PyTorch 奇淫技巧】Python Custom Operators翻译.md
    ├── 【博客翻译】Presenting Flux Fast: 让 Flux 在 H100 上疾速飞驰.md
    ├── 【翻译】CUDA-Free Inference for LLMs.md
    ├── 【翻译】Flash Attention使用指南.md
    ├── 【翻译】MetaShuffling：加速LLama4 MoE推理.md
    ├── 【翻译】depyf 相关的开发文档和教程.md
    ├── 【翻译】torch.compile 的详细示例解析教程.md
    ├── 【翻译】【PyTorch 奇技淫巧】FlexAttetion 基于Triton打造灵活度拉满的Attention.md
    ├── 【翻译】一文了解PyTorch中的Async Tensor Parallelism.md
    ├── 【翻译】不使用checkpoint再加载来做容错训练.md
    ├── 【翻译】使用PyTorch DCP优化模型Checkpointing效率.md
    ├── 【翻译】使用PyTorch FSDP和Torch.compile最大化训练吞吐量.md
    ├── 【翻译】使用PyTorch FSDP最大化训练吞吐量.md
    ├── 【翻译】使用PyTorch加速生成式AI-4-Seamless M4T fast.md
    ├── 【翻译】在 GPU 上如何加速 GPTQ Triton 反量化kernel.md
    ├── 【翻译】在FSDP2中开启Float8 All-Gather.md
    ├── 使用 PyTorch 加速生成式 AI 之 GPT Fast.md
    ├── 使用 PyTorch 加速生成式 AI 之 Segment Anything Fast.md
    ├── 使用 Triton kernel 加速 Llama3 FP8 推理.md
    ├── 使用 Triton 加速 2D 动态块量化 Float8 GEMM.md
    ├── 使用float8和FSDP2加速训练.md
    ├── 使用torchtune把LLaMa-3.1 8B蒸馏为1B.md
    ├── 深入 CUTLASS Ping-Pong GEMM Kernel.md
    └── 深入探讨 Hopper TMA 单元在 FP8 GEMM 运算中的应用.md
├── reduce
    ├── Makefile
    ├── README.md
    ├── pytorch_block_reduce.cu
    ├── reduce_v0_baseline.cu
    ├── reduce_v1_interleaved_addressing.cu
    ├── reduce_v2_bank_conflict_free.cu
    ├── reduce_v3_idle_threads_free.cu
    ├── reduce_v4_unroll_last_warp.cu
    ├── reduce_v5_completely_unroll.cu
    ├── reduce_v6_multi_add.cu
    ├── reduce_v7_shfl_down_sync.cu
    ├── reduce_v8_shfl_down_sync_pack.cu
    └── 【CUDA博客】Making vector sum really fast.md
├── softmax
    ├── Makefile
    ├── README.md
    ├── faster_transformer_softmax.cu
    └── oneflow_softmax.cu
├── tests
    └── test_flash_attenton.py
├── tools
    ├── Cursor 平替 Windsurf 使用体验.md
    ├── README.md
    └── hfd.sh
├── triton-meetup
    ├── 3_天数GPU上Triton适配及共享内存管理机制.pptx
    ├── FlagGems研发进展概览-0905.pptx
    ├── README.md
    ├── Triton中国生态Meetup(第一期).pdf
    ├── 优化 softmax-陈飞宇终稿.pptx
    └── 硅基流动-朱平-MoE-v2.pptx
└── triton
    ├── README.md
    ├── attention_in_pytorch.py
    ├── benchmark_layernorm.py
    ├── flash_attention_v1_in_pytorch.py
    ├── flash_attention_v2_in_pytorch.py
    ├── flash_attention_v2_in_triton.py
    └── layernorm.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | bin/
3 | *.ncu-rep
4 | flagged/
5 | __pycache__/
6 | .DS_Store
7 | 


--------------------------------------------------------------------------------
/FastAtomicAdd/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FastAtomicAdd/Makefile


--------------------------------------------------------------------------------
/FastAtomicAdd/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FastAtomicAdd/README.md


--------------------------------------------------------------------------------
/FastAtomicAdd/atomic_add_half.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FastAtomicAdd/atomic_add_half.cu


--------------------------------------------------------------------------------
/FastAtomicAdd/atomic_add_half_pack2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FastAtomicAdd/atomic_add_half_pack2.cu


--------------------------------------------------------------------------------
/FastAtomicAdd/fast_atomic_add_half.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FastAtomicAdd/fast_atomic_add_half.cu


--------------------------------------------------------------------------------
/FasterTransformer/README_BERT.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FasterTransformer/README_BERT.md


--------------------------------------------------------------------------------
/FasterTransformer/README_GPT.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/FasterTransformer/README_GPT.md


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/README.md


--------------------------------------------------------------------------------
/UpsampleNearest2D/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/UpsampleNearest2D/Makefile


--------------------------------------------------------------------------------
/UpsampleNearest2D/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/UpsampleNearest2D/README.md


--------------------------------------------------------------------------------
/UpsampleNearest2D/upsample_nearest_2d.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/UpsampleNearest2D/upsample_nearest_2d.cu


--------------------------------------------------------------------------------
/apex/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/apex/README.md


--------------------------------------------------------------------------------
/apex/layer_norm_cuda.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/apex/layer_norm_cuda.cpp


--------------------------------------------------------------------------------
/apex/layer_norm_cuda_kernel.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/apex/layer_norm_cuda_kernel.cu


--------------------------------------------------------------------------------
/cuda-mode/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/.DS_Store


--------------------------------------------------------------------------------
/cuda-mode/CUDA-MODE 第一课课后实战（上）.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/CUDA-MODE 第一课课后实战（上）.md


--------------------------------------------------------------------------------
/cuda-mode/CUDA-MODE 第一课课后实战（下）.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/CUDA-MODE 第一课课后实战（下）.md


--------------------------------------------------------------------------------
/cuda-mode/GPU 计算与编程模型演进：异步计算编程中的吞吐与延迟平衡.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/GPU 计算与编程模型演进：异步计算编程中的吞吐与延迟平衡.md


--------------------------------------------------------------------------------
/cuda-mode/GPU内存系统演进：最大化带宽利用与延迟隐藏技术路径.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/GPU内存系统演进：最大化带宽利用与延迟隐藏技术路径.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 1 How to profile CUDA kernels in PyTorch.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 1 How to profile CUDA kernels in PyTorch.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 11 GPU Sparsity.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 11 GPU Sparsity.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 12 Flash Attention.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 12 Flash Attention.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 13 Ring Attention.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 13 Ring Attention.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 14 Triton 入门教程.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 14 Triton 入门教程.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 15 CUTLASS.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 15 CUTLASS.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 16 通过CUDA C++核心库把llm.c移植为llm.cpp.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 16 通过CUDA C++核心库把llm.c移植为llm.cpp.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 17 GPU集合通信(NCCL).md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 17 GPU集合通信(NCCL).md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 2 Ch1-3 PMPP book.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 2 Ch1-3 PMPP book.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 20 Scan Algorithm.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 20 Scan Algorithm.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 28 LinkedIn Liger kernel.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 28 LinkedIn Liger kernel.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 29 Triton Internals.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 29 Triton Internals.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 4 Ch4-5 PMPP book.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 4 Ch4-5 PMPP book.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 40: CUDA Docs for Humans.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 40: CUDA Docs for Humans.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 53 torch.compile Q&A.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 53 torch.compile Q&A.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 6 Optimizing Optimizer.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 6 Optimizing Optimizer.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 7 Quantization Cuda vs Triton.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 7 Quantization Cuda vs Triton.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 76: BackendBench 修复大模型kernel正确性问题.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 76: BackendBench 修复大模型kernel正确性问题.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 77: Domain Specific Languages for GPU Kernels.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 77: Domain Specific Languages for GPU Kernels.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 8 CUDA Performance Checklist.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 8 CUDA Performance Checklist.md


--------------------------------------------------------------------------------
/cuda-mode/Lecture 9 Reductions.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lecture 9 Reductions.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】Build and Develop CUTLASS CUDA Kernels.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】Build and Develop CUTLASS CUDA Kernels.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】C++ Data Alignment.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】C++ Data Alignment.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CPU Cache False Sharing.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CPU Cache False Sharing.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Coalesced Memory Access.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Coalesced Memory Access.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Compatibility.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Compatibility.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Constant Memory.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Constant Memory.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Cooperative Groups.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Cooperative Groups.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Data Alignment.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Data Alignment.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Default Stream.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Default Stream.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Kernel Execution Overlap.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Kernel Execution Overlap.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA L2 Persistent Cache.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA L2 Persistent Cache.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Local Memory.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Local Memory.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Matrix Multiplication Optimization.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Matrix Multiplication Optimization.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Occupancy Calculation.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Occupancy Calculation.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Performance Hot VS Cold Measurement.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Performance Hot VS Cold Measurement.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Reduction.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Reduction.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Bank.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Bank.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Capacity.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Capacity.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Swizzling.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Shared Memory Swizzling.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Vectorized Memory Access.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Vectorized Memory Access.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CUDA Zero Copy Mapped Memory.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CUDA Zero Copy Mapped Memory.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】CuTe Layout Algebra.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】CuTe Layout Algebra.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】Load CUDA Kernel at Runtime Using CUDA Driver APIs.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】Load CUDA Kernel at Runtime Using CUDA Driver APIs.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA Docker CUDA Compatibility && Nsight Compute In Docker.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA Docker CUDA Compatibility && Nsight Compute In Docker.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA GPU Compute Capability.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA GPU Compute Capability.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA Tensor Core Programming.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】NVIDIA Tensor Core Programming.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】Row-Major VS Column-Major.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】Row-Major VS Column-Major.md


--------------------------------------------------------------------------------
/cuda-mode/Lei Mao的blog/【博客转载】cuBLAS GEMM API Usages for Column-Major and Row-Major Matrices.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/Lei Mao的blog/【博客转载】cuBLAS GEMM API Usages for Column-Major and Row-Major Matrices.md


--------------------------------------------------------------------------------
/cuda-mode/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/README.md


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-cmem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-cmem


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-diverge:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-diverge


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-global:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-global


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-icache1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-icache1


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-icache2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-icache2


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-icache3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-icache3


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-icache4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-icache4


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-shared:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-shared


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-sync:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-sync


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-texture2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-texture2


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/Makefile-texture4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/Makefile-texture4


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/build_cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/build_cubin


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/clock.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/clock.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/cmem.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/cmem.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/common.mk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/common.mk


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/defines.mk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/defines.mk


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/diverge.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/diverge.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/diverge2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/diverge2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/diverge3.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/diverge3.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/empty.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/empty.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/global.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/global.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache2_1.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache2_1.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache2_2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache2_2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache2_ibuffer.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache2_ibuffer.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache3.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache3.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache3_1.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache3_1.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache3_2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache3_2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache3_kernel.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache3_kernel.h


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache4.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache4.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache4_L1.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache4_L1.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache_kernels.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache_kernels.h


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache_kernels1.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache_kernels1.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache_kernels2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache_kernels2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache_kernels3.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache_kernels3.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/icache_kernels4.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/icache_kernels4.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/instructions.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/instructions.h


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/ksync_uint_dep128.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/ksync_uint_dep128.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/ksync_uint_dep128.real_cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/ksync_uint_dep128.real_cubin


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/ksync_uint_dep128.real_ptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/ksync_uint_dep128.real_ptx


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/main.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/main.cpp


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/morerules.mk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/morerules.mk


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/path.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/path.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/pipeline.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/pipeline.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/regfile.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/regfile.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/regfile.real_cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/regfile.real_cubin


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/repeat.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/repeat.h


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/shared.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/shared.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/sync.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/sync.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/sync2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/sync2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/texture2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/texture2.cu


--------------------------------------------------------------------------------
/cuda-mode/cudabmk/texture4.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/cudabmk/texture4.cu


--------------------------------------------------------------------------------
/cuda-mode/ppt/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/.DS_Store


--------------------------------------------------------------------------------
/cuda-mode/ppt/CUDA Docs for Humans.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/CUDA Docs for Humans.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/CUDA MODE_ Liger Kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/CUDA MODE_ Liger Kernel.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/CUDAMODE_2024_Optimizing_optimizers.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/CUDAMODE_2024_Optimizing_optimizers.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/Chapter_11_-_Prefix_Sum_Scan_-_Part_1 .pptx.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Chapter_11_-_Prefix_Sum_Scan_-_Part_1 .pptx.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/Lecture 29 presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Lecture 29 presentation.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/Lecture 8_ CUDA Performance.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Lecture 8_ CUDA Performance.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/Lecture10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Lecture10.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/Lecture10.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Lecture10.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/Quantization Cuda vs Triton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Quantization Cuda vs Triton.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/Speaking_Tensor_Cores_CUTLASS_2024.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/Speaking_Tensor_Cores_CUTLASS_2024.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/cuda_mode_lecture2.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/cuda_mode_lecture2.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/lecture_017/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/lecture_017/README.md


--------------------------------------------------------------------------------
/cuda-mode/ppt/lecture_017/ddp_example.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/lecture_017/ddp_example.py


--------------------------------------------------------------------------------
/cuda-mode/ppt/lecture_017/ddp_simple.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/lecture_017/ddp_simple.py


--------------------------------------------------------------------------------
/cuda-mode/ppt/lecture_017/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/lecture_017/slides.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/llmcccl.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/llmcccl.pdf


--------------------------------------------------------------------------------
/cuda-mode/ppt/llmcccl.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/llmcccl.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/ring_attention.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/ring_attention.pptx


--------------------------------------------------------------------------------
/cuda-mode/ppt/sparsity.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/ppt/sparsity.pptx


--------------------------------------------------------------------------------
/cuda-mode/【CUDA博客】通过查看GPU Assembly分析CUDA程序.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【CUDA博客】通过查看GPU Assembly分析CUDA程序.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】CUDA中的Indexing.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】CUDA中的Indexing.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】Cutlass中的预测.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】Cutlass中的预测.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】TMA简介 & 让矩阵转置在Hopper GPUs上变得更快.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】TMA简介 & 让矩阵转置在Hopper GPUs上变得更快.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】使用PTX指令更高效的读写矩阵.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】使用PTX指令更高效的读写矩阵.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】关于TensorCore和Inline PTX Assembly的简短笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】关于TensorCore和Inline PTX Assembly的简短笔记.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】在SGLang中使用reasoning模型.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】在SGLang中使用reasoning模型.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】让Prefix Sum变得更快.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】让Prefix Sum变得更快.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】让RMSNorm变得更快.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】让RMSNorm变得更快.md


--------------------------------------------------------------------------------
/cuda-mode/【博客翻译】连接数学和代码：CuTeDSL中的 CuTe Layout代数.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/【博客翻译】连接数学和代码：CuTeDSL中的 CuTe Layout代数.md


--------------------------------------------------------------------------------
/cuda-mode/如何正确理解NVIDIA GPU利用率的概念.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/如何正确理解NVIDIA GPU利用率的概念.md


--------------------------------------------------------------------------------
/cuda-mode/简单了解下CUDA Green Context.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-mode/简单了解下CUDA Green Context.md


--------------------------------------------------------------------------------
/cuda-paper/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-paper/README.md


--------------------------------------------------------------------------------
/cuda-paper/通过微基准测试(Microbenchmarking)和指令级分析(Instruction-level Analysis)揭秘英伟达Ampere架构.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cuda-paper/通过微基准测试(Microbenchmarking)和指令级分析(Instruction-level Analysis)揭秘英伟达Ampere架构.md


--------------------------------------------------------------------------------
/cutlass/CUTLASS 2.x & CUTLASS 3.x Intro 学习笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/CUTLASS 2.x & CUTLASS 3.x Intro 学习笔记.md


--------------------------------------------------------------------------------
/cutlass/CUTLASS Tutorial: Mastering the NVIDIA® Tensor Memory Accelerator (TMA).md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/CUTLASS Tutorial: Mastering the NVIDIA® Tensor Memory Accelerator (TMA).md


--------------------------------------------------------------------------------
/cutlass/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/README.md


--------------------------------------------------------------------------------
/cutlass/TensorRT-LLM 中的 Hopper Mixed GEMM 的 CUTLASS 3.x 实现讲解.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/TensorRT-LLM 中的 Hopper Mixed GEMM 的 CUTLASS 3.x 实现讲解.md


--------------------------------------------------------------------------------
/cutlass/TensorRT-LLM中的 Quantization GEMM（Ampere Mixed GEMM）的 CUTLASS 2.x 实现讲解.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/TensorRT-LLM中的 Quantization GEMM（Ampere Mixed GEMM）的 CUTLASS 2.x 实现讲解.md


--------------------------------------------------------------------------------
/cutlass/Tutorial: Matrix Transpose in CUTLASS.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/Tutorial: Matrix Transpose in CUTLASS.md


--------------------------------------------------------------------------------
/cutlass/Tutorial: Python bindings for CUDA libraries in PyTorch.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/Tutorial: Python bindings for CUDA libraries in PyTorch.md


--------------------------------------------------------------------------------
/cutlass/Tutorial: 在Hopper GPU上使用WGMMA的快速矩阵乘.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/Tutorial: 在Hopper GPU上使用WGMMA的快速矩阵乘.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/.gitmodules:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/.gitmodules


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/cutlass_gemm.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/cutlass_gemm.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/cutlass_gemm.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/cutlass_gemm.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/setup.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/cutlass_gemm/setup.py


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/cutlass_gemm/gemm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/cutlass_gemm/gemm.py


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/evt/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/evt/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/evt/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/evt/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/evt/evt_gemm_cute.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/evt/evt_gemm_cute.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/evt/node_types.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/evt/node_types.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/evt/reference.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/evt/reference.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/include/utils/cuda_launch.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/include/utils/cuda_launch.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/convert_util.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/convert_util.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/epilogue_sm90_tma_ws.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/epilogue_sm90_tma_ws.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/hopper_gemm_kernel.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/hopper_gemm_kernel.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/hopper_gemm_kernel_launch.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/hopper_gemm_kernel_launch.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/kernel_traits.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/kernel_traits.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/mainloop_sm90_tma_gmma_ws.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/mainloop_sm90_tma_gmma_ws.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/tile_scheduler.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/hopper-gemm-ws/tile_scheduler.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/sm90_gemm_multistage.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/sm90_gemm_multistage.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/pipeline-gemm/sm90_gemm_ws.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/pipeline-gemm/sm90_gemm_ws.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/cli_options.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/cli_options.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/convert_util.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/convert_util.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/cutlass_streamk/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/cutlass_streamk/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/cutlass_streamk/benchmark.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/cutlass_streamk/benchmark.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/cutlass_streamk/profile.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/cutlass_streamk/profile.sh


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/epilogue_sm90_tma_ws.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/epilogue_sm90_tma_ws.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/hopper_gemm_kernel.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/hopper_gemm_kernel.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/hopper_gemm_kernel_launch.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/hopper_gemm_kernel_launch.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/kernel_traits.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/kernel_traits.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/mainloop_sm90_tma_gmma_ws.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/mainloop_sm90_tma_gmma_ws.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/streamk.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/streamk.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/streamk/tile_scheduler.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/streamk/tile_scheduler.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/main.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/main.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/scale_tma_kernel.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/scale_tma_kernel.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/shared_storage.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/shared_storage.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/smem_helper.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/smem_helper.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/tma_copy.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/tma_copy.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/tma/tma_copy_multicast.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/tma/tma_copy_multicast.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/Makefile


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/README.md


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/copy.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/copy.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/shared_storage.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/shared_storage.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/smem_helper.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/smem_helper.hpp


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/transpose_naive.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/transpose_naive.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/transpose_smem.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/transpose_smem.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/transpose_tmastore_vectorized.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/transpose_tmastore_vectorized.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/include/util.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/include/util.h


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/main.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/main.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/python/copy_cute.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/python/copy_cute.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/python/setup.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/python/setup.py


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/python/transpose_cute.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/python/transpose_cute.cu


--------------------------------------------------------------------------------
/cutlass/cfx-article-src/transpose-cute/torch_benchmark.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cfx-article-src/transpose-cute/torch_benchmark.py


--------------------------------------------------------------------------------
/cutlass/cuda的ldmatrix指令的详细解释.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cuda的ldmatrix指令的详细解释.md


--------------------------------------------------------------------------------
/cutlass/cute/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cute/README.md


--------------------------------------------------------------------------------
/cutlass/cute/gemm-simple.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cute/gemm-simple.cu


--------------------------------------------------------------------------------
/cutlass/cute/s2r_copy.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cute/s2r_copy.cu


--------------------------------------------------------------------------------
/cutlass/cute/vector_add.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/cute/vector_add.cu


--------------------------------------------------------------------------------
/cutlass/swizzle/.clang-format:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/.clang-format


--------------------------------------------------------------------------------
/cutlass/swizzle/.clangd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/.clangd


--------------------------------------------------------------------------------
/cutlass/swizzle/.gitignore:
--------------------------------------------------------------------------------
1 | *cache*
2 | build/


--------------------------------------------------------------------------------
/cutlass/swizzle/CMakeLists.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/CMakeLists.txt


--------------------------------------------------------------------------------
/cutlass/swizzle/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/LICENSE


--------------------------------------------------------------------------------
/cutlass/swizzle/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/Makefile


--------------------------------------------------------------------------------
/cutlass/swizzle/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/README.md


--------------------------------------------------------------------------------
/cutlass/swizzle/assets/16x16res.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/assets/16x16res.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/addr_permute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/addr_permute.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/addr_permute_16x64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/addr_permute_16x64.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/eg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/eg2.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/sol1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/sol1.1.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/sol1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/sol1.2.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/sol2.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/sol2.1.1.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/sol2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/sol2.1.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/swizzle_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/swizzle_code.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/swizzle_xor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/swizzle_xor.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/blog2/xor_addr_trans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/blog2/xor_addr_trans.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/frag_a_cf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/frag_a_cf.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/frag_a_layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/frag_a_layout.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/load_frag_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/load_frag_a.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/ncu_global.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/ncu_global.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/ncu_shared.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/ncu_shared.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/swizzle_16x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/swizzle_16x32.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/swizzle_addr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/swizzle_addr.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/swizzle_res.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/swizzle_res.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/swizzle_shift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/swizzle_shift.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/assets/swizzle_xor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/assets/swizzle_xor.png


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/blog1.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/blog1.md


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/blog2.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/blog2.md


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/draw/example.excalidraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/draw/example.excalidraw


--------------------------------------------------------------------------------
/cutlass/swizzle/doc/draw/xor_swizzle.excalidraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/doc/draw/xor_swizzle.excalidraw


--------------------------------------------------------------------------------
/cutlass/swizzle/metrics.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/metrics.txt


--------------------------------------------------------------------------------
/cutlass/swizzle/src/main.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/main.cu


--------------------------------------------------------------------------------
/cutlass/swizzle/src/mma.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/mma.cuh


--------------------------------------------------------------------------------
/cutlass/swizzle/src/ptx.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/ptx.cuh


--------------------------------------------------------------------------------
/cutlass/swizzle/src/swizzle.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/swizzle.cuh


--------------------------------------------------------------------------------
/cutlass/swizzle/src/test.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/test.cuh


--------------------------------------------------------------------------------
/cutlass/swizzle/src/utils.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/swizzle/src/utils.cuh


--------------------------------------------------------------------------------
/cutlass/实用Swizzle教程.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/实用Swizzle教程.md


--------------------------------------------------------------------------------
/cutlass/详解awq cuda kernel.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/详解awq cuda kernel.md


--------------------------------------------------------------------------------
/cutlass/详解vLLM和SGLang awq dequantize kernel的魔法.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/cutlass/详解vLLM和SGLang awq dequantize kernel的魔法.md


--------------------------------------------------------------------------------
/elementwise/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/elementwise/Makefile


--------------------------------------------------------------------------------
/elementwise/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/elementwise/README.md


--------------------------------------------------------------------------------
/elementwise/elementwise.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/elementwise/elementwise.cu


--------------------------------------------------------------------------------
/gemv/ComplexHalfGemv.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/ComplexHalfGemv.cu


--------------------------------------------------------------------------------
/gemv/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/README.md


--------------------------------------------------------------------------------
/gemv/cuHalfComplex.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/cuHalfComplex.cuh


--------------------------------------------------------------------------------
/gemv/sgemv_v0.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/sgemv_v0.cu


--------------------------------------------------------------------------------
/gemv/sgemv_v1.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/sgemv_v1.cu


--------------------------------------------------------------------------------
/gemv/sgemv_v2.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/gemv/sgemv_v2.cu


--------------------------------------------------------------------------------
/how-to-complie-pytorch-from-source/compile_pytorch_from_source.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/how-to-complie-pytorch-from-source/compile_pytorch_from_source.md


--------------------------------------------------------------------------------
/indexing/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/indexing/README.md


--------------------------------------------------------------------------------
/indexing/index_add_cuda_oneflow_impl.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/indexing/index_add_cuda_oneflow_impl.cu


--------------------------------------------------------------------------------
/indexing/index_add_cuda_pytorch_impl.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/indexing/index_add_cuda_pytorch_impl.cu


--------------------------------------------------------------------------------
/indexing/indexing_pytorch_explain.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/indexing/indexing_pytorch_explain.cu


--------------------------------------------------------------------------------
/large-language-model-note/Cache-Dit 学习笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/Cache-Dit 学习笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/DeepEP：介绍与最佳实践.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/DeepEP：介绍与最佳实践.md


--------------------------------------------------------------------------------
/large-language-model-note/FlashInfer集成TensorRT-LLM cubin kernel技术分析.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/FlashInfer集成TensorRT-LLM cubin kernel技术分析.md


--------------------------------------------------------------------------------
/large-language-model-note/Fused AllGather_MatMul Triton工程实现.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/Fused AllGather_MatMul Triton工程实现.md


--------------------------------------------------------------------------------
/large-language-model-note/GTX 4090 的 Attention 诡异.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/GTX 4090 的 Attention 诡异.md


--------------------------------------------------------------------------------
/large-language-model-note/GTX 4090 的 cuda graph 诡异.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/GTX 4090 的 cuda graph 诡异.md


--------------------------------------------------------------------------------
/large-language-model-note/MoE之年的总结和MoE 推理优化的一些认识.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/MoE之年的总结和MoE 推理优化的一些认识.md


--------------------------------------------------------------------------------
/large-language-model-note/NVIDIA技术沙龙 《大规模EP优化：PD分离MoE并行方式》课程笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/NVIDIA技术沙龙 《大规模EP优化：PD分离MoE并行方式》课程笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/README.md


--------------------------------------------------------------------------------
/large-language-model-note/RMSNorm的精度陷阱：记一次LLM推理精度调查.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/RMSNorm的精度陷阱：记一次LLM推理精度调查.md


--------------------------------------------------------------------------------
/large-language-model-note/SGLang DP MLA 特性解读.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/SGLang DP MLA 特性解读.md


--------------------------------------------------------------------------------
/large-language-model-note/SGLang MLA 实现解析.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/SGLang MLA 实现解析.md


--------------------------------------------------------------------------------
/large-language-model-note/Zero.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/Zero.md


--------------------------------------------------------------------------------
/large-language-model-note/f0f25684d13445198120eba971d3f105.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/f0f25684d13445198120eba971d3f105.png


--------------------------------------------------------------------------------
/large-language-model-note/inference/hf_inference_cuda_graph.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/inference/hf_inference_cuda_graph.py


--------------------------------------------------------------------------------
/large-language-model-note/inference/hf_inference_cuda_graph_reuse_kv_cache.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/inference/hf_inference_cuda_graph_reuse_kv_cache.py


--------------------------------------------------------------------------------
/large-language-model-note/inference/hf_prefill_decode_split_inference.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/inference/hf_prefill_decode_split_inference.py


--------------------------------------------------------------------------------
/large-language-model-note/kimi k2 thinking SGLang优化.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/large-language-model-note/moe align block size kernel的优化源码.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/moe align block size kernel的优化源码.md


--------------------------------------------------------------------------------
/large-language-model-note/moe_align_block_size_kernel优化详解.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/moe_align_block_size_kernel优化详解.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/DeepSeek-V3 + SGLang: 推理优化 (v0.4.3.post2+sgl-kernel:0.0.3.post6).md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/DeepSeek-V3 + SGLang: 推理优化 (v0.4.3.post2+sgl-kernel:0.0.3.post6).md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/Implement Flash Attention Backend in SGLang - Basics and KV Cache.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/Implement Flash Attention Backend in SGLang - Basics and KV Cache.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/LMCache + SGLang Feature 解读.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/LMCache + SGLang Feature 解读.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang 2025-8月性能优化技巧记录.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang 2025-8月性能优化技巧记录.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang 2025-9月性能优化技巧记录.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang 2025-9月性能优化技巧记录.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang v4.8.0 PD分离性能研究.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang v4.8.0 PD分离性能研究.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang 优化Triton FusedMoE 的一个新技巧​.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang 优化Triton FusedMoE 的一个新技巧​.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang 支持Flash Attention V3 Backend.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang 支持Flash Attention V3 Backend.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang 确定性推理实现技术细节笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang 确定性推理实现技术细节笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang支持Gpt-Oss性能优化博客.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang支持Gpt-Oss性能优化博客.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SGLang的Expert Parallel特性解读.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SGLang的Expert Parallel特性解读.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/SM100 NVFP4 Quant实现笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/SM100 NVFP4 Quant实现笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/bench_one_batch.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/bench_one_batch.py


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/【DistServe 博客翻译】通过PD分离实现LLM服务中的最大有效吞吐量.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/【DistServe 博客翻译】通过PD分离实现LLM服务中的最大有效吞吐量.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/一个Dispatch Dtype引起的fp8 quant kernel性能问题.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/一个Dispatch Dtype引起的fp8 quant kernel性能问题.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/分享一个DeepSeek V3和R1中 Shared Experts和普通Experts融合的小技巧.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/分享一个DeepSeek V3和R1中 Shared Experts和普通Experts融合的小技巧.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/单机H200最快DeepSeek V3和R1推理系统优化秘籍.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/单机H200最快DeepSeek V3和R1推理系统优化秘籍.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/图解DeepSeek V3 biased_grouped_topk cuda融合算子fused_moe_gate kernel.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/图解DeepSeek V3 biased_grouped_topk cuda融合算子fused_moe_gate kernel.md


--------------------------------------------------------------------------------
/large-language-model-note/sglang&lightllm/记录下SGLang 开发，编译和Profile的几个小技巧.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/sglang&lightllm/记录下SGLang 开发，编译和Profile的几个小技巧.md


--------------------------------------------------------------------------------
/large-language-model-note/vLLM PIECEWISE CUDA Graph 技术学习笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/vLLM PIECEWISE CUDA Graph 技术学习笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/vllm-p2p_nccl_connector_zh.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/vllm-p2p_nccl_connector_zh.md


--------------------------------------------------------------------------------
/large-language-model-note/从DeepSeek V3.2 DSA算子看TileLang编译器的细节.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/从DeepSeek V3.2 DSA算子看TileLang编译器的细节.md


--------------------------------------------------------------------------------
/large-language-model-note/使用NCU和Cursor Claude-sonnet-3.5写出高效cuda算子的正确姿势.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/使用NCU和Cursor Claude-sonnet-3.5写出高效cuda算子的正确姿势.md


--------------------------------------------------------------------------------
/large-language-model-note/在 Cache-DiT 框架下实现原生 Serving功能.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/在 Cache-DiT 框架下实现原生 Serving功能.md


--------------------------------------------------------------------------------
/large-language-model-note/强化学习/20250519.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/强化学习/20250519.pdf


--------------------------------------------------------------------------------
/large-language-model-note/强化学习/NVIDIA技术沙龙《强化学习流水线优化：性能分析与 Rollout加速》演讲笔记.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/强化学习/NVIDIA技术沙龙《强化学习流水线优化：性能分析与 Rollout加速》演讲笔记.md


--------------------------------------------------------------------------------
/large-language-model-note/强化学习/verl 源码解读 与 HybridFlow 编程范式讲解.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/强化学习/verl 源码解读 与 HybridFlow 编程范式讲解.md


--------------------------------------------------------------------------------
/large-language-model-note/强化学习/verl: Flexible and Efficient RL for LLMs.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/强化学习/verl: Flexible and Efficient RL for LLMs.md


--------------------------------------------------------------------------------
/large-language-model-note/梳理下Flash Attention的dispatch逻辑.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/梳理下Flash Attention的dispatch逻辑.md


--------------------------------------------------------------------------------
/large-language-model-note/面试.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/large-language-model-note/面试.md


--------------------------------------------------------------------------------
/linear-attention/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/linear-attention/README.md


--------------------------------------------------------------------------------
/linear-attention/causal_product_cuda.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/linear-attention/causal_product_cuda.cu


--------------------------------------------------------------------------------
/meagtron-lm/descriptions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/meagtron-lm/descriptions.py


--------------------------------------------------------------------------------
/meagtron-lm/distributed_init.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/meagtron-lm/distributed_init.py


--------------------------------------------------------------------------------
/meagtron-lm/distributed_init_sw.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/meagtron-lm/distributed_init_sw.py


--------------------------------------------------------------------------------
/meagtron-lm/playground.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/meagtron-lm/playground.py


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络 benchmark.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络 benchmark.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络 debug.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络 debug.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络概述.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】AI系统中的网络概述.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】gpu debug.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】gpu debug.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】加速器 benchmarks.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】加速器 benchmarks.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】大模型推理.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】大模型推理.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】如何写测试.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】如何写测试.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】计算加速器之cpu.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】计算加速器之cpu.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】计算加速器之gpu.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】计算加速器之gpu.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】训练之 performance.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】训练之 performance.md


--------------------------------------------------------------------------------
/ml-engineering/【ml-engineering 翻译系列】训练之模型并行综述.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ml-engineering/【ml-engineering 翻译系列】训练之模型并行综述.md


--------------------------------------------------------------------------------
/mlsys-paper/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/mlsys-paper/README.md


--------------------------------------------------------------------------------
/nvidia-channel/NVIDIA AI 加速精讲堂-TensorRT-LLM 应用与部署.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/nvidia-channel/NVIDIA AI 加速精讲堂-TensorRT-LLM 应用与部署.md


--------------------------------------------------------------------------------
/nvidia-channel/NVIDIA AI 加速精讲堂-TensorRT-LLM量化原理、实现与优化.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/nvidia-channel/NVIDIA AI 加速精讲堂-TensorRT-LLM量化原理、实现与优化.md


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/README.md


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/fused_attention/FusedMultiHeadAttentionInferenceV2.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/fused_attention/FusedMultiHeadAttentionInferenceV2.cpp


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/fused_softmax/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/fused_softmax/Makefile


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/fused_softmax/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/fused_softmax/README.md


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/fused_softmax/fused_scale_mask_softmax.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/fused_softmax/fused_scale_mask_softmax.cu


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/symbol_explain/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/symbol_explain/README.md


--------------------------------------------------------------------------------
/oneflow-cuda-optimize-skills/symbol_explain/symbol.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/oneflow-cuda-optimize-skills/symbol_explain/symbol.h


--------------------------------------------------------------------------------
/ptx-isa/ptx_isa_8.5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ptx-isa/ptx_isa_8.5.pdf


--------------------------------------------------------------------------------
/ptx-isa/ptx_isa_8.5_notes.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/ptx-isa/ptx_isa_8.5_notes.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/    _symmetric_memory.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/    _symmetric_memory.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/Accelerating Generative AI Part III: Diffusion, Fast.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/Accelerating Generative AI Part III: Diffusion, Fast.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/Deploying LLMs with TorchServe + vLLM.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/Deploying LLMs with TorchServe + vLLM.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/FP8 GEMM 的 Hopper TMA 单元深入探讨.md:
--------------------------------------------------------------------------------
1 | https://pytorch.ac.cn/blog/hopper-tma-unit/
2 | 
3 | 


--------------------------------------------------------------------------------
/pytorch-blogs-codes/FlashAttention-3：异步低精度快速准确注意力.md:
--------------------------------------------------------------------------------
1 | https://pytorch.ac.cn/blog/flashattention-3/
2 | 
3 | 


--------------------------------------------------------------------------------
/pytorch-blogs-codes/FlashInfer API速览.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/FlashInfer API速览.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/Flex Attention API 应用 Notebook 代码速览.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/Flex Attention API 应用 Notebook 代码速览.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/INT4 解码 GQA CUDA 优化以用于 LLM 推理.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/INT4 解码 GQA CUDA 优化以用于 LLM 推理.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/Optimized GPTQ INT4 Dequantization Triton Kernels o1-preview 解读.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/Optimized GPTQ INT4 Dequantization Triton Kernels o1-preview 解读.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/PyTorch 原生结构优化：torchao.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/PyTorch 原生结构优化：torchao.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/README.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/Triton Kernel编译阶段.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/Triton Kernel编译阶段.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flash2_a100_fwd_bwd_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flash2_a100_fwd_bwd_benchmark.png


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flash2_h100_fwd_bwd_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flash2_h100_fwd_bwd_benchmark.png


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flash3_fp16_fwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flash3_fp16_fwd.png


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattention_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattention_logo.png


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_banner.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_banner.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_banner.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_banner.pdf


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_memory.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_memory.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_speedup.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_speedup.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_speedup_3090.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_speedup_3090.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_speedup_a100_d128.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_speedup_a100_d128.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_speedup_t4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_speedup_t4.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/flashattn_speedup_t4_fwd.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/flashattn_speedup_t4_fwd.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/gpt2_training_curve.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/gpt2_training_curve.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/gpt2_training_efficiency.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/gpt2_training_efficiency.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/gpt3_training_curve.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/gpt3_training_curve.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/assets/gpt3_training_efficiency.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/assets/gpt3_training_efficiency.jpg


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/README.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/__init__.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/config.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/distributed_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/distributed_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_linear.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_linear.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_linear_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_linear_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_ops.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_ops.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_python_api.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_python_api.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_scaling_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_scaling_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_tensor.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_tensor.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_tensor_parallel.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_tensor_parallel.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/float8_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/float8_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/fsdp_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/fsdp_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/inference.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/inference.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/float8/roofline_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/float8/roofline_utils.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/triton_all_gather_matmul.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/triton_all_gather_matmul.py


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【PyTorch 奇技淫巧】介绍 depyf：轻松掌握 torch.compile.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【PyTorch 奇技淫巧】介绍 depyf：轻松掌握 torch.compile.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【PyTorch 奇淫技巧】Async Checkpoint Save.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【PyTorch 奇淫技巧】Async Checkpoint Save.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【PyTorch 奇淫技巧】Python Custom Operators翻译.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【PyTorch 奇淫技巧】Python Custom Operators翻译.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【博客翻译】Presenting Flux Fast: 让 Flux 在 H100 上疾速飞驰.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【博客翻译】Presenting Flux Fast: 让 Flux 在 H100 上疾速飞驰.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】CUDA-Free Inference for LLMs.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】CUDA-Free Inference for LLMs.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】Flash Attention使用指南.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】Flash Attention使用指南.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】MetaShuffling：加速LLama4 MoE推理.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】MetaShuffling：加速LLama4 MoE推理.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】depyf 相关的开发文档和教程.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】depyf 相关的开发文档和教程.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】torch.compile 的详细示例解析教程.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】torch.compile 的详细示例解析教程.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】【PyTorch 奇技淫巧】FlexAttetion 基于Triton打造灵活度拉满的Attention.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】【PyTorch 奇技淫巧】FlexAttetion 基于Triton打造灵活度拉满的Attention.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】一文了解PyTorch中的Async Tensor Parallelism.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】一文了解PyTorch中的Async Tensor Parallelism.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】不使用checkpoint再加载来做容错训练.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】不使用checkpoint再加载来做容错训练.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】使用PyTorch DCP优化模型Checkpointing效率.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】使用PyTorch DCP优化模型Checkpointing效率.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】使用PyTorch FSDP和Torch.compile最大化训练吞吐量.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】使用PyTorch FSDP和Torch.compile最大化训练吞吐量.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】使用PyTorch FSDP最大化训练吞吐量.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】使用PyTorch FSDP最大化训练吞吐量.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】使用PyTorch加速生成式AI-4-Seamless M4T fast.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】使用PyTorch加速生成式AI-4-Seamless M4T fast.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】在 GPU 上如何加速 GPTQ Triton 反量化kernel.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】在 GPU 上如何加速 GPTQ Triton 反量化kernel.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/【翻译】在FSDP2中开启Float8 All-Gather.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/【翻译】在FSDP2中开启Float8 All-Gather.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用 PyTorch 加速生成式 AI 之 GPT Fast.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用 PyTorch 加速生成式 AI 之 GPT Fast.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用 PyTorch 加速生成式 AI 之 Segment Anything Fast.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用 PyTorch 加速生成式 AI 之 Segment Anything Fast.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用 Triton kernel 加速 Llama3 FP8 推理.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用 Triton kernel 加速 Llama3 FP8 推理.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用 Triton 加速 2D 动态块量化 Float8 GEMM.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用 Triton 加速 2D 动态块量化 Float8 GEMM.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用float8和FSDP2加速训练.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用float8和FSDP2加速训练.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/使用torchtune把LLaMa-3.1 8B蒸馏为1B.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/使用torchtune把LLaMa-3.1 8B蒸馏为1B.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/深入 CUTLASS Ping-Pong GEMM Kernel.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/深入 CUTLASS Ping-Pong GEMM Kernel.md


--------------------------------------------------------------------------------
/pytorch-blogs-codes/深入探讨 Hopper TMA 单元在 FP8 GEMM 运算中的应用.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/pytorch-blogs-codes/深入探讨 Hopper TMA 单元在 FP8 GEMM 运算中的应用.md


--------------------------------------------------------------------------------
/reduce/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/Makefile


--------------------------------------------------------------------------------
/reduce/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/README.md


--------------------------------------------------------------------------------
/reduce/pytorch_block_reduce.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/pytorch_block_reduce.cu


--------------------------------------------------------------------------------
/reduce/reduce_v0_baseline.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v0_baseline.cu


--------------------------------------------------------------------------------
/reduce/reduce_v1_interleaved_addressing.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v1_interleaved_addressing.cu


--------------------------------------------------------------------------------
/reduce/reduce_v2_bank_conflict_free.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v2_bank_conflict_free.cu


--------------------------------------------------------------------------------
/reduce/reduce_v3_idle_threads_free.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v3_idle_threads_free.cu


--------------------------------------------------------------------------------
/reduce/reduce_v4_unroll_last_warp.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v4_unroll_last_warp.cu


--------------------------------------------------------------------------------
/reduce/reduce_v5_completely_unroll.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v5_completely_unroll.cu


--------------------------------------------------------------------------------
/reduce/reduce_v6_multi_add.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v6_multi_add.cu


--------------------------------------------------------------------------------
/reduce/reduce_v7_shfl_down_sync.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v7_shfl_down_sync.cu


--------------------------------------------------------------------------------
/reduce/reduce_v8_shfl_down_sync_pack.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/reduce_v8_shfl_down_sync_pack.cu


--------------------------------------------------------------------------------
/reduce/【CUDA博客】Making vector sum really fast.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/reduce/【CUDA博客】Making vector sum really fast.md


--------------------------------------------------------------------------------
/softmax/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/softmax/Makefile


--------------------------------------------------------------------------------
/softmax/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/softmax/README.md


--------------------------------------------------------------------------------
/softmax/faster_transformer_softmax.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/softmax/faster_transformer_softmax.cu


--------------------------------------------------------------------------------
/softmax/oneflow_softmax.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/softmax/oneflow_softmax.cu


--------------------------------------------------------------------------------
/tests/test_flash_attenton.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/tests/test_flash_attenton.py


--------------------------------------------------------------------------------
/tools/Cursor 平替 Windsurf 使用体验.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/tools/Cursor 平替 Windsurf 使用体验.md


--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/tools/README.md


--------------------------------------------------------------------------------
/tools/hfd.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/tools/hfd.sh


--------------------------------------------------------------------------------
/triton-meetup/3_天数GPU上Triton适配及共享内存管理机制.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/3_天数GPU上Triton适配及共享内存管理机制.pptx


--------------------------------------------------------------------------------
/triton-meetup/FlagGems研发进展概览-0905.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/FlagGems研发进展概览-0905.pptx


--------------------------------------------------------------------------------
/triton-meetup/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/README.md


--------------------------------------------------------------------------------
/triton-meetup/Triton中国生态Meetup(第一期).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/Triton中国生态Meetup(第一期).pdf


--------------------------------------------------------------------------------
/triton-meetup/优化 softmax-陈飞宇终稿.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/优化 softmax-陈飞宇终稿.pptx


--------------------------------------------------------------------------------
/triton-meetup/硅基流动-朱平-MoE-v2.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton-meetup/硅基流动-朱平-MoE-v2.pptx


--------------------------------------------------------------------------------
/triton/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/README.md


--------------------------------------------------------------------------------
/triton/attention_in_pytorch.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/attention_in_pytorch.py


--------------------------------------------------------------------------------
/triton/benchmark_layernorm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/benchmark_layernorm.py


--------------------------------------------------------------------------------
/triton/flash_attention_v1_in_pytorch.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/flash_attention_v1_in_pytorch.py


--------------------------------------------------------------------------------
/triton/flash_attention_v2_in_pytorch.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/flash_attention_v2_in_pytorch.py


--------------------------------------------------------------------------------
/triton/flash_attention_v2_in_triton.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/flash_attention_v2_in_triton.py


--------------------------------------------------------------------------------
/triton/layernorm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/how-to-optim-algorithm-in-cuda/HEAD/triton/layernorm.py


--------------------------------------------------------------------------------