├── .clang-format ├── .editorconfig ├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── CODEOWNERS └── workflows │ ├── documentation.yml │ ├── integration-tests.yml │ ├── llvm-build.yml │ ├── llvm-build │ └── Dockerfile │ ├── test-backends.yml │ ├── torch-inductor-tests.yml │ ├── torch-inductor │ └── scripts │ │ ├── check_acc.py │ │ ├── check_perf.py │ │ ├── common.sh │ │ ├── install_torchinductor.sh │ │ ├── install_triton.sh │ │ ├── run_torchinductor_acc.sh │ │ └── run_torchinductor_perf.sh │ └── wheels.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bin ├── CMakeLists.txt ├── RegisterTritonDialects.h ├── triton-llvm-opt.cpp ├── triton-lsp.cpp ├── triton-opt.cpp └── triton-reduce.cpp ├── cmake ├── FindLLVM.cmake ├── llvm-hash.txt ├── nvidia-toolchain-version.txt └── pybind11-version.txt ├── docs ├── Makefile ├── _templates │ └── versions.html ├── backend │ ├── ldmatrixOperand0.svg │ └── ldmatrixOperand1.svg ├── conf.py ├── getting-started │ ├── installation.rst │ └── tutorials │ │ ├── grouped_vs_row_major_ordering.png │ │ ├── parallel_reduction.png │ │ └── random_bits.png ├── index.rst ├── meetups │ ├── 01-24-2024 │ │ └── notes.md │ ├── 02-20-2024 │ │ ├── Proton.pdf │ │ └── notes.md │ ├── 04-02-2024 │ │ └── notes.md │ ├── 07-18-2023 │ │ └── notes.md │ ├── 08-22-2023 │ │ ├── amd-update.pdf │ │ ├── intel-xpu-update.pptx │ │ └── notes.md │ ├── 10-25-2023 │ │ ├── intel-xpu-update.pdf │ │ ├── notes.md │ │ └── triton-shared.pptx │ ├── 12-13-2023 │ │ └── notes.md │ └── dev-meetup-2023.md ├── programming-guide │ ├── chapter-1 │ │ ├── cuda-parallel-matmul.png │ │ ├── introduction.rst │ │ └── triton-parallel-matmul.png │ └── chapter-2 │ │ ├── halide-iteration.png │ │ ├── polyhedral-iteration.png │ │ └── related-work.rst └── python-api │ ├── triton.language.rst │ ├── triton.rst │ └── triton.testing.rst ├── include ├── CMakeLists.txt └── triton │ ├── Analysis │ ├── Alias.h │ ├── Allocation.h │ ├── AxisInfo.h │ ├── Membar.h │ └── Utility.h │ ├── CMakeLists.txt │ ├── Conversion │ ├── CMakeLists.txt │ ├── MLIRTypes.h │ ├── TritonCPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── PatternTritonCPUOpToLLVM.h │ │ ├── TypeConverter.h │ │ └── Utility.h │ ├── TritonGPUToLLVM │ │ ├── AsmFormat.h │ │ ├── CMakeLists.txt │ │ ├── ElementwiseOpToLLVMBase.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── PatternTritonGPUOpToLLVM.h │ │ ├── Patterns.h │ │ ├── TargetInfoBase.h │ │ ├── TypeConverter.h │ │ └── Utility.h │ ├── TritonToTritonCPU │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TritonToTritonCPUPass.h │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TritonToTritonGPUPass.h │ ├── Dialect │ ├── CMakeLists.txt │ ├── NVGPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── NVGPUAttrDefs.td │ │ │ ├── NVGPUDialect.td │ │ │ └── NVGPUOps.td │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── Interfaces.h │ │ │ ├── Traits.h │ │ │ ├── TritonAttrDefs.td │ │ │ ├── TritonDialect.td │ │ │ ├── TritonInterfaces.td │ │ │ ├── TritonOps.td │ │ │ ├── TritonTypeInterfaces.td │ │ │ ├── TritonTypes.td │ │ │ ├── Types.h │ │ │ └── Utility.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── TritonCPUAttrDefs.td │ │ │ ├── 
TritonCPUDialect.td │ │ │ ├── TritonCPUInterfaces.h │ │ │ ├── TritonCPUOps.td │ │ │ ├── TritonCPUTypes.td │ │ │ └── Types.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── TritonCPUConversion.h │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── TritonGPUAttrDefs.td │ │ │ ├── TritonGPUDialect.td │ │ │ ├── TritonGPUInterfaces.h │ │ │ ├── TritonGPUOps.td │ │ │ ├── TritonGPUTypes.td │ │ │ └── Types.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TritonGPUConversion.h │ │ │ └── Utility.h │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.h │ │ ├── TritonNvidiaGPUAttrDefs.td │ │ ├── TritonNvidiaGPUDialect.td │ │ ├── TritonNvidiaGPUOps.td │ │ ├── TritonNvidiaGPUTypes.td │ │ └── Types.h │ │ └── Transforms │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ └── Passes.td │ ├── Target │ ├── CMakeLists.txt │ └── LLVMIR │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ └── Passes.td │ └── Tools │ └── Sys │ ├── GetEnv.hpp │ └── GetPlatform.hpp ├── lib ├── Analysis │ ├── Alias.cpp │ ├── Allocation.cpp │ ├── AxisInfo.cpp │ ├── CMakeLists.txt │ ├── Membar.cpp │ └── Utility.cpp ├── CMakeLists.txt ├── Conversion │ ├── CMakeLists.txt │ ├── TritonCPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── ControlFlowOpToLLVM.cpp │ │ ├── FuncOpToLLVM.cpp │ │ ├── TritonCPUToLLVM.cpp │ │ └── TypeConverter.cpp │ ├── TritonGPUToLLVM │ │ ├── AllocateSharedMemory.cpp │ │ ├── AssertOpToLLVM.cpp │ │ ├── CMakeLists.txt │ │ ├── ControlFlowOpToLLVM.cpp │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ ├── ConvertLayoutOpToLLVM │ │ │ └── SharedToDotOperandFMA.cpp │ │ ├── DecomposeUnsupportedConversions.cpp │ │ ├── DotOpToLLVM │ │ │ └── FMA.cpp │ │ ├── ElementwiseOpToLLVM.cpp │ │ ├── FuncOpToLLVM.cpp │ │ ├── HistogramOpToLLVM.cpp │ │ ├── MakeRangeOpToLLVM.cpp │ │ ├── MemoryOpToLLVM.cpp │ │ ├── PrintOpToLLVM.cpp │ │ ├── ReduceOpToLLVM.cpp │ │ ├── ReduceScanCommon.h │ │ ├── SPMDOpToLLVM.cpp │ │ ├── ScanOpToLLVM.cpp │ │ ├── TypeConverter.cpp │ │ ├── Utility.cpp │ │ └── ViewOpToLLVM.cpp │ ├── TritonToTritonCPU │ │ ├── CMakeLists.txt │ │ ├── TritonCPUConversion.cpp │ │ └── TritonToTritonCPUPass.cpp │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── TritonGPUConversion.cpp │ │ └── TritonToTritonGPUPass.cpp ├── Dialect │ ├── CMakeLists.txt │ ├── NVGPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ └── Dialect.cpp │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ ├── Ops.cpp │ │ │ ├── Traits.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Combine.cpp │ │ │ ├── Combine.td │ │ │ ├── ReorderBroadcast.cpp │ │ │ └── RewriteTensorPointer.cpp │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ └── CMakeLists.txt │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── AccelerateMatmul.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── Coalesce.cpp │ │ │ ├── F32DotTC.cpp │ │ │ ├── OptimizeDotOperands.cpp │ │ │ ├── OptimizeThreadLocality.cpp │ │ │ ├── Pipeliner │ │ │ ├── MatmulLoopPipeline.cpp │ │ │ ├── OuterLoopPipeline.cpp │ │ │ ├── PipelineExpander.cpp │ │ │ ├── PipelineExpander.h │ │ │ ├── PipeliningUtility.cpp │ │ │ ├── PipeliningUtility.h │ │ │ ├── Schedule.h │ │ │ └── SoftwarePipeliner.cpp │ │ │ ├── 
Prefetch.cpp │ │ │ ├── ReduceDataDuplication.cpp │ │ │ ├── RemoveLayoutConversions.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ └── Utility.cpp │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.cpp │ │ ├── Ops.cpp │ │ └── Types.cpp │ │ └── Transforms │ │ ├── CMakeLists.txt │ │ ├── FenceInsertion.cpp │ │ └── PlanCTA.cpp └── Target │ ├── CMakeLists.txt │ └── LLVMIR │ ├── CMakeLists.txt │ ├── LLVMDIScope.cpp │ ├── LLVMIRBreakPhiStruct.cpp │ └── LLVMPasses.h ├── pyproject.toml ├── python ├── MANIFEST.in ├── examples │ ├── copy_strided.py │ └── empty.py ├── pyproject.toml ├── setup.py ├── src │ ├── interpreter.cc │ ├── ir.cc │ ├── llvm.cc │ ├── main.cc │ ├── passes.cc │ └── passes.h ├── test │ ├── backend │ │ ├── extension_backend.c │ │ ├── test_device_backend.py │ │ └── third_party_backends │ │ │ ├── conftest.py │ │ │ └── test_xpu_backend.py │ ├── kernel_comparison │ │ └── kernels.yml │ ├── regression │ │ ├── test_cast_matmul.py │ │ ├── test_functional_regressions.py │ │ └── test_performance.py │ └── unit │ │ ├── conftest.py │ │ ├── hopper │ │ ├── __init__.py │ │ ├── test_flashattention.py │ │ ├── test_gemm.py │ │ ├── test_gemm_fusion.py │ │ ├── test_mixed_io.py │ │ ├── test_persistent_warp_specialized_fused-attention.py │ │ ├── test_persistent_warp_specialized_gemm.py │ │ └── test_tma_store_gemm.py │ │ ├── language │ │ ├── assert_helper.py │ │ ├── conftest.py │ │ ├── print_helper.py │ │ ├── test_annotations.py │ │ ├── test_block_pointer.py │ │ ├── test_compile_errors.py │ │ ├── test_conversions.py │ │ ├── test_core.py │ │ ├── test_decorator.py │ │ ├── test_line_info.py │ │ ├── test_random.py │ │ ├── test_reproducer.py │ │ ├── test_standard.py │ │ └── test_subprocess.py │ │ ├── operators │ │ ├── conftest.py │ │ ├── test_blocksparse.py │ │ ├── test_cross_entropy.py │ │ ├── test_flash_attention.py │ │ ├── test_inductor.py │ │ └── test_matmul.py │ │ ├── runtime │ │ ├── test_autotuner.py │ │ ├── test_bindings.py │ │ ├── test_cache.py │ │ ├── test_driver.py │ │ ├── test_jit.py │ │ ├── test_launch.py │ │ └── test_subproc.py │ │ └── tools │ │ └── test_aot.py ├── triton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── backends │ │ ├── __init__.py │ │ ├── compiler.py │ │ └── driver.py │ ├── compiler │ │ ├── __init__.py │ │ ├── code_generator.py │ │ ├── compiler.py │ │ ├── errors.py │ │ └── make_launcher.py │ ├── errors.py │ ├── language │ │ ├── __init__.py │ │ ├── core.py │ │ ├── extra │ │ │ ├── __init__.py │ │ │ └── cuda │ │ │ │ ├── __init__.py │ │ │ │ ├── libdevice.py │ │ │ │ └── utils.py │ │ ├── math.py │ │ ├── random.py │ │ ├── semantic.py │ │ └── standard.py │ ├── ops │ │ ├── __init__.py │ │ ├── blocksparse │ │ │ ├── __init__.py │ │ │ ├── matmul.py │ │ │ └── softmax.py │ │ ├── cross_entropy.py │ │ ├── flash_attention.py │ │ ├── matmul.py │ │ └── matmul_perf_model.py │ ├── runtime │ │ ├── __init__.py │ │ ├── autotuner.py │ │ ├── build.py │ │ ├── cache.py │ │ ├── driver.py │ │ ├── errors.py │ │ ├── interpreter.py │ │ └── jit.py │ ├── testing.py │ └── tools │ │ ├── __init__.py │ │ ├── build_extern.py │ │ ├── compile.c │ │ ├── compile.h │ │ ├── compile.py │ │ ├── disasm.py │ │ └── link.py └── tutorials │ ├── 01-vector-add.py │ ├── 02-fused-softmax.py │ ├── 03-matrix-multiplication.py │ ├── 04-low-memory-dropout.py │ ├── 05-layer-norm.py │ ├── 06-fused-attention.py │ ├── 07-extern-functions.py │ ├── 08-grouped-gemm.py │ └── README.rst ├── test ├── Analysis │ ├── test-alias.mlir │ ├── test-alignment.mlir │ ├── test-allocation.mlir │ └── test-membar.mlir ├── 
CMakeLists.txt ├── Conversion │ ├── amd │ │ ├── decompose-unsupported-conversions.mlir │ │ ├── fp_to_fp.mlir │ │ ├── load_store.mlir │ │ └── tritongpu_wmma_dot_to_llvm.mlir │ ├── dedup-by-constancy.mlir │ ├── divide-by-0.mlir │ ├── triton_to_tritongpu.mlir │ ├── tritongpu_to_llvm.mlir │ ├── tritongpu_to_llvm_hopper.mlir │ └── tritongpu_to_llvm_volta.mlir ├── LLVMIR │ └── break-phi-struct.ll ├── NVGPU │ ├── test_cga.mlir │ └── test_wgmma.mlir ├── Triton │ ├── canonicalize.mlir │ ├── combine.mlir │ ├── invalid.mlir │ ├── ops.mlir │ ├── reorder-broadcast.mlir │ ├── reproducer.mlir │ ├── rewrite-tensor-pointer.mlir │ ├── vecadd.mlir │ └── verify-make-range.mlir ├── TritonGPU │ ├── accelerate-matmul.mlir │ ├── amd │ │ ├── accelerate-amd-matmul-wmma.mlir │ │ └── amd-reorder-instructions.mlir │ ├── atomic-cas.mlir │ ├── canonicalize.mlir │ ├── coalesce.mlir │ ├── combine.mlir │ ├── dot-operands.mlir │ ├── fence-inserstion.mlir │ ├── invalid.mlir │ ├── loop-pipeline-hopper.mlir │ ├── loop-pipeline.mlir │ ├── matmul.mlir │ ├── ops.mlir │ ├── optimize-locality.mlir │ ├── optimize_epilogue.mlir │ ├── pipeline-hopper-remove-wait.mlir │ ├── prefetch.mlir │ ├── reduce-data-duplication.mlir │ ├── reorder-instructions.mlir │ ├── tritongpu_ops.mlir │ └── verify-blocked-layout.mlir ├── lib │ ├── Analysis │ │ ├── CMakeLists.txt │ │ ├── TestAlias.cpp │ │ ├── TestAllocation.cpp │ │ ├── TestAxisInfo.cpp │ │ └── TestMembar.cpp │ └── CMakeLists.txt ├── lit.cfg.py └── lit.site.cfg.py.in ├── third_party ├── amd │ ├── CMakeLists.txt │ ├── backend │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ ├── include │ │ │ └── hip │ │ │ │ ├── amd_detail │ │ │ │ ├── amd_channel_descriptor.h │ │ │ │ ├── amd_device_functions.h │ │ │ │ ├── amd_hip_atomic.h │ │ │ │ ├── amd_hip_bf16.h │ │ │ │ ├── amd_hip_bfloat16.h │ │ │ │ ├── amd_hip_common.h │ │ │ │ ├── amd_hip_complex.h │ │ │ │ ├── amd_hip_cooperative_groups.h │ │ │ │ ├── amd_hip_fp16.h │ │ │ │ ├── amd_hip_gl_interop.h │ │ │ │ ├── amd_hip_math_constants.h │ │ │ │ ├── amd_hip_runtime.h │ │ │ │ ├── amd_hip_runtime_pt_api.h │ │ │ │ ├── amd_hip_unsafe_atomics.h │ │ │ │ ├── amd_hip_vector_types.h │ │ │ │ ├── amd_math_functions.h │ │ │ │ ├── amd_surface_functions.h │ │ │ │ ├── amd_warp_functions.h │ │ │ │ ├── concepts.hpp │ │ │ │ ├── device_library_decls.h │ │ │ │ ├── functional_grid_launch.hpp │ │ │ │ ├── grid_launch.h │ │ │ │ ├── grid_launch.hpp │ │ │ │ ├── grid_launch_GGL.hpp │ │ │ │ ├── helpers.hpp │ │ │ │ ├── hip_cooperative_groups_helper.h │ │ │ │ ├── hip_fp16_gcc.h │ │ │ │ ├── hip_fp16_math_fwd.h │ │ │ │ ├── hip_ldg.h │ │ │ │ ├── hip_prof_str.h │ │ │ │ ├── hip_runtime_prof.h │ │ │ │ ├── host_defines.h │ │ │ │ ├── hsa_helpers.hpp │ │ │ │ ├── macro_based_grid_launch.hpp │ │ │ │ ├── math_fwd.h │ │ │ │ ├── ockl_image.h │ │ │ │ ├── program_state.hpp │ │ │ │ ├── texture_fetch_functions.h │ │ │ │ └── texture_indirect_functions.h │ │ │ │ ├── channel_descriptor.h │ │ │ │ ├── device_functions.h │ │ │ │ ├── driver_types.h │ │ │ │ ├── hip_bf16.h │ │ │ │ ├── hip_bfloat16.h │ │ │ │ ├── hip_common.h │ │ │ │ ├── hip_complex.h │ │ │ │ ├── hip_cooperative_groups.h │ │ │ │ ├── hip_deprecated.h │ │ │ │ ├── hip_ext.h │ │ │ │ ├── hip_fp16.h │ │ │ │ ├── hip_gl_interop.h │ │ │ │ ├── hip_hcc.h │ │ │ │ ├── hip_math_constants.h │ │ │ │ ├── hip_profile.h │ │ │ │ ├── hip_runtime.h │ │ │ │ ├── hip_runtime_api.h │ │ │ │ ├── hip_texture_types.h │ │ │ │ ├── hip_vector_types.h │ │ │ │ ├── hip_version.h │ │ │ │ ├── hiprtc.h │ │ │ │ ├── library_types.h │ │ │ │ ├── math_functions.h │ │ │ │ ├── 
surface_types.h │ │ │ │ └── texture_types.h │ │ └── lib │ │ │ ├── cuda2gcn.bc │ │ │ ├── ockl.bc │ │ │ ├── ocml.bc │ │ │ └── opencl.bc │ ├── include │ │ ├── CMakeLists.txt │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── GCNAsmFormat.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ └── TritonAMDGPUTransforms │ │ │ ├── CMakeLists.txt │ │ │ ├── MfmaGroup.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── TritonGPUConversion.h │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM │ │ │ │ ├── SharedToDotOperandHelper.cpp │ │ │ │ ├── SharedToDotOperandHelper.h │ │ │ │ ├── SharedToDotOperandMFMA.cpp │ │ │ │ └── SharedToDotOperandWMMA.cpp │ │ │ ├── DecomposeUnsupportedConversions.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── DotOpToLLVM │ │ │ │ ├── MFMA.cpp │ │ │ │ └── WMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── GCNAsmFormat.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ │ └── TritonAMDGPUTransforms │ │ │ ├── AccelerateAMDMatmul.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── MfmaGroup.cpp │ │ │ ├── OptimizeEpilogue.cpp │ │ │ ├── RemoveLayoutConversions.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ └── StreamPipeline.cpp │ └── python │ │ └── triton_amd.cc ├── cpu │ ├── CMakeLists.txt │ ├── backend │ │ ├── compiler.py │ │ └── driver.py │ └── triton_cpu.cc ├── nvidia │ ├── CMakeLists.txt │ ├── backend │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ ├── include │ │ │ └── cuda.h │ │ └── lib │ │ │ └── libdevice.10.bc │ ├── include │ │ ├── CMakeLists.txt │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── NVGPUToLLVMPass.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ └── TritonNVIDIAGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── PTXAsmFormat.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ └── NVGPUToLLVMPass.cpp │ │ └── TritonNVIDIAGPUToLLVM │ │ │ ├── BarrierOpToLLVM.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── ClusterOpsToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM │ │ │ ├── SharedToDotOperandMMAv1.cpp │ │ │ └── SharedToDotOperandMMAv2.cpp │ │ │ ├── DecomposeUnsupportedConversions.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── DotOpToLLVM │ │ │ ├── MMAv1.cpp │ │ │ ├── MMAv2.cpp │ │ │ └── WGMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ │ │ ├── PTXAsmFormat.cpp │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TensorPtrOpsToLLVM.cpp │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ └── triton_nvidia.cc └── proton │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── csrc │ ├── Proton.cpp │ ├── include │ │ ├── Context │ │ │ ├── Context.h │ │ │ ├── Python.h │ │ │ └── Shadow.h │ │ ├── Data │ │ │ ├── Data.h │ │ │ ├── Metric.h │ │ │ ├── TraceData.h │ │ │ └── TreeData.h │ │ ├── Driver │ │ │ ├── Dispatch.h │ │ │ └── GPU │ │ │ │ ├── Cuda.h │ │ │ │ └── Cupti.h │ │ ├── Profiler │ │ │ ├── CuptiProfiler.h │ │ │ └── Profiler.h │ │ ├── Proton.h │ │ ├── Session │ │ │ └── Session.h │ │ └── Utility │ │ │ ├── Errors.h │ │ │ ├── Singleton.h │ │ │ ├── String.h │ │ │ └── Traits.h │ └── lib │ │ ├── Context │ │ ├── Context.cpp │ │ ├── Python.cpp │ │ └── Shadow.cpp │ │ ├── Data │ │ ├── Data.cpp │ │ ├── 
TraceData.cpp │ │ └── TreeData.cpp │ │ ├── Driver │ │ └── GPU │ │ │ ├── Cuda.cpp │ │ │ └── Cupti.cpp │ │ ├── Profiler │ │ └── CuptiProfiler.cpp │ │ └── Session │ │ └── Session.cpp │ ├── proton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── flags.py │ ├── hook.py │ ├── profile.py │ ├── scope.py │ └── viewer.py │ ├── test │ ├── test_api.py │ ├── test_lib.py │ ├── test_profile.py │ └── test_viewer.py │ └── tutorials │ ├── dynamic_net.py │ └── matmul.py ├── unittest ├── Analysis │ ├── CMakeLists.txt │ └── UtilityTest.cpp ├── CMakeLists.txt ├── Conversion │ ├── CMakeLists.txt │ └── TritonGPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── DumpLayout.cpp │ │ ├── DumpLayout.h │ │ ├── EmitIndicesTest.cpp │ │ └── PTXAsmFormatTest.cpp ├── Dialect │ ├── CMakeLists.txt │ └── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── DialectTest.cpp │ │ └── SwizzleTest.cpp └── googletest.cmake └── utils └── nightly.pypirc /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_style = space 9 | indent_size = 4 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | 13 | [*.py] 14 | indent_size = 4 15 | src_paths=python 16 | 17 | [*.{yaml,yml}] 18 | indent_size = 2 19 | 20 | [*.md] 21 | indent_size = 2 22 | x-soft-wrap-text = true 23 | 24 | [*.rst] 25 | indent_size = 4 26 | x-soft-wrap-text = true 27 | 28 | [{CMakeLists.txt,*.cmake}] 29 | indent_size = 2 30 | 31 | [Makefile] 32 | indent_style = tab 33 | 34 | [*.{c,cc,cpp,h,hpp,cu,cuh}] 35 | indent_size = 2 36 | 37 | [*.mlir] 38 | indent_size = 2 39 | 40 | [*.td] 41 | indent_size = 4 42 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # W503 (linebreak occurred before binary operator) seems to be enabled by 3 | # default, even though it goes against pep8 and is incompatible with W504 4 | # (linebreak occurred *after* binary operator). Disable it. 5 | ignore = E501,E701,E731,W503 6 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Commits listed here are ignored by `git blame`. Add "big and uninteresting 2 | # changes" here. Don't forget that it has to be a separate commit (and, because 3 | # our automation squashes PRs, a separate PR)! 4 | # 5 | # Run the following command to teach your `git blame` to pick up this file.
6 | # 7 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs 8 | 9 | 841a77d1b5961b43e1b64e5265bdfe52c133574d 10 | cb68a0d9d501657258ed9f7ad7610d0784c9be9a 11 | 03184de8b535bb24fb1f49cc1f5e008bcbaa73ef 12 | bc4a8e66da036fafc01b87ee9e210df7ee8fb738 13 | 846d6e7e77891706d179b20f27b1278ac3b9a9ac 14 | 0327b9d32db6d1d63d207ccab722bd45e00a6678 15 | df08301e76a56d9ab3f36ff00ab7133672baa8d3 16 | f88b01f558df06f010a869e01473253a5f5cd8db 17 | 312cf97e147e962562877026fd82c928cf6eaa30 18 | 53d868113a706988394134ca1f7f85cb3016cc81 19 | 539fbe5049570f29e73dc6843f984cd4913c5505 20 | 053af4e9f8f005e1bc3f8ac9bf285eaf0ac9bf72 21 | 5b36cb48ad9ce566dd24ff7183f207a1cb9358b5 22 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @ptillet will be requested for review when someone 4 | # opens a pull request. 5 | * @ptillet 6 | 7 | # -------- 8 | # Analyses 9 | # -------- 10 | # Alias analysis 11 | include/triton/Analysis/Alias.h @Jokeren 12 | lib/Analysis/Alias.cpp @Jokeren 13 | # Allocation analysis 14 | include/triton/Analysis/Allocation.h @Jokeren 15 | lib/Analysis/Allocation.cpp @Jokeren 16 | # Membar analysis 17 | include/triton/Analysis/Membar.h @Jokeren 18 | lib/Analysis/Membar.cpp @Jokeren 19 | # AxisInfo analysis 20 | include/triton/Analysis/AxisInfo.h @ptillet 21 | lib/Analysis/AxisInfo.cpp @ptillet 22 | # Utilities 23 | include/triton/Analysis/Utility.h @Jokeren 24 | lib/Analysis/Utility.cpp @Jokeren 25 | 26 | # ---------- 27 | # Dialects 28 | # ---------- 29 | # Pipeline pass 30 | lib/Dialect/TritonGPU/Transforms/Pipeline.cpp @ptillet 31 | # Prefetch pass 32 | lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @ptillet 33 | # Coalesce pass 34 | lib/Dialect/TritonGPU/Transforms/Coalesce.cpp @ptillet 35 | # Layout simplification pass 36 | lib/Dialect/TritonGPU/Transforms/Combine.cpp @ptillet 37 | 38 | # ----------- 39 | # Conversions 40 | # ----------- 41 | # TritonToTritonGPU 42 | include/triton/Conversion/TritonToTritonGPU/ @ptillet 43 | lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp @ptillet 44 | 45 | # ----------- 46 | # third_party 47 | # ----------- 48 | third_party/amd/ @antiagainst @zhanglx13 49 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor-tests.yml: -------------------------------------------------------------------------------- 1 | name: Torchinductor 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Wheels"] 6 | types: [completed] 7 | 8 | permissions: read-all 9 | 10 | jobs: 11 | Runner-Preparation: 12 | runs-on: ubuntu-latest 13 | outputs: 14 | matrix: ${{ steps.set-matrix.outputs.matrix }} 15 | steps: 16 | - name: Prepare runner matrix 17 | id: set-matrix 18 | run: | 19 | echo 'matrix=[["self-hosted", "A100"]]' >> "$GITHUB_OUTPUT" 20 | 21 | Integration-Tests: 22 | needs: Runner-Preparation 23 | timeout-minutes: 240 # 4 hours 24 | runs-on: ${{ matrix.runner }} 25 | strategy: 26 | matrix: 27 | runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix)}} 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | - name: Packages 32 | run: | 33 | ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench 34 | - name: Environment 35 | run: | 36 | source /opt/torchinductor_venv/bin/activate 37 |
./.github/workflows/torch-inductor/scripts/install_triton.sh 38 | - name: Performance 39 | run: | 40 | ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench 41 | # Takes too long to run 42 | #- name: Accuracy 43 | # run: | 44 | # ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh 45 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/check_acc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | file_path = sys.argv[1] 5 | with open(file_path) as f: 6 | reader = csv.reader(f) 7 | # each row: device, model name, batch size, accuracy status 8 | for i, row in enumerate(reader): 9 | if i == 0: 10 | continue  # skip the header row 11 | if row[3] != "pass": 12 | print(f"{row[1]} failed on device {row[0]} with batch size {row[2]}") 13 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TEST_REPORTS_DIR=/opt/torchinductor_reports 4 | PYTORCH_DIR=/opt/pytorch 5 | MODELS=(timm_models huggingface torchbench) 6 | 7 | echo "$TEST_REPORTS_DIR" 8 | echo "$PYTORCH_DIR" 9 | echo "${MODELS[@]}" 10 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/install_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remember where we started 4 | ROOT="$(pwd)" 5 | 6 | # shellcheck source=/dev/null 7 | source /opt/torchinductor_venv/bin/activate 8 | # shellcheck source=/dev/null 9 | source ./.github/workflows/torch-inductor/scripts/common.sh 10 | 11 | # build our own triton: the pytorch-triton nightly is installed first to pull in its dependencies, then replaced by an editable install of this checkout (the nightly wheel is removed right after) 12 | cd python || exit 13 | pip3 install --pre pytorch-triton --extra-index-url https://download.pytorch.org/whl/nightly/cu118 14 | rm -rf build 15 | pip3 install -e .
16 | pip3 uninstall pytorch-triton -y 17 | 18 | # clean up cache 19 | rm -rf /tmp/torchinductor_root/ 20 | rm -rf ~/.triton/cache 21 | rm -rf "$TEST_REPORTS_DIR" 22 | 23 | # go back to where we started 24 | cd "$ROOT" || exit 25 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remember where we started 4 | ROOT="$(pwd)" 5 | INDUCTOR="$ROOT"/.github/workflows/torch-inductor 6 | MODEL_SPEC=$1 7 | 8 | # shellcheck source=/dev/null 9 | source /opt/torchinductor_venv/bin/activate 10 | # shellcheck source=/dev/null 11 | source "$INDUCTOR"/scripts/common.sh 12 | 13 | cd "$PYTORCH_DIR" || exit 14 | TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc 15 | mkdir -p "$TEST_REPORTS_DIR" 16 | 17 | for model in "${MODELS[@]}"; do 18 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then 19 | continue 20 | fi 21 | echo "Running accuracy test for $model" 22 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \ 23 | --output "$TEST_REPORTS_DIR"/inference_"$model".csv 24 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --training --amp --device cuda \ 25 | --output "$TEST_REPORTS_DIR"/training_"$model".csv 26 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --dynamic-shapes --device cuda \ 27 | --output "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv 28 | done 29 | 30 | cd "$ROOT" || exit 31 | for model in "${MODELS[@]}"; do 32 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then 33 | continue 34 | fi 35 | echo "Checking accuracy test for $model" 36 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv 37 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv 38 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv 39 | done 40 | 41 | # go back to where we started 42 | cd "$ROOT" || exit 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Triton builds 2 | build/ 3 | build-*/ 4 | 5 | # Triton Python module builds 6 | python/build/ 7 | python/triton.egg-info/ 8 | python/triton/_C/libtriton.pyd 9 | python/triton/_C/libtriton.so 10 | 11 | # Backends copied from submodules 12 | python/triton/backends/ 13 | !python/triton/backends/__init__.py 14 | !python/triton/backends/compiler.py 15 | !python/triton/backends/driver.py 16 | 17 | # Proton 18 | python/triton/profiler 19 | 20 | # Python caches 21 | __pycache__/ 22 | *.py[cod] 23 | .pytest_cache 24 | 25 | # Environments 26 | .venv 27 | venv/ 28 | venv.bak/ 29 | 30 | # VS Code project files 31 | .vscode 32 | .vs 33 | 34 | # JetBrains project files 35 | .idea 36 | cmake-build-* 37 | 38 | # Third-party binaries 39 | cuobjdump 40 | nvdisasm 41 | ptxas 42 | 43 | # Docs 44 | docs/_build/ 45 | docs/python-api/generated/ 46 | docs/dialects/ 47 | docs/getting-started/tutorials 48 | docs/sg_execution_times.rst 49 | !python/tutorials/*.py 50 | !python/tutorials/*.rst 51 | 52 | # clangd index. 
(".clangd" is a config file now, thus trailing slash) 53 | .clangd/ 54 | .cache 55 | /compile_commands.json 56 | .vscode 57 | .vs 58 | 59 | # Vim 60 | *.swp 61 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-symlinks 6 | - id: destroyed-symlinks 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-toml 11 | - id: check-ast 12 | - id: check-added-large-files 13 | - id: check-merge-conflict 14 | - id: check-executables-have-shebangs 15 | - id: check-shebang-scripts-are-executable 16 | - id: detect-private-key 17 | - id: debug-statements 18 | 19 | - repo: https://github.com/astral-sh/ruff-pre-commit 20 | rev: v0.1.3 21 | hooks: 22 | - id: ruff 23 | files: '^python/.*' 24 | args: ["--fix", "--line-length", "120"] 25 | stages: [commit, push, manual] 26 | exclude: | 27 | (?x)( 28 | ^python/triton/runtime/.*| 29 | ^test/| 30 | ^docs/conf.py$ 31 | ) 32 | 33 | - repo: https://github.com/google/yapf 34 | rev: be72557 35 | hooks: 36 | - id: yapf 37 | args: ["-p", "-i"] 38 | stages: [commit, push, manual] 39 | exclude: "python/test/unit/language/test_line_info.py" 40 | 41 | - repo: https://github.com/pre-commit/mirrors-clang-format 42 | rev: v16.0.6 43 | hooks: 44 | - id: clang-format 45 | stages: [commit, push, manual] 46 | 47 | exclude: | 48 | (?x)( 49 | ^include/triton/external/| 50 | ^third_party/amd/backend/include/hip/| 51 | ^third_party/amd/backend/lib/| 52 | ^third_party/nvidia/backend/include/cuda.h 53 | ) 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018-2020 Philippe Tillet 3 | * Copyright 2020-2022 OpenAI 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files 7 | * (the "Software"), to deal in the Software without restriction, 8 | * including without limitation the rights to use, copy, modify, merge, 9 | * publish, distribute, sublicense, and/or sell copies of the Software, 10 | * and to permit persons to whom the Software is furnished to do so, 11 | * subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | -------------------------------------------------------------------------------- /bin/triton-lsp.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | mlir::MLIRContext context(registry); 10 | return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry)); 11 | } 12 | -------------------------------------------------------------------------------- /bin/triton-opt.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-opt/MlirOptMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | return mlir::asMainReturnCode(mlir::MlirOptMain( 10 | argc, argv, "Triton (GPU) optimizer driver\n", registry)); 11 | } 12 | -------------------------------------------------------------------------------- /bin/triton-reduce.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-reduce/MlirReduceMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | mlir::MLIRContext context(registry); 10 | return mlir::failed(mlir::mlirReduceMain(argc, argv, context)); 11 | } 12 | -------------------------------------------------------------------------------- /cmake/llvm-hash.txt: -------------------------------------------------------------------------------- 1 | ed4e505c219fe6c7464ea5a056e90d8cd94c7332 2 | -------------------------------------------------------------------------------- /cmake/nvidia-toolchain-version.txt: -------------------------------------------------------------------------------- 1 | 12.4.99 2 | -------------------------------------------------------------------------------- /cmake/pybind11-version.txt: -------------------------------------------------------------------------------- 1 | 2.11.1 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Triton 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/versions.html: -------------------------------------------------------------------------------- 1 | {%- if current_version %} 2 |
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
3 | <span class="rst-current-version" data-toggle="rst-current-version">
4 | <span class="fa fa-book"> Other Versions</span>
5 | v: {{ current_version.name }}
6 | <span class="fa fa-caret-down"></span>
7 | </span>
8 | <div class="rst-other-versions">
9 | {%- if versions.tags %}
10 | <dl>
11 | <dt>Tags</dt>
12 | {%- for item in versions.tags %}
13 | <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
14 | {%- endfor %}
15 | </dl>
16 | {%- endif %}
17 | {%- if versions.branches %}
18 | <dl>
19 | <dt>Branches</dt>
20 | {%- for item in versions.branches %}
21 | <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
22 | {%- endfor %}
23 | </dl>
24 | {%- endif %}
25 | </div>
26 | </div>
27 | {%- endif %} 28 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/grouped_vs_row_major_ordering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/parallel_reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/parallel_reduction.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/random_bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/random_bits.png -------------------------------------------------------------------------------- /docs/meetups/01-24-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. 3rd party refactoring backend update. 5 | 2. AMD update about experience with refactored backend and new process. 6 | 3. Plan to restore the Intel XPU backend as third-party module. 7 | 4. Open discussion. 8 | 9 | ##### Minutes: 10 | Recording link [here](https://youtu.be/uRlqolhNbRk) 11 | 12 | 1. 3rd party refactoring backend update. 13 | - Backends are passes, and IRs are shared across backends to avoid divergence and duplication, so that developers do not have to change the Triton source code. 14 | - To discover backend forks in directories, put environment vars in setup.py (see the sketch at the end of these notes). 15 | - Backends can link whatever library they want; they don’t need to copy-paste Nvidia code. 16 | - Nvidia uses the same API as other backends (refactoring of the C++ code still remains). No special casing for Nvidia code. 17 | - If the Triton dependency is on top of the main branch, then it will work for forks/branches. 18 | - Still remaining: LLVM IR conversion – reusable pattern rewriters update; reduce complexity in statefulness in Triton GPU – inherit from base pattern. 19 | 2. AMD update about experience with refactored backend and new process. 20 | - Skipped due to lack of time. Will be covered in the February meetup. 21 | 3. Plan to restore the Intel XPU backend as third-party module. 22 | - Prereqs to upstream – will take into account the system HW and SW, with perf expected to be ~80% of Nvidia, to allow upstreaming. 23 | - Consider how useful it is for AI research to allow upstreaming – as it impacts the maintenance cost of the backends. 24 | - Don’t have plans to upstream mobile backends. 25 | - Intel will hold an offline discussion with OpenAI about being in-tree.
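A minimal sketch of the env-var backend discovery mentioned in item 1 (an editor's illustration, not part of the original notes: the exact variable name and layout depend on the setup.py in your checkout; `TRITON_PLUGIN_DIRS` is the mechanism used by out-of-tree plugins such as triton-shared, and the path below is hypothetical):

```bash
# Hypothetical out-of-tree backend checkout; setup.py scans the listed
# directories and registers each plugin backend it finds there.
export TRITON_PLUGIN_DIRS=/path/to/my-triton-backend
pip install -e python  # rebuild Triton with the extra backend included
```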
26 | -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/Proton.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/02-20-2024/Proton.pdf -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Intel update 5 | 2. AMD update 6 | 3. Profiler update 7 | 4. We are in the process of transitioning to a pro slack plan, so everybody will be able to see history. Expect this to take a few more weeks. 8 | 5. We are still working on finalizing a document about our technical governance structure. Expect this to take a few more weeks too. 9 | 6. Open discussion. 10 | 11 | ##### Minutes: 12 | Recording link [here](https://youtu.be/JDQCdj18Snc) 13 | 14 | 1. Intel GPU integration with Triton and PyTorch: 15 | - No strong requirement from PyTorch for specific backends to be part of the official Triton release. 16 | - Can use a separate branch/fork for CI/CD and testing. 17 | - The Intel team will work with PyTorch offline to close. 18 | 2. AMD GPU backend update: 19 | - The AMD team shared the refactored design for the AMD backend. 20 | - The new design is modularized and reduces clutter and duplication in upstream Triton. 21 | - Further work needed for regression testing and secure runners. 22 | 3. Proton profiler update: 23 | - Keren from the OpenAI team presented a new profiler tool for Triton kernels, which supports multiple vendors, metrics, and formats. 24 | - Outlined the plan for open-sourcing, integrating, and extending the tool. 25 | -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/amd-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/amd-update.pdf -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/intel-xpu-update.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/intel-xpu-update.pptx -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/intel-xpu-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/intel-xpu-update.pdf -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. H100 updates 5 | 2. Triton-Shared layer updates 6 | 3. Intel update 7 | 4. Open discussion 8 | 9 | ##### Minutes: 10 | Recording link [here](https://youtu.be/KZAzpKx1ebI) 11 | 12 | 1. H100 updates 13 | - Enabled WGMMA by default, now any matmul can reuse it.
14 | - fp8 formats enabled – 1.3 petaflops on dense matmul on H100 (GEMM performance) 15 | - Enabled Flash Attention using wgmma, resulting in 450 teraflops on the fwd pass and 250 on the backward pass – still working on perf for flash attention 16 | - fp8 numbers: running flash attention in fp8 with matmul is tricky, because the fp8 layout is significantly different from what is returned by wgmma; still WIP 17 | 18 | 2. Triton-Shared layer 19 | - Please refer to slides for more details 20 | - Created a repo where you can find the middle layer 21 | - Available as a plugin for Triton 22 | 23 | 3. Intel Update 24 | - Please refer to slides for more details 25 | -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/triton-shared.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/triton-shared.pptx -------------------------------------------------------------------------------- /docs/meetups/12-13-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Refactoring plan for 3rd party backends 5 | 2. Front end refactoring (AMD) 6 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton-Shared for GPU development? 7 | 8 | ##### Minutes: 9 | Recording link [here](https://youtu.be/Lo43DQYkOWM) 10 | 11 | 1. Refactoring plan for 3rd party backends 12 | - Refactoring to be completed by end of the year so that all GPU backends can be individual passes on Triton GPU IR instead of being completely out of tree. The goal is for users to get other GPUs besides CUDA when they install Triton. Non-GPU Triton IR is expected to stay as is. 13 | 2. Front end refactoring (AMD) 14 | - Will work with Phil for AMD-related refactoring. Will share more details in the next meetup about where AMD has diverged from Triton GPU IR and in the code flow. 15 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton-Shared for GPU development? 16 | - Can look at it on a case-by-case basis. 17 | -------------------------------------------------------------------------------- /docs/meetups/dev-meetup-2023.md: -------------------------------------------------------------------------------- 1 | The conference slides are available [here](https://drive.google.com/drive/folders/1yDFc4ElNN_GGhWDdMlM4wcm5uFEFFVQk?usp=sharing) 2 | 3 | The conference videos will be available [here](https://youtube.com/playlist?list=PLc_vA1r0qoiRZfUC3o4_yjj0FtWvodKAz&feature=shared) when ready. 4 | 5 | # Triton Developer Conference 6 | The Triton Developer Conference was held in a hybrid mode at the Microsoft Silicon Valley Campus in Mountain View, California. It took place on September 20th from 10am to 4pm, followed by a reception till 5:30 pm.
7 | 8 | Agenda for the conference: 9 | 10 | |Time |Title |Speaker 11 | |--------|-------|-------| 12 | |10:00 AM|Welcome|Kevin Scott (Microsoft)| 13 | |10:20 AM|The Triton Compiler: Past, Present and Future|Phil Tillet (OpenAI)| 14 | |11:00 AM|**Break**|| 15 | |11:20 AM|Hopper support in Triton|Gustav Zhu (Nvidia)| 16 | |11:40 AM|Bringing Triton to AMD GPUs|Jason Furmanek, Lixun Zhang (AMD)| 17 | |12:00 PM|Intel XPU Backend for Triton|Eikan Wang (Intel)| 18 | |12:20 PM|Vectorization of Triton Kernels for Qualcomm Hexagon Backend|Javed Absar (Qualcomm)| 19 | |12:30 PM|**Lunch**|| 20 | |1:40 PM |Triton for MTIA|Roman Levenstein et al, (Meta)| 21 | |2:00 PM |Using Triton IR for high-performance fusions in XLA|George Karpenkov (Google)| 22 | |2:20 PM |Triton for All: Triton as a device-independent language|Ian Bearman (Microsoft)| 23 | |2:40 PM|**Break**|| 24 | |3:00 PM|PyTorch 2.0 and TorchInductor|Jason Ansel, Horace He (Meta)| 25 | |3:20 PM|Pallas: A JAX Kernel Language|Sharad Vikram (Google)| 26 | |3:40 PM|Writing Grouped GEMMs in Triton|Vinod Grover (Nvidia)| 27 | |4:00 PM|**Reception**|| 28 | -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/cuda-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/cuda-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/triton-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/triton-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/halide-iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/halide-iteration.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/polyhedral-iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/polyhedral-iteration.png -------------------------------------------------------------------------------- /docs/python-api/triton.rst: -------------------------------------------------------------------------------- 1 | triton 2 | ====== 3 | 4 | .. currentmodule:: triton 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | jit 11 | autotune 12 | heuristics 13 | Config 14 | -------------------------------------------------------------------------------- /docs/python-api/triton.testing.rst: -------------------------------------------------------------------------------- 1 | triton.testing 2 | ============== 3 | 4 | .. currentmodule:: triton.testing 5 | 6 | .. 
autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | Benchmark 11 | do_bench 12 | do_bench_cudagraph 13 | perf_report 14 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(triton) 2 | -------------------------------------------------------------------------------- /include/triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Target) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonCPUToLLVM) 2 | add_subdirectory(TritonGPUToLLVM) 3 | add_subdirectory(TritonToTritonCPU) 4 | add_subdirectory(TritonToTritonGPU) 5 | -------------------------------------------------------------------------------- /include/triton/Conversion/MLIRTypes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_MLIR_TYPES_H 2 | #define TRITON_CONVERSION_MLIR_TYPES_H 3 | 4 | #include "mlir/Transforms/DialectConversion.h" 5 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 6 | 7 | // This file redefines some common MLIR types for easy usage. 8 | namespace mlir { 9 | namespace triton { 10 | namespace type { 11 | 12 | // Integer types 13 | inline Type i32Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 32); } 14 | inline Type i16Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 16); } 15 | inline Type i8Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 8); } 16 | inline Type u32Ty(MLIRContext *ctx) { 17 | return IntegerType::get(ctx, 32, IntegerType::Unsigned); 18 | } 19 | inline Type u1Ty(MLIRContext *ctx) { 20 | return IntegerType::get(ctx, 1, IntegerType::Unsigned); 21 | } 22 | 23 | // Float types 24 | inline Type f16Ty(MLIRContext *ctx) { return FloatType::getF16(ctx); } 25 | inline Type f32Ty(MLIRContext *ctx) { return FloatType::getF32(ctx); } 26 | inline Type f64Ty(MLIRContext *ctx) { return FloatType::getF64(ctx); } 27 | inline Type bf16Ty(MLIRContext *ctx) { return FloatType::getBF16(ctx); } 28 | 29 | inline bool isFloat(Type type) { 30 | return type.isF32() || type.isF64() || type.isF16() || type.isF128(); 31 | } 32 | 33 | inline bool isInt(Type type) { return type.isIntOrFloat() && !isFloat(type); } 34 | 35 | } // namespace type 36 | } // namespace triton 37 | } // namespace mlir 38 | 39 | #endif // TRITON_CONVERSION_MLIR_TYPES_H 40 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUToLLVM) 3 | add_public_tablegen_target(TritonCPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H 2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include 
"mlir/Transforms/DialectConversion.h" 7 | 8 | #include 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc" 19 | 20 | std::unique_ptr> createConvertTritonCPUToLLVMPass(); 21 | 22 | #define GEN_PASS_REGISTRATION 23 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc" 24 | 25 | } // namespace triton 26 | 27 | } // namespace mlir 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_PASSES 2 | #define TRITONCPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonCPUToLLVM : Pass<"convert-triton-cpu-to-llvm", "mlir::ModuleOp"> { 7 | let summary = "Convert TritonCPU to LLVM"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonCPUToLLVMPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::LLVM::LLVMDialect", 15 | "mlir::math::MathDialect", 16 | "mlir::scf::SCFDialect", 17 | "mlir::tensor::TensorDialect", 18 | "mlir::triton::cpu::TritonCPUDialect", 19 | "mlir::triton::TritonDialect"]; 20 | 21 | let options = [ 22 | ]; 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/PatternTritonCPUOpToLLVM.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 6 | 7 | using namespace mlir; 8 | using namespace mlir::triton; 9 | 10 | namespace mlir { 11 | namespace triton { 12 | // Some populate* functions have name collisions with the ones for GPUs. 
13 | namespace cpu { 14 | 15 | constexpr int patternBenefitDefault = 1; 16 | constexpr int patternBenefitPrioritizeOverLLVMConversions = 10; 17 | constexpr int patternBenefitClampOptimizedPattern = 20; 18 | constexpr int patternBenefitConvertLayoutOptimizedPattern = 20; 19 | 20 | void populateControlFlowOpToLLVMPattern(LLVMTypeConverter &typeConverter, 21 | RewritePatternSet &patterns, 22 | PatternBenefit benefit); 23 | 24 | void populateFuncOpConversionPattern(LLVMTypeConverter &typeConverter, 25 | RewritePatternSet &patterns, 26 | PatternBenefit benefit); 27 | 28 | void populatePrintOpToLLVMPattern(LLVMTypeConverter &typeConverter, 29 | RewritePatternSet &patterns, 30 | PatternBenefit benefit); 31 | 32 | } // namespace cpu 33 | } // namespace triton 34 | } // namespace mlir 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H 2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Conversion/MLIRTypes.h" 7 | #include "triton/Dialect/TritonCPU/IR/Types.h" 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton; 11 | 12 | class TritonCPUToLLVMTypeConverter : public LLVMTypeConverter { 13 | public: 14 | using TypeConverter::convertType; 15 | 16 | TritonCPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, 17 | const DataLayoutAnalysis *analysis = nullptr); 18 | 19 | Type convertTritonPointerType(triton::PointerType type); 20 | }; 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/Pattern.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Analysis/Utility.h" 7 | #include "triton/Conversion/MLIRTypes.h" 8 | #include "triton/Dialect/Triton/IR/Utility.h" 9 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 10 | #include "llvm/Support/ErrorHandling.h" 11 | 12 | using namespace mlir; 13 | using namespace mlir::triton; 14 | 15 | namespace mlir { 16 | namespace LLVM { 17 | 18 | // TODO: Not sure we need this for CPU backends. 
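// Entry-point ("kernel") functions are the ones with public visibility.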
19 | inline bool isKernel(FunctionOpInterface funcOp) { 20 | return funcOp.getVisibility() == SymbolTable::Visibility::Public; 21 | } 22 | 23 | } // namespace LLVM 24 | } // namespace mlir 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 3 | 4 | #include "mlir/IR/Value.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | #include "llvm/ADT/SmallVector.h" 7 | #include "llvm/ADT/StringExtras.h" 8 | #include "llvm/ADT/StringRef.h" 9 | #include <memory> 10 | #include <string> 11 | 12 | namespace mlir { 13 | class ConversionPatternRewriter; 14 | class Location; 15 | 16 | namespace triton { 17 | using llvm::StringRef; 18 | 19 | inline std::string strJoin(llvm::ArrayRef<llvm::StringRef> strs, 20 | llvm::StringRef delimiter) { 21 | return llvm::join(strs.begin(), strs.end(), delimiter); 22 | } 23 | 24 | } // namespace triton 25 | } // namespace mlir 26 | 27 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 28 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonGPUToLLVM) 3 | add_public_tablegen_target(TritonGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | 8 | #include <memory> 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template <typename T> class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 19 | 20 | namespace gpu { 21 | std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass(); 22 | 23 | } // namespace gpu 24 | 25 | #define GEN_PASS_REGISTRATION 26 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 27 | 28 | } // namespace triton 29 | 30 | } // namespace mlir 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCOMMONGPU_CONVERSION_PASSES 2 | #define TRITONCOMMONGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> { 7 | let summary = "Add metadata for shared memory allocation"; 8 | let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()"; 9 | } 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Patterns.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H 2 | #define
TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H 3 | 4 | #include <functional> 5 | 6 | namespace mlir { 7 | class ModuleOp; 8 | class RankedTensorType; 9 | 10 | namespace triton::gpu { 11 | 12 | /// Replaces `blocked -> dot_op` with `blocked -> shared -> dot_op` in the given 13 | /// |module| op because the codegen doesn't handle `blocked -> dot_op` directly. 14 | void decomposeBlockedToDotLayoutConversion(ModuleOp module); 15 | 16 | /// Replaces `splat -> shared` with `splat -> blocked -> shared` in the given 17 | /// |module| op. 18 | void decomposeSplatOpToSharedLayoutConversion(ModuleOp module); 19 | 20 | /// Replaces `mma/mfma -> dot_op` with `mma/mfma -> blocked -> dot_op` in the 21 | /// given |module| op, but bypasses the decomposition if |shortcutFn| returns 22 | /// true. 23 | using ShortcutFn = std::function<bool(RankedTensorType, RankedTensorType)>; 24 | template <typename TensorCoreEncodingAttr> 25 | void decomposeTensorCoreToDotLayoutConversion(ModuleOp module, 26 | ShortcutFn shortcutFn); 27 | 28 | } // namespace triton::gpu 29 | 30 | } // namespace mlir 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H 2 | #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Conversion/MLIRTypes.h" 7 | #include "triton/Dialect/TritonGPU/IR/Types.h" 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton; 11 | 12 | class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter { 13 | public: 14 | using TypeConverter::convertType; 15 | 16 | TritonGPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, 17 | const DataLayoutAnalysis *analysis = nullptr); 18 | 19 | Type getElementTypeForStruct(TensorOrMemDesc type); 20 | Type convertTritonPointerType(triton::PointerType type); 21 | Type convertTritonTensorType(RankedTensorType type); 22 | Type convertMemDescType(MemDescType type); 23 | Type convertAsyncToken(triton::gpu::AsyncTokenType type); 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonCPU) 3 | add_public_tablegen_target(TritonConversionToCPUPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_CPU_PASSES_H 2 | #define TRITON_CONVERSION_TO_CPU_PASSES_H 3 | 4 | #include "triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | #define GEN_PASS_REGISTRATION 10 | #include "triton/Conversion/TritonToTritonCPU/Passes.h.inc" 11 | 12 | } // namespace triton 13 | } // namespace mlir 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_CPU_PASSES 2 | #define TRITON_CONVERSION_TO_CPU_PASSES
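// The pass definition below is expanded into pass declarations and
// registration hooks (Passes.h.inc) by the -gen-pass-decls tablegen rule in
// this directory's CMakeLists.txt.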
3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonToTritonCPU: Pass<"convert-triton-to-tritoncpu", "mlir::ModuleOp"> { 7 | let summary = "Convert Triton to TritonCPU"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonToTritonCPUPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::math::MathDialect", 15 | "mlir::scf::SCFDialect", 16 | "mlir::triton::cpu::TritonCPUDialect", 17 | "mlir::triton::TritonDialect"]; 18 | 19 | let options = [ 20 | ]; 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H 2 | #define TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H 3 | 4 | #include 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template class OperationPass; 10 | 11 | namespace triton { 12 | 13 | std::unique_ptr> createConvertTritonToTritonCPUPass(); 14 | 15 | } // namespace triton 16 | } // namespace mlir 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonGPU) 3 | add_public_tablegen_target(TritonConversionToGPUPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES_H 2 | #define TRITON_CONVERSION_TO_GPU_PASSES_H 3 | 4 | #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | #define GEN_PASS_REGISTRATION 10 | #include "triton/Conversion/TritonToTritonGPU/Passes.h.inc" 11 | 12 | } // namespace triton 13 | } // namespace mlir 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES 2 | #define TRITON_CONVERSION_TO_GPU_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonToTritonGPU: Pass<"convert-triton-to-tritongpu", "mlir::ModuleOp"> { 7 | let summary = "Convert Triton to TritonGPU"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonToTritonGPUPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::math::MathDialect", 15 | // TODO: Does this pass depend on SCF? 
16 | "mlir::scf::SCFDialect", 17 | "mlir::triton::TritonDialect", 18 | "mlir::triton::gpu::TritonGPUDialect"]; 19 | 20 | let options = [ 21 | Option<"numWarps", "num-warps", 22 | "int32_t", /*default*/"4", 23 | "number of warps">, 24 | 25 | Option<"threadsPerWarp", "threads-per-warp", 26 | "int32_t", /*default*/"32", 27 | "number of threads per warp">, 28 | Option<"numCTAs", "num-ctas", 29 | "int32_t", /*default*/"1", 30 | "number of ctas in a cga">, 31 | Option<"computeCapability", "compute-capability", 32 | "int32_t", /*default*/"80", 33 | "compute capability"> 34 | ]; 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 2 | #define TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 3 | 4 | #include 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template class OperationPass; 10 | 11 | namespace triton { 12 | 13 | constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps"; 14 | constexpr static char AttrNumCTAsName[] = "triton_gpu.num-ctas"; 15 | constexpr static char AttrComputeCapabilityName[] = 16 | "triton_gpu.compute-capability"; 17 | 18 | constexpr static char AttrNumThreadsPerWarp[] = "triton_gpu.threads-per-warp"; 19 | 20 | // Create the pass with numWarps passed from cl::opt. 21 | std::unique_ptr> createConvertTritonToTritonGPUPass(); 22 | 23 | // Create the pass with numWarps set explicitly. 24 | std::unique_ptr> 25 | createConvertTritonToTritonGPUPass(int numWarps, int threadsPerWarp = 32, 26 | int numCTAs = 1, int computeCapability = 80); 27 | 28 | } // namespace triton 29 | } // namespace mlir 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /include/triton/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | add_subdirectory(NVGPU) 6 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | #add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS NVGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu) 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) 7 | mlir_tablegen(Ops.h.inc -gen-op-decls) 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 9 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 10 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 11 | add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc) 12 | add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc) 13 | add_public_tablegen_target(NVGPUTableGen) 14 | 15 | set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td) 16 | mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls) 17 | mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs) 18 | 
add_public_tablegen_target(NVGPUAttrDefsIncGen) 19 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/NVGPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef NVGPU_ATTRDEFS 23 | #define NVGPU_ATTRDEFS 24 | 25 | include "triton/Dialect/NVGPU/IR/NVGPUDialect.td" 26 | include "mlir/IR/AttrTypeBase.td" 27 | 28 | class NVGPU_Attr<string name, list<Trait> traits = [], 29 | string baseCppClass = "::mlir::Attribute"> 30 | : AttrDef<NVGPU_Dialect, name, traits, baseCppClass> { 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/NVGPUDialect.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef NVGPU_DIALECT 23 | #define NVGPU_DIALECT 24 | 25 | include "mlir/IR/OpBase.td" 26 | 27 | def NVGPU_Dialect : Dialect { 28 | let name = "nvgpu"; 29 | let cppNamespace = "::mlir::triton::nvgpu"; 30 | 31 | let description = [{ 32 | NVGPU Dialect.
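    This dialect holds lightweight NVIDIA-specific operations used late in the lowering pipeline; they map closely onto PTX instructions, which is why the LLVM dialect is the only dependency declared below.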
33 | }]; 34 | 35 | let dependentDialects = [ 36 | "mlir::LLVM::LLVMDialect" 37 | ]; 38 | } 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonOps.td) 4 | mlir_tablegen(Ops.h.inc -gen-op-decls) 5 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 6 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 7 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 8 | add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc) 9 | 10 | set(LLVM_TARGET_DEFINITIONS TritonDialect.td) 11 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls) 12 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) 13 | add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc) 14 | 15 | set(LLVM_TARGET_DEFINITIONS TritonTypes.td) 16 | mlir_tablegen(Types.h.inc -gen-typedef-decls) 17 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs) 18 | 19 | set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) 20 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) 21 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) 22 | 23 | set(LLVM_TARGET_DEFINITIONS TritonTypeInterfaces.td) 24 | mlir_tablegen(TritonTypeInterfaces.h.inc -gen-type-interface-decls) 25 | mlir_tablegen(TritonTypeInterfaces.cpp.inc -gen-type-interface-defs) 26 | 27 | add_public_tablegen_target(TritonTableGen) 28 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Interfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_INTERFACES_H_ 2 | #define TRITON_IR_INTERFACES_H_ 3 | 4 | #include "mlir/IR/OpDefinition.h" 5 | 6 | #define GET_TYPEDEF_CLASSES 7 | #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" 8 | 9 | #endif // TRITON_IR_INTERFACES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT 2 | #define TRITON_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def Triton_Dialect : Dialect { 7 | let name = "tt"; 8 | 9 | let cppNamespace = "::mlir::triton"; 10 | 11 | let summary = "The Triton IR in MLIR"; 12 | 13 | let description = [{ 14 | Triton Dialect. 15 | 16 | Dependent Dialects: 17 | * Arith: 18 | * addf, addi, andi, cmpf, cmpi, divf, fptosi, ... 19 | * Math: 20 | * exp, sin, cos, log, ...
21 | * StructuredControlFlow: 22 | * for, if, while, yield, condition 23 | * ControlFlow: 24 | * br, cond_br 25 | }]; 26 | 27 | let dependentDialects = [ 28 | "arith::ArithDialect", 29 | "math::MathDialect", 30 | "scf::SCFDialect", 31 | "cf::ControlFlowDialect" 32 | ]; 33 | 34 | let extraClassDeclaration = [{ 35 | void registerTypes(); 36 | }]; 37 | 38 | let hasConstantMaterializer = 1; 39 | let useDefaultTypePrinterParser = 1; 40 | let usePropertiesForAttributes = 1; 41 | } 42 | 43 | include "triton/Dialect/Triton/IR/TritonTypes.td" 44 | 45 | 46 | #endif // TRITON_DIALECT 47 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonInterfaces.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_INTERFACES 2 | #define TRITON_INTERFACES 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">; 7 | def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">; 8 | def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">; 9 | def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">; 10 | def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">; 11 | def SameLoadStoreOperandsAndResultShape : NativeOpTrait<"SameLoadStoreOperandsAndResultShape">; 12 | def SameLoadStoreOperandsEncoding : NativeOpTrait<"SameLoadStoreOperandsEncoding">; 13 | def SameLoadStoreOperandsAndResultEncoding : NativeOpTrait<"SameLoadStoreOperandsAndResultEncoding">; 14 | 15 | #endif // TRITON_INTERFACES 16 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TYPE_INTERFACES 2 | #define TRITON_TYPE_INTERFACES 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | // Interface dynamically attached to RankedTensorType and MemDescType. 
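// This interface lets lowering code query shape, rank, element type, and
// encoding uniformly, e.g. cast<TensorOrMemDesc>(type).getEncoding(), without
// first dispatching on the concrete type.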
7 | def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { 8 | let cppNamespace = "::mlir"; 9 | let methods = [ 10 | InterfaceMethod<"Returns the encoding of the tensor or memory descriptor", 11 | "mlir::Attribute", "getEncoding", (ins)>, 12 | InterfaceMethod<"Returns element type", 13 | "mlir::Type", "getElementType", (ins)>, 14 | InterfaceMethod<"Returns the type shape", 15 | "llvm::ArrayRef<int64_t>", "getShape", (ins)>, 16 | InterfaceMethod<"Returns the tensor or buffer rank", 17 | "int64_t", "getRank", (ins)>, 18 | InterfaceMethod<"Returns the element type bit width", 19 | "int64_t", "getElementTypeBitWidth", (ins)>, 20 | 21 | ]; 22 | } 23 | 24 | #endif // TRITON_TYPE_INTERFACES 25 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_TYPES_H_ 2 | #define TRITON_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/BuiltinTypes.h" 5 | #include "mlir/IR/TypeSupport.h" 6 | #include "mlir/IR/Types.h" 7 | 8 | #define GET_TYPEDEF_CLASSES 9 | #include "triton/Dialect/Triton/IR/Types.h.inc" 10 | 11 | #include "triton/Dialect/Triton/IR/TritonTypeInterfaces.h.inc" 12 | 13 | namespace mlir { 14 | 15 | namespace triton { 16 | 17 | bool isTensorPointerType(Type type); 18 | 19 | bool isTensorOrTensorPointerType(Type type); 20 | 21 | unsigned getPointeeBitWidth(Type type); 22 | 23 | Type getPointeeType(Type type); 24 | 25 | Type getPointerType(Type type); 26 | 27 | Type getElementTypeOfTensorPointerType(Type type); 28 | 29 | Type getI1SameShape(Type type); 30 | 31 | Type getI32SameShape(Type type); 32 | 33 | Type getPointerTypeSameShape(Type type); 34 | 35 | } // namespace triton 36 | 37 | } // namespace mlir 38 | 39 | #endif // TRITON_IR_TYPES_H_ 40 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton) 3 | add_public_tablegen_target(TritonTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | std::unique_ptr<Pass> createCombineOpsPass(); 10 | 11 | std::unique_ptr<Pass> createReorderBroadcastPass(); 12 | std::unique_ptr<Pass> createRewriteTensorPointerPass(); 13 | 14 | } // namespace triton 15 | 16 | #define GEN_PASS_REGISTRATION 17 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc" 18 | 19 | } // namespace mlir 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 2 | #define
TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h" 5 | 6 | #define GET_ATTRDEF_CLASSES 7 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.h.inc" 8 | 9 | #endif // TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonCPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_cpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_cpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_cpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_cpu) 10 | add_mlir_doc(TritonCPUDialect TritonCPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonCPUOps TritonCPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonCPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonCPUAttrDefs.td) 15 | mlir_tablegen(TritonCPUAttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(TritonCPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(TritonCPUAttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(TritonCPUAttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonCPUAttrDefsIncGen) 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Dialect.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 2 | #define TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 3 | 4 | #include "mlir/Dialect/Tensor/IR/Tensor.h" 5 | #include "mlir/IR/BuiltinOps.h" 6 | #include "mlir/IR/Dialect.h" 7 | 8 | // TritonCPU depends on Triton 9 | #include "triton/Dialect/Triton/IR/Dialect.h" 10 | #include "triton/Dialect/TritonCPU/IR/Attributes.h" 11 | #include "triton/Dialect/TritonCPU/IR/Dialect.h.inc" 12 | #include "triton/Dialect/TritonCPU/IR/Types.h" 13 | 14 | #define GET_OP_CLASSES 15 | #include "triton/Dialect/TritonCPU/IR/Ops.h.inc" 16 | 17 | #endif // TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 18 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_ATTRDEFS 2 | #define TRITONCPU_ATTRDEFS 3 | 4 | include "mlir/IR/AttrTypeBase.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 6 | include "triton/Dialect/Triton/IR/TritonInterfaces.td" 7 | 8 | //===----------------------------------------------------------------------===// 9 | // TritonCPU Attribute Definitions 10 | //===----------------------------------------------------------------------===// 11 | def TritonCPU_AttrTrait : AttrInterface<"TritonCPU_AttrTrait"> { 12 | let cppNamespace = "::mlir::triton::cpu"; 13 | } 14 | 15 | class TritonCPU_Attr<string name, list<Trait> traits = [], 16 | Dialect dialect = TritonCPU_Dialect, 17 | string baseCppClass = "::mlir::Attribute"> 18 | : AttrDef<dialect, name, traits, baseCppClass> { 19 | 20 | let description = [{ 21 | WIP...
22 | }]; 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_DIALECT 2 | #define TRITONCPU_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TritonCPU_Dialect : Dialect { 7 | let name = "triton_cpu"; 8 | 9 | let cppNamespace = "::mlir::triton::cpu"; 10 | 11 | let hasOperationAttrVerify = 1; 12 | 13 | let description = [{ 14 | Triton CPU Dialect. 15 | }]; 16 | 17 | let dependentDialects = [ 18 | "triton::TritonDialect", 19 | "tensor::TensorDialect", 20 | ]; 21 | 22 | let extraClassDeclaration = [{ 23 | void registerTypes(); 24 | }]; 25 | 26 | let useDefaultTypePrinterParser = 1; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CPU_DIALECT_INTERFACES_H 2 | #define TRITON_CPU_DIALECT_INTERFACES_H 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrInterfaces.h.inc" 5 | 6 | #endif // TRITON_CPU_DIALECT_INTERFACES_H 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUOps.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_OPS 2 | #define TRITONCPU_OPS 3 | 4 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUTypes.td" 6 | include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td" 7 | include "mlir/Dialect/Arith/IR/ArithBase.td" 8 | include "triton/Dialect/Triton/IR/TritonTypes.td" 9 | include "triton/Dialect/Triton/IR/TritonAttrDefs.td" 10 | include "mlir/IR/OpBase.td" 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUTypes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_TYPES 2 | #define TRITONCPU_TYPES 3 | 4 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 5 | include "mlir/IR/AttrTypeBase.td" 6 | 7 | class TTC_TypeDef<string name, string _mnemonic, list<Trait> traits = []> 8 | : TypeDef<TritonCPU_Dialect, name, traits> { 9 | let mnemonic = _mnemonic; 10 | } 11 | 12 | def TTC_TokenType : TTC_TypeDef<"Token", "token"> { 13 | let parameters = (ins "int32_t":$type); 14 | 15 | let builders = [ 16 | TypeBuilder<(ins "unsigned":$type), [{ 17 | return $_get($_ctxt, type); 18 | }]> 19 | ]; 20 | 21 | let hasCustomAssemblyFormat = 1; 22 | 23 | let skipDefaultBuilders = 1; 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_IR_TYPES_H_ 2 | #define TRITONCPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/TypeSupport.h" 5 | #include "mlir/IR/Types.h" 6 | 7 | #define GET_TYPEDEF_CLASSES 8 | #include "triton/Dialect/TritonCPU/IR/Types.h.inc" 9 | 10 | #endif // TRITONCPU_IR_TYPES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc
-gen-pass-decls -name TritonCPU) 3 | add_public_tablegen_target(TritonCPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | namespace cpu {} // namespace cpu 9 | } // namespace triton 10 | 11 | /// Generate the code for registering passes. 12 | #define GEN_PASS_REGISTRATION 13 | #include "triton/Dialect/TritonCPU/Transforms/Passes.h.inc" 14 | 15 | } // namespace mlir 16 | #endif 17 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_PASSES 2 | #define TRITONCPU_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/TritonCPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonCPU dialect. 4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonCPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonCPUTypeConverter(MLIRContext *context); 17 | 18 | private: 19 | MLIRContext *context; 20 | }; 21 | 22 | class TritonCPUConversionTarget : public ConversionTarget { 23 | 24 | public: 25 | explicit TritonCPUConversionTarget(MLIRContext &ctx, 26 | TritonCPUTypeConverter &typeConverter); 27 | }; 28 | 29 | } // namespace mlir 30 | 31 | #endif // TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 32 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" 5 | 6 | #define GET_ATTRDEF_CLASSES 7 | #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc" 8 | 9 | #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu) 5 | 
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu) 10 | add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonGPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) 15 | mlir_tablegen(TritonGPUAttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(TritonGPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonGPUAttrDefsIncGen) 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_GPU_DIALECT_INTERFACES_H 2 | #define TRITON_GPU_DIALECT_INTERFACES_H 3 | 4 | #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.h.inc" 5 | 6 | #endif // TRITON_GPU_DIALECT_INTERFACES_H 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_TYPES 2 | #define TRITONGPU_TYPES 3 | 4 | include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" 5 | include "mlir/IR/AttrTypeBase.td" 6 | 7 | class TTG_TypeDef<string name, string _mnemonic, list<Trait> traits = []> 8 | : TypeDef<TritonGPU_Dialect, name, traits> { 9 | let mnemonic = _mnemonic; 10 | } 11 | 12 | def TTG_TokenType : TTG_TypeDef<"Token", "token"> { 13 | let parameters = (ins "int32_t":$type); 14 | 15 | let builders = [ 16 | TypeBuilder<(ins "unsigned":$type), [{ 17 | return $_get($_ctxt, type); 18 | }]> 19 | ]; 20 | 21 | let hasCustomAssemblyFormat = 1; 22 | 23 | let skipDefaultBuilders = 1; 24 | } 25 | 26 | def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", 27 | "async.token", []> { 28 | let summary = "async token type"; 29 | let description = [{ 30 | `ttg.async.token` is a type returned by an asynchronous operation. 31 | It is used to establish an SSA-based link between async operations 32 | and operations that group or synchronize the async operations.
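    For example, an asynchronous copy can return a token, and a later wait-style operation takes that token as an operand, which makes the ordering dependency explicit in SSA form.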
33 | }]; 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_IR_TYPES_H_ 2 | #define TRITONGPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/TypeSupport.h" 5 | #include "mlir/IR/Types.h" 6 | 7 | #define GET_TYPEDEF_CLASSES 8 | #include "triton/Dialect/TritonGPU/IR/Types.h.inc" 9 | 10 | #endif // TRITONGPU_IR_TYPES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU) 3 | add_public_tablegen_target(TritonGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" 6 | 7 | namespace mlir { 8 | namespace triton { 9 | namespace gpu { 10 | 11 | std::unique_ptr<Pass> createPipelinePass(int numStages = 3, int numWarps = 4, 12 | int numCTAs = 1, 13 | int computeCapability = 80); 14 | 15 | std::unique_ptr<Pass> createAccelerateMatmulPass(int computeCapability = 80); 16 | 17 | std::unique_ptr<Pass> createF32DotTCPass(); 18 | 19 | std::unique_ptr<Pass> createPrefetchPass(); 20 | 21 | std::unique_ptr<Pass> createCoalescePass(); 22 | 23 | std::unique_ptr<Pass> createReorderInstructionsPass(); 24 | 25 | std::unique_ptr<Pass> createReduceDataDuplicationPass(); 26 | 27 | std::unique_ptr<Pass> createRemoveLayoutConversionsPass(); 28 | 29 | std::unique_ptr<Pass> createVerifier(); 30 | 31 | std::unique_ptr<Pass> createOptimizeDotOperandsPass(); 32 | 33 | std::unique_ptr<Pass> createOptimizeThreadLocalityPass(); 34 | 35 | } // namespace gpu 36 | } // namespace triton 37 | 38 | /// Generate the code for registering passes. 39 | #define GEN_PASS_REGISTRATION 40 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 41 | 42 | } // namespace mlir 43 | #endif 44 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect.
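//
// TritonGPUTypeConverter attaches a default layout encoding to plain tensor
// types (derived from numWarps, threadsPerWarp, and numCTAs), and
// TritonGPUConversionTarget treats operations as legal once their tensor
// types carry such an encoding.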
4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_nvidia_gpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_nvidia_gpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_nvidia_gpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_nvidia_gpu) 10 | add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonNvidiaGPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td) 15 | mlir_tablegen(TritonNvidiaGPUAttrDefs.h.inc -gen-attrdef-decls) 16 | mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs) 17 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 18 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 19 | add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen) 20 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef TRITONNVIDIAGPU_ATTRDEFS 23 | #define TRITONNVIDIAGPU_ATTRDEFS 24 | 25 | include "mlir/IR/AttrTypeBase.td" 26 | include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td" 27 | include "triton/Dialect/Triton/IR/TritonInterfaces.td" 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files 6 | * (the "Software"), to deal in the Software without restriction, 7 | * including without limitation the rights to use, copy, modify, merge, 8 | * publish, distribute, sublicense, and/or sell copies of the Software, 9 | * and to permit persons to whom the Software is furnished to do so, 10 | * subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | #ifndef TRITONNVIDIAGPU_IR_TYPES_H_ 25 | #define TRITONNVIDIAGPU_IR_TYPES_H_ 26 | 27 | #include "mlir/IR/TypeSupport.h" 28 | #include "mlir/IR/Types.h" 29 | 30 | #define GET_TYPEDEF_CLASSES 31 | #include "triton/Dialect/TritonNvidiaGPU/IR/Types.h.inc" 32 | 33 | #endif // TRITONNVIDIAGPU_IR_TYPES_H_ 34 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonNvidiaGPU) 3 | add_public_tablegen_target(TritonNvidiaGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name LLVMIR) 3 | add_public_tablegen_target(LLVMIRIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVM_IR_PASSES_H 2 | #define TRITON_TARGET_LLVM_IR_PASSES_H 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | 8 | /// Create a pass to add DIScope 9 | std::unique_ptr<Pass> createLLVMDIScopePass(); 10 | 11 | /// Generate the code for registering conversion passes. 12 | #define GEN_PASS_REGISTRATION 13 | #include "triton/Target/LLVMIR/Passes.h.inc" 14 | 15 | } // namespace mlir 16 | 17 | #endif // TRITON_TARGET_LLVM_IR_PASSES_H 18 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVMIR_PASSES 2 | #define TRITON_TARGET_LLVMIR_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def LLVMDIScope: Pass<"enable-line-info", "mlir::ModuleOp"> { 7 | let summary = "Materialize LLVM line info"; 8 | let description = [{ 9 | This pass materializes line mapping information for LLVM IR dialect operations. 10 | }]; 11 | 12 | let constructor = "mlir::createLLVMDIScopePass()"; 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Tools/Sys/GetPlatform.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. 3 | * 4 | * This file is part of ISAAC. 5 | * 6 | * ISAAC is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version. 10 | * 11 | * This library is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * Lesser General Public License for more details.
15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef TDL_TOOLS_SYS_GETPLATFORM_HPP 23 | #define TDL_TOOLS_SYS_GETPLATFORM_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // inline bool _isROCM = false; 33 | // inline void setROCM() { _isROCM = true; } 34 | // inline bool isROCM() { return _isROCM; } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAnalysis 2 | AxisInfo.cpp 3 | Allocation.cpp 4 | Membar.cpp 5 | Alias.cpp 6 | Utility.cpp 7 | 8 | DEPENDS 9 | TritonTableGen 10 | TritonGPUAttrDefsIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRAnalysis 14 | MLIRLLVMDialect 15 | TritonIR 16 | TritonGPUIR 17 | TritonNvidiaGPUIR 18 | ) 19 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # add_subdirectory(codegen) 2 | add_subdirectory(Analysis) 3 | add_subdirectory(Conversion) 4 | add_subdirectory(Dialect) 5 | add_subdirectory(Target) 6 | -------------------------------------------------------------------------------- /lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonToTritonCPU) 2 | add_subdirectory(TritonToTritonGPU) 3 | add_subdirectory(TritonCPUToLLVM) 4 | add_subdirectory(TritonGPUToLLVM) 5 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUToLLVM 2 | ControlFlowOpToLLVM.cpp 3 | FuncOpToLLVM.cpp 4 | TypeConverter.cpp 5 | TritonCPUToLLVM.cpp 6 | 7 | DEPENDS 8 | TritonCPUConversionPassIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRPass 13 | TritonAnalysis 14 | TritonIR 15 | TritonCPUIR 16 | TritonCPUTransforms 17 | ) 18 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/ControlFlowOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonCPUToLLVM/PatternTritonCPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonCPUToLLVM/Utility.h" 3 | #include "llvm/Support/ErrorHandling.h" 4 | 5 | namespace { 6 | 7 | using namespace mlir; 8 | using namespace mlir::triton; 9 | 10 | struct ReturnOpConversion : public ConvertOpToLLVMPattern<triton::ReturnOp> { 11 | using ConvertOpToLLVMPattern<triton::ReturnOp>::ConvertOpToLLVMPattern; 12 | 13 | LogicalResult 14 | matchAndRewrite(triton::ReturnOp op, OpAdaptor adaptor, 15 | ConversionPatternRewriter &rewriter) const override { 16 | auto funcOp = op->getParentOfType<LLVM::LLVMFuncOp>(); 17 | if (funcOp->hasAttr("cpu.kernel")) { 18 | if (op.getNumOperands() > 0) { 19 | return rewriter.notifyMatchFailure( 20 | op, "Kernel functions do not support return with operands"); 21 | } 22 | rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, TypeRange(), ValueRange(), 23 | op->getAttrs()); 24 | } else { 25 | llvm_unreachable("Not implemented"); 26 | } 27 | return success(); 28 | } 29 | }; 30 | 31 | } // namespace 32 | 33 | void
mlir::triton::cpu::populateControlFlowOpToLLVMPattern( 34 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 35 | PatternBenefit benefit) { 36 | patterns.add<ReturnOpConversion>(typeConverter, benefit); 37 | } 38 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/TypeConverter.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonCPUToLLVM/TypeConverter.h" 2 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 3 | #include "triton/Conversion/MLIRTypes.h" 4 | #include "llvm/Support/ErrorHandling.h" 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | TritonCPUToLLVMTypeConverter::TritonCPUToLLVMTypeConverter( 10 | MLIRContext *ctx, LowerToLLVMOptions &option, 11 | const DataLayoutAnalysis *analysis) 12 | : LLVMTypeConverter(ctx, option, analysis) { 13 | addConversion([&](triton::PointerType type) -> std::optional<Type> { 14 | return convertTritonPointerType(type); 15 | }); 16 | 17 | // Internally store bfloat16 as int16 18 | addConversion([&](BFloat16Type type) -> std::optional<Type> { 19 | return IntegerType::get(type.getContext(), 16); 20 | }); 21 | } 22 | 23 | Type TritonCPUToLLVMTypeConverter::convertTritonPointerType( 24 | triton::PointerType type) { 25 | auto ctx = type.getContext(); 26 | auto pointeeType = type.getPointeeType(); 27 | if (pointeeType.isa<RankedTensorType>()) { 28 | llvm_unreachable("Not implemented"); 29 | } 30 | return LLVM::LLVMPointerType::get(ctx, type.getAddressSpace()); 31 | } 32 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp 3 | DotOpToLLVM/FMA.cpp 4 | TypeConverter.cpp 5 | Utility.cpp 6 | ElementwiseOpToLLVM.cpp 7 | MemoryOpToLLVM.cpp 8 | AssertOpToLLVM.cpp 9 | ViewOpToLLVM.cpp 10 | MakeRangeOpToLLVM.cpp 11 | HistogramOpToLLVM.cpp 12 | AllocateSharedMemory.cpp 13 | ReduceOpToLLVM.cpp 14 | ScanOpToLLVM.cpp 15 | ConvertLayoutOpToLLVM.cpp 16 | ControlFlowOpToLLVM.cpp 17 | FuncOpToLLVM.cpp 18 | SPMDOpToLLVM.cpp 19 | DecomposeUnsupportedConversions.cpp 20 | PrintOpToLLVM.cpp 21 | 22 | DEPENDS 23 | TritonGPUConversionPassIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | MLIRIR 27 | MLIRPass 28 | MLIRGPUDialect 29 | MLIRGPUToNVVMTransforms 30 | MLIRGPUToROCDLTransforms 31 | MLIRGPUTransforms 32 | TritonAnalysis 33 | TritonIR 34 | TritonGPUIR 35 | TritonGPUTransforms 36 | TritonNvidiaGPUTransforms 37 | NVGPUIR 38 | ) 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetProgramIdOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetProgramIdOp> { 11 | explicit GetProgramIdOpConversion(LLVMTypeConverter &typeConverter, 12 | const TargetInfoBase &targetInfo, 13 | PatternBenefit benefit = 1) 14 | : ConvertOpToLLVMPattern<triton::GetProgramIdOp>(typeConverter, benefit), 15 | targetInfo(targetInfo) {} 16 | 17 | LogicalResult 18 | matchAndRewrite(triton::GetProgramIdOp op, OpAdaptor adaptor, 19 | ConversionPatternRewriter &rewriter) const override { 20 | Value programId =
22 | 23 | Type TritonCPUToLLVMTypeConverter::convertTritonPointerType( 24 | triton::PointerType type) { 25 | auto ctx = type.getContext(); 26 | auto pointeeType = type.getPointeeType(); 27 | if (pointeeType.isa<RankedTensorType>()) { 28 | llvm_unreachable("Not implemented"); 29 | } 30 | return LLVM::LLVMPointerType::get(ctx, type.getAddressSpace()); 31 | } 32 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp 3 | DotOpToLLVM/FMA.cpp 4 | TypeConverter.cpp 5 | Utility.cpp 6 | ElementwiseOpToLLVM.cpp 7 | MemoryOpToLLVM.cpp 8 | AssertOpToLLVM.cpp 9 | ViewOpToLLVM.cpp 10 | MakeRangeOpToLLVM.cpp 11 | HistogramOpToLLVM.cpp 12 | AllocateSharedMemory.cpp 13 | ReduceOpToLLVM.cpp 14 | ScanOpToLLVM.cpp 15 | ConvertLayoutOpToLLVM.cpp 16 | ControlFlowOpToLLVM.cpp 17 | FuncOpToLLVM.cpp 18 | SPMDOpToLLVM.cpp 19 | DecomposeUnsupportedConversions.cpp 20 | PrintOpToLLVM.cpp 21 | 22 | DEPENDS 23 | TritonGPUConversionPassIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | MLIRIR 27 | MLIRPass 28 | MLIRGPUDialect 29 | MLIRGPUToNVVMTransforms 30 | MLIRGPUToROCDLTransforms 31 | MLIRGPUTransforms 32 | TritonAnalysis 33 | TritonIR 34 | TritonGPUIR 35 | TritonGPUTransforms 36 | TritonNvidiaGPUTransforms 37 | NVGPUIR 38 | ) 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetProgramIdOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetProgramIdOp> { 11 | explicit GetProgramIdOpConversion(LLVMTypeConverter &typeConverter, 12 | const TargetInfoBase &targetInfo, 13 | PatternBenefit benefit = 1) 14 | : ConvertOpToLLVMPattern<triton::GetProgramIdOp>(typeConverter, benefit), 15 | targetInfo(targetInfo) {} 16 | 17 | LogicalResult 18 | matchAndRewrite(triton::GetProgramIdOp op, OpAdaptor adaptor, 19 | ConversionPatternRewriter &rewriter) const override { 20 | Value programId = targetInfo.programId(rewriter, op->getLoc(), 21 | op->getParentOfType<ModuleOp>(), 22 | op.getAxisAsInt()); 23 | rewriter.replaceOp(op, programId); 24 | return success(); 25 | } 26 | 27 | private: 28 | const TargetInfoBase &targetInfo; 29 | }; 30 | 31 | } // namespace 32 | 33 | void mlir::triton::populateSPMDOpToLLVMPattern(LLVMTypeConverter &typeConverter, 34 | RewritePatternSet &patterns, 35 | const TargetInfoBase &targetInfo, 36 | PatternBenefit benefit) { 37 | patterns.add<GetProgramIdOpConversion>(typeConverter, targetInfo, benefit); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonCPU 2 | TritonCPUConversion.cpp 3 | TritonToTritonCPUPass.cpp 4 | 5 | DEPENDS 6 | TritonConversionToCPUPassIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRPass 11 | MLIRTransforms 12 | TritonIR 13 | TritonCPUIR 14 | TritonCPUTransforms 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h" 2 | 3 | #include "mlir/Dialect/Arith/IR/Arith.h" 4 | #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" 5 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" 6 | #include "mlir/Dialect/Index/IR/IndexDialect.h" 7 | #include "mlir/Pass/Pass.h" 8 | #include "mlir/Transforms/DialectConversion.h" 9 | #include "triton/Analysis/Utility.h" 10 | #include "triton/Dialect/Triton/IR/Dialect.h" 11 | #include "triton/Dialect/Triton/IR/Utility.h" 12 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 13 | #include "triton/Dialect/TritonCPU/Transforms/TritonCPUConversion.h" 14 | #include "llvm/ADT/APSInt.h" 15 | #include <memory> 16 | 17 | #define GEN_PASS_CLASSES 18 | #include "triton/Conversion/TritonToTritonCPU/Passes.h.inc" 19 | 20 | namespace { 21 | 22 | using namespace mlir; 23 | using namespace mlir::triton; 24 | using namespace mlir::triton::cpu; 25 | 26 | class ConvertTritonToTritonCPU 27 | : public ConvertTritonToTritonCPUBase<ConvertTritonToTritonCPU> { 28 | public: 29 | ConvertTritonToTritonCPU() = default; 30 | 31 | void runOnOperation() override { 32 | // TODO:
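// A minimal sketch of what this TODO will grow into, following the usual
// MLIR dialect-conversion recipe. The names TritonCPUTypeConverter and
// TritonCPUConversionTarget are assumptions modeled on the GPU counterpart
// (TritonGPUTypeConverter / TritonGPUConversionTarget), not implemented API:
//
//   MLIRContext *context = &getContext();
//   ModuleOp mod = getOperation();
//   TritonCPUTypeConverter typeConverter(context);
//   TritonCPUConversionTarget target(*context, typeConverter);
//   RewritePatternSet patterns(context);
//   // populate patterns for arith/math/control-flow/Triton ops here
//   if (failed(applyPartialConversion(mod, target, std::move(patterns))))
//     return signalPassFailure();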
33 | } 34 | }; 35 | 36 | } // namespace 37 | 38 | std::unique_ptr<OperationPass<ModuleOp>> 39 | mlir::triton::createConvertTritonToTritonCPUPass() { 40 | return std::make_unique<::ConvertTritonToTritonCPU>(); 41 | } 42 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonGPU 2 | TritonGPUConversion.cpp 3 | TritonToTritonGPUPass.cpp 4 | 5 | DEPENDS 6 | TritonConversionToGPUPassIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRPass 11 | MLIRTransforms 12 | TritonIR 13 | TritonGPUIR 14 | TritonGPUTransforms 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | add_subdirectory(NVGPU) 6 | -------------------------------------------------------------------------------- /lib/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /lib/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUIR 2 | Dialect.cpp 3 | 4 | DEPENDS 5 | NVGPUTableGen 6 | NVGPUAttrDefsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRLLVMDialect 10 | ) 11 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonIR 2 | Dialect.cpp 3 | Ops.cpp 4 | Types.cpp 5 | Traits.cpp 6 | 7 | DEPENDS 8 | TritonTableGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRArithDialect 13 | MLIRMathDialect 14 | MLIRSCFDialect 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Combine.td) 2 | mlir_tablegen(TritonCombine.inc -gen-rewriters) 3 | add_public_tablegen_target(TritonCombineIncGen) 4 | 5 | add_triton_library(TritonTransforms 6 | Combine.cpp 7 | ReorderBroadcast.cpp 8 | RewriteTensorPointer.cpp 9 | 10 | DEPENDS 11 | TritonTransformsIncGen 12 | TritonCombineIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRPass 16 | MLIRTransformUtils 17 | TritonIR 18 | ) 19 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUIR 2 | Dialect.cpp 3 | Types.cpp 4 | 5 | DEPENDS 6 | TritonCPUTableGen 7 | TritonCPUAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | TritonIR 11 | ) 12 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Dialect.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/Triton/IR/Dialect.h" 2 | 3 | #include <numeric> 4 | 5 | #include "mlir/IR/DialectImplementation.h" 6 | #include "mlir/IR/OpImplementation.h" 7 | #include "triton/Analysis/Utility.h" 8 | #include "triton/Dialect/Triton/IR/Utility.h" 9 | #include "triton/Dialect/TritonCPU/IR/Dialect.cpp.inc" 10 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 11 | #include "triton/Tools/Sys/GetEnv.hpp" 12 | #include "llvm/ADT/TypeSwitch.h" 13 | 14 | using namespace mlir; 15 | using namespace mlir::triton::cpu; 16 | 17 | //===----------------------------------------------------------------------===// 18 | // Attribute methods 19 | //===----------------------------------------------------------------------===// 20 | #define GET_ATTRDEF_CLASSES 21 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.cpp.inc" 22 | 23 | void TritonCPUDialect::initialize() { 24 | registerTypes();
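// The addAttributes<>/addOperations<> calls below splice in the attribute
// and op class lists that MLIR TableGen emits into the generated
// TritonCPUAttrDefs.cpp.inc / Ops.cpp.inc files; the GET_*_LIST macros
// select the list section of those generated files.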
25 | 26 | addAttributes< 27 | #define GET_ATTRDEF_LIST 28 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.cpp.inc" 29 | >(); 30 | addOperations< 31 | #define GET_OP_LIST 32 | #include "triton/Dialect/TritonCPU/IR/Ops.cpp.inc" 33 | #include "triton/Dialect/TritonCPU/IR/OpsEnums.cpp.inc" 34 | >(); 35 | } 36 | 37 | // verify TritonCPU ops 38 | LogicalResult TritonCPUDialect::verifyOperationAttribute(Operation *op, 39 | NamedAttribute attr) { 40 | // TODO: fill this. 41 | return success(); 42 | } 43 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Types.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonCPU/IR/Types.h" 2 | #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` 3 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 4 | #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton::cpu; 8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 11 | 12 | Type TokenType::parse(AsmParser &parser) { 13 | if (parser.parseLess()) 14 | return Type(); 15 | 16 | int type = 1; 17 | if (parser.parseInteger(type)) 18 | return Type(); 19 | 20 | if (parser.parseGreater()) 21 | return Type(); 22 | 23 | return TokenType::get(parser.getContext(), type); 24 | } 25 | 26 | void TokenType::print(AsmPrinter &printer) const { 27 | printer << "<" << getType() << ">"; 28 | } 29 | 30 | //===----------------------------------------------------------------------===// 31 | // TritonCPU Dialect 32 | //===----------------------------------------------------------------------===// 33 | void ::mlir::triton::cpu::TritonCPUDialect::registerTypes() { 34 | addTypes< 35 | #define GET_TYPEDEF_LIST 36 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 37 | >(); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUTransforms 2 | 3 | DEPENDS 4 | TritonCPUTransformsIncGen 5 | 6 | LINK_LIBS PUBLIC 7 | MLIRTransforms 8 | MLIRTransformUtils 9 | TritonAnalysis 10 | TritonIR 11 | TritonCPUIR 12 | ) 13 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUIR 2 | Dialect.cpp 3 | Types.cpp 4 | 5 | DEPENDS 6 | TritonGPUTableGen 7 | TritonGPUAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRGPUDialect 11 | TritonIR 12 | ) 13 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/IR/Types.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonGPU/IR/Types.h" 2 | #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` 3 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 4 | #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton::gpu;
8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonGPU/IR/Types.cpp.inc" 11 | 12 | Type TokenType::parse(AsmParser &parser) { 13 | if (parser.parseLess()) 14 | return Type(); 15 | 16 | int type = 1; 17 | if (parser.parseInteger(type)) 18 | return Type(); 19 | 20 | if (parser.parseGreater()) 21 | return Type(); 22 | 23 | return TokenType::get(parser.getContext(), type); 24 | } 25 | 26 | void TokenType::print(AsmPrinter &printer) const { 27 | printer << "<" << getType() << ">"; 28 | } 29 | 30 | //===----------------------------------------------------------------------===// 31 | // TritonGPU Dialect 32 | //===----------------------------------------------------------------------===// 33 | void ::mlir::triton::gpu::TritonGPUDialect::registerTypes() { 34 | addTypes< 35 | #define GET_TYPEDEF_LIST 36 | #include "triton/Dialect/TritonGPU/IR/Types.cpp.inc" 37 | >(); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUTransforms 2 | AccelerateMatmul.cpp 3 | Coalesce.cpp 4 | F32DotTC.cpp 5 | ReduceDataDuplication.cpp 6 | OptimizeDotOperands.cpp 7 | OptimizeThreadLocality.cpp 8 | Pipeliner/MatmulLoopPipeline.cpp 9 | Pipeliner/OuterLoopPipeline.cpp 10 | Pipeliner/PipelineExpander.cpp 11 | Pipeliner/SoftwarePipeliner.cpp 12 | Pipeliner/PipeliningUtility.cpp 13 | Prefetch.cpp 14 | RemoveLayoutConversions.cpp 15 | ReorderInstructions.cpp 16 | Utility.cpp 17 | 18 | DEPENDS 19 | TritonGPUTransformsIncGen 20 | 21 | LINK_LIBS PUBLIC 22 | MLIRTransforms 23 | MLIRTransformUtils 24 | TritonAnalysis 25 | TritonIR 26 | TritonGPUIR 27 | TritonNvidiaGPUIR 28 | ) 29 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 2 | #define TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 3 | 4 | #include "mlir/Dialect/SCF/IR/SCF.h" 5 | #include <vector> 6 | 7 | namespace mlir { 8 | namespace triton { 9 | 10 | static const char *kNumStagesAttrName = "tt.num_stages"; 11 | 12 | /// Function to mask operations during scheduling. 13 | Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred); 14 | 15 | /// Collect SSA dependencies of `op` in `deps`. If `includeArg` is true, 16 | /// continue looking through loop block arguments. 17 | void addDep(Operation *op, DenseSet<Operation *> &deps, bool includeArg = true, 18 | DenseSet<Operation *> *filter = nullptr); 19 | 20 | /// Add operations from `forOp` into a pipeline schedule with the given 21 | /// `stage` when the filter returns true. Operations are added in the 22 | /// original loop order. 23 | void addOps(scf::ForOp forOp, int stage, 24 | std::vector<std::pair<Operation *, unsigned>> &schedule, 25 | std::function<bool(Operation *)> filter);
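// For instance, a scheduler might place all loads in an early stage and the
// rest of the loop body in a later one (an illustrative sketch, not a call
// site from this file):
//
//   std::vector<std::pair<Operation *, unsigned>> schedule;
//   addOps(forOp, 0, schedule,
//          [](Operation *op) { return isa<triton::LoadOp>(op); });
//   addOps(forOp, 2, schedule,
//          [](Operation *op) { return !isa<triton::LoadOp>(op); });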
26 | } // namespace triton 27 | } // namespace mlir 28 | 29 | #endif // TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 30 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 2 | #define TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 3 | 4 | #include "PipelineExpander.h" 5 | #include "mlir/Dialect/SCF/IR/SCF.h" 6 | #include "mlir/Support/LLVM.h" 7 | #include "llvm/ADT/ArrayRef.h" 8 | #include <vector> 9 | 10 | namespace mlir { 11 | namespace triton { 12 | 13 | /// This fills out the pipelining options, including the schedule and 14 | /// annotations for wait ops. It also does pre-processing by converting some 15 | /// of the loads into async loads so that the IR is ready to be pipelined. 16 | bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, 17 | mlir::triton::PipeliningOption &options); 18 | 19 | /// Fills out pipelining options for an outer loop pipelining case. This 20 | /// schedules async copies to overlap with the epilogue of a loop. 21 | bool getOuterLoopSchedule(scf::ForOp &forOp, int numStages, 22 | mlir::triton::PipeliningOption &options); 23 | 24 | /// This does post-processing on the pipelined loop to try to pipeline wgmma 25 | /// ops. 26 | // TODO: this should be included as part of the pipeline but currently the 27 | // wgmma wait modeling is problematic. 28 | void asyncLaunchDots(scf::ForOp forOp); 29 | 30 | /// Post-process the pipelined loop by updating the wait ops with the right 31 | /// number of groups in flight.
32 | void updateWaits(ModuleOp module); 33 | 34 | } // namespace triton 35 | } // namespace mlir 36 | #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 37 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUIR 2 | Dialect.cpp 3 | Ops.cpp 4 | Types.cpp 5 | 6 | DEPENDS 7 | TritonNvidiaGPUTableGen 8 | TritonNvidiaGPUAttrDefsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | TritonIR 12 | TritonGPUIR 13 | ) 14 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUTransforms 2 | FenceInsertion.cpp 3 | PlanCTA.cpp 4 | 5 | DEPENDS 6 | TritonNvidiaGPUTransformsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | TritonIR 10 | TritonGPUIR 11 | TritonGPUTransforms 12 | TritonNvidiaGPUIR 13 | MLIRTransformUtils 14 | ) 15 | -------------------------------------------------------------------------------- /lib/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonLLVMIR 2 | LLVMDIScope.cpp 3 | LLVMIRBreakPhiStruct.cpp 4 | 5 | DEPENDS 6 | LLVMIRIncGen 7 | 8 | LINK_LIBS 9 | ${CMAKE_DL_LIBS} 10 | PUBLIC 11 | MLIRArithToLLVM 12 | MLIRBuiltinToLLVMIRTranslation 13 | MLIRIndexToLLVM 14 | MLIRIR 15 | MLIRLLVMDialect 16 | MLIRLLVMToLLVMIRTranslation 17 | MLIRNVVMToLLVMIRTranslation 18 | MLIRROCDLToLLVMIRTranslation 19 | MLIRSCFToControlFlow 20 | MLIRSupport 21 | MLIRTargetLLVMIRExport 22 | TritonGPUToLLVM 23 | ) 24 | 25 | set_source_files_properties( 26 | LLVMIRTranslation.cpp 27 | PROPERTIES 28 | COMPILE_FLAGS "-D__BUILD_DIR__=\\\"${CMAKE_BINARY_DIR}\\\"") 29 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/LLVMPasses.h: -------------------------------------------------------------------------------- 1 | #include "llvm/IR/PassManager.h" 2 | #include "llvm/Pass.h" 3 | #include "llvm/Support/CodeGen.h" 4 | 5 | namespace llvm { 6 | 7 | // Pass to pre-process LLVM IR before optimization and break up phi of struct. 8 | // Breaking up those phis into elementary types allows better optimizations 9 | // downstream. 
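// For example, the rewrite turns (a sketch; the checked-in version lives in
// test/LLVMIR/break-phi-struct.ll later in this tree):
//
//   %r = phi {i32, i32} [ %s.2, %true ], [ %s.4, %false ]
//
// into one scalar phi per struct field, fed by extractvalue in the
// predecessors and recombined with insertvalue after the phis.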
10 | struct BreakStructPhiNodesPass : PassInfoMixin<BreakStructPhiNodesPass> { 11 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); 12 | 13 | static StringRef name() { return "BreakStructPhiNodesPass"; } 14 | }; 15 | 16 | } // namespace llvm 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"] 3 | 4 | [tool.yapf] 5 | based_on_style = "pep8" 6 | column_limit = 120 7 | disable_split_list_with_comment = true 8 | each_dict_entry_on_separate_line = false 9 | split_before_named_assigns = false 10 | split_complex_comprehension = true 11 | 12 | [tool.yapfignore] 13 | ignore_patterns = [ 14 | # This exclusion is also specified in .pre-commit-config.yaml. 15 | # - We put it here because if you run yapf directly, we want it to skip the 16 | # file. 17 | # - We also put it in .pre-commit-config because yapf raises an error if 18 | # pre-commit runs it but all of the files it might touch are ignored! 19 | "python/test/unit/language/test_line_info.py" 20 | ] 21 | 22 | [tool.ruff] 23 | line-length = 120 24 | 25 | [tool.ruff.lint] 26 | ignore = ["E501", "E701", "E731", "E741"] 27 | -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/MANIFEST.in -------------------------------------------------------------------------------- /python/examples/copy_strided.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import triton.compiler as tc 4 | 5 | 6 | # triton kernel 7 | @triton.jit 8 | def kernel(X, stride_xm, # 9 | Z, stride_zn, # 10 | BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): 11 | off_m = tl.arange(0, BLOCK_M) 12 | off_n = tl.arange(0, BLOCK_N) 13 | Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * 1 14 | Zs = Z + off_m[:, None] * 1 + off_n[None, :] * stride_zn 15 | tl.store(Zs, tl.load(Xs)) 16 | 17 | 18 | src = tc.ASTSource( 19 | fn=kernel, 20 | constants={"BLOCK_M": 64, "BLOCK_N": 64}, 21 | signature="*fp32,i32,*fp32,i32", 22 | ) 23 | 24 | ret = triton.compile(src) 25 | print(ret.asm["ttgir"]) 26 | 
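# `ret.asm` maps stage names to the IR produced at each compilation stage.
# On an NVIDIA target it typically also carries "ttir", "llir" and "ptx"
# next to "ttgir" (a sketch; the exact key set depends on the active
# backend):
#
#   for name, ir in ret.asm.items():
#       print(name, "->", len(str(ir)), "characters")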
-------------------------------------------------------------------------------- /python/examples/empty.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr): 9 | pass 10 | 11 | 12 | X = torch.randn(1, device="cuda") 13 | pgm = kernel[(1, )](X, 1, 1, BLOCK=1024) 14 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [build-system] 3 | requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"] 4 | 5 | # We're incrementally switching from autopep8 to ruff. 6 | [tool.autopep8] 7 | aggressive = 1 8 | ignore = "E501,E701,E731,W690,W503" 9 | max_line_length = 88 10 | 11 | [tool.ruff] 12 | line-length = 120 13 | 14 | [tool.ruff.lint] 15 | ignore = ["E501", "E701", "E731", "E741"] 16 | -------------------------------------------------------------------------------- /python/src/passes.h: -------------------------------------------------------------------------------- 1 | #define ADD_PASS_WRAPPER_0(name, builder) \ 2 | m.def(name, [](mlir::PassManager &pm) { pm.addPass(builder()); }) 3 | 4 | #define ADD_PASS_WRAPPER_1(name, builder, ty0) \ 5 | m.def(name, \ 6 | [](mlir::PassManager &pm, ty0 val0) { pm.addPass(builder(val0)); }) 7 | 8 | #define ADD_PASS_WRAPPER_2(name, builder, ty0, ty1) \ 9 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1) { \ 10 | pm.addPass(builder(val0, val1)); \ 11 | }) 12 | 13 | #define ADD_PASS_WRAPPER_3(name, builder, ty0, ty1, ty2) \ 14 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) { \ 15 | pm.addPass(builder(val0, val1, val2)); \ 16 | }) 17 | 18 | #define ADD_PASS_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3) \ 19 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ 20 | ty3 val3) { pm.addPass(builder(val0, val1, val2, val3)); }) 21 | -------------------------------------------------------------------------------- /python/test/backend/extension_backend.c: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <stdbool.h> 3 | #include <stdint.h> 4 | 5 | static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { 6 | // create a struct to hold device properties 7 | return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", 1024, 8 | "multiprocessor_count", 16, "sm_clock_rate", 2100, 9 | "mem_clock_rate", 2300, "mem_bus_width", 2400); 10 | } 11 | 12 | static PyObject *loadBinary(PyObject *self, PyObject *args) { 13 | // get allocated registers and spilled registers from the function 14 | int n_regs = 0; 15 | int n_spills = 0; 16 | int mod = 0; 17 | int fun = 0; 18 | return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, 19 | n_spills); 20 | } 21 | 22 | static PyMethodDef ModuleMethods[] = { 23 | {"load_binary", loadBinary, METH_VARARGS, 24 | "Load dummy binary for the extension device"}, 25 | {"get_device_properties", getDeviceProperties, METH_VARARGS, 26 | "Get the properties for the extension device"}, 27 | {NULL, NULL, 0, NULL} // sentinel 28 | }; 29 | 30 | static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "ext_utils", 31 | NULL, // documentation 32 | -1, // size 33 | ModuleMethods}; 34 | 35 | PyMODINIT_FUNC PyInit_ext_utils(void) { 36 | PyObject *m = PyModule_Create(&ModuleDef); 37 | if (m == NULL) { 38 | return NULL; 39 | } 40 | PyModule_AddFunctions(m, ModuleMethods); 41 | return m; 42 | } 43 | -------------------------------------------------------------------------------- /python/test/backend/third_party_backends/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--backend", action="store", default="", help="Codegen backend") 8 | 9 | 10 | @pytest.fixture 11 | def cmdopt(request): 12 | return request.config.getoption("--backend") 13 | -------------------------------------------------------------------------------- /python/test/backend/third_party_backends/test_xpu_backend.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | def test_xpu_backend(cmdopt): 8 | if cmdopt == "xpu": 9 | has_ipex = False 10 | try: 11 | # Import IPEX to provide Intel GPU runtime 12 | import intel_extension_for_pytorch # type: ignore # noqa: F401 13 | has_ipex = True if hasattr(torch, "xpu") else False 14 | except Exception: 15 | has_ipex = False 16 | 17 | @triton.jit() 18 | def kernel(x_ptr, y_ptr, out_ptr): 19 | pid = tl.program_id(axis=0) 20 | x = tl.load(x_ptr + pid) 21 | y = tl.load(y_ptr + pid) 22 | out = x + y 23 | tl.store(out_ptr + pid, out) 24 | 25 | if has_ipex: 26 | for _ in range(1000): 27 | x = torch.randn((65536, ), device="xpu", dtype=torch.float32) 28 | y = torch.randn((65536, ), device="xpu", dtype=torch.float32) 29 | z = torch.zeros((65536, ), device="xpu", dtype=torch.float32) 30 | kernel[(65536, )](x, y, z, num_warps=32) 31 | assert torch.all(x + y == z) 32 | else: 33 | return 34 | -------------------------------------------------------------------------------- /python/test/kernel_comparison/kernels.yml: -------------------------------------------------------------------------------- 1 | name_and_extension: 2 | - name: _kernel_0d1d2d3de4de5de6c7de8de9c10de11c 3 | extension: ptx 4 | - name: _kernel_0d1d2d3de4de5de6de7c8de9c10de11c 5 | extension: ptx 6 | - name: _kernel_0d1d2d345de6c789c1011c 7 | extension: ptx 8 | - name: _kernel_0d1d2d3456c789c1011c 9 | extension: ptx 10 | - name: _kernel_0d1d2d3de4de5de6c7de8c9de10de11c 11 | extension: ptx 12 | - name: _kernel_0d1d2d34567c8c91011c 13 | extension: ptx 14 | - name: _kernel_0d1d2d3456c78c91011c 15 | extension: ptx 16 | - name: _kernel_0d1d2d3de4de5de6de7c8c9de10de11c 17 | extension: ptx 18 | - name: _kernel_0d1d2d34567c89c1011c 19 | extension: ptx 20 | - name: _kernel_0d1d2d345de6de7c89c1011c 21 | extension: ptx 22 | - name: _kernel_0d1d2d345de6de7c8c9de1011c 23 | extension: ptx 24 | - name: kernel_0d1d2de 25 | extension: ptx 26 | - name: _kernel_0d1d2d345de6c78c9de1011c 27 | extension: ptx 28 | - name: _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11de12de13de14de15c16de17de18de19c20de21de22de23c2425de26de 29 | extension: ptx 30 | - name: _fwd_kernel_0d1d2d34d5d6de7de8de9c10de11de12de13c14de15de16de17c18de19de20de21c2223de24de 31 | extension: ptx 32 | - name: _bwd_preprocess_0d1d2d 33 | extension: ptx 34 | -------------------------------------------------------------------------------- /python/test/unit/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--device", action="store", default='cuda') 8 | 9 | 10 | @pytest.fixture 11 | def device(request): 12 | return request.config.getoption("--device") 13 | -------------------------------------------------------------------------------- /python/test/unit/hopper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/test/unit/hopper/__init__.py -------------------------------------------------------------------------------- /python/test/unit/language/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | 4 | def pytest_configure(config): 5 | config.addinivalue_line("markers", "interpreter: indicate 
whether interpreter supports the test") 6 | -------------------------------------------------------------------------------- /python/test/unit/language/test_annotations.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | import pytest 6 | 7 | 8 | def annotated_function(return_type=None, **arg_types): 9 | """A decorator to add annotations to a function.""" 10 | 11 | def decorator(func): 12 | func.__annotations__ = {**arg_types, 'return': return_type} 13 | return func 14 | 15 | return decorator 16 | 17 | 18 | # Test integer annotations 19 | @pytest.mark.parametrize(("signed", "width"), [ 20 | (signed, width) for signed in [False, True]\ 21 | for width in [8, 16, 32, 64] 22 | ] + [(False, 1)] 23 | ) 24 | def test_int_annotation(signed, width, device): 25 | 26 | @triton.jit 27 | @annotated_function(X=torch.tensor, v=f"tl.{'' if signed else 'u'}int{width}") 28 | def _kernel(X, v): 29 | tl.store(X, v) 30 | 31 | h = _kernel[(1, )](torch.empty(1, device=device), 3) 32 | pfx = 'si' if signed else 'ui' 33 | assert f'%arg1: i{width}' in h.asm["ttir"] 34 | assert f'arith.{pfx}tofp' in h.asm["ttir"] 35 | 36 | 37 | # Test that unknown annotations do not emit an error 38 | def test_unknown_annotation(device): 39 | 40 | @triton.jit 41 | def _kernel(X: torch.Tensor, N: int, BLOCK_SIZE: tl.constexpr): 42 | pass 43 | 44 | x = torch.empty(1, device=device) 45 | _kernel[(1, )](x, x.shape[0], 32) 46 | try: 47 | _kernel[(1, )](x.shape[0], x.shape[0], 32) 48 | except AttributeError: 49 | pass 50 | -------------------------------------------------------------------------------- /python/test/unit/language/test_decorator.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import pytest 3 | 4 | 5 | def test_decorator_with_def(device): 6 | 7 | def triton_heuristics_pointwise(**kwargs): 8 | 9 | def decorator(func): 10 | return func 11 | 12 | return decorator 13 | 14 | # "def" might appear in a decorator call, e.g. a hash string argument. 15 | # This test makes sure the compiler can find the right position of function 16 | # definition. 
17 | @triton_heuristics_pointwise(inductor_meta={'backend_hash': 'def0aeffabe53b3f8'}, ) 18 | @triton.jit 19 | def kernel(): 20 | pass 21 | 22 | try: 23 | triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={})) 24 | except Exception as e: 25 | pytest.fail(f"triton compile failed with error: {e}") 26 | -------------------------------------------------------------------------------- /python/test/unit/language/test_reproducer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | import torch 7 | import triton 8 | import re 9 | 10 | 11 | @triton.jit 12 | def triton_(): 13 | return 14 | 15 | 16 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") 17 | def test_reproducer(): 18 | tmpdir = ".tmp" 19 | reproducer = 'triton-reproducer.mlir' 20 | if os.path.exists(tmpdir): 21 | shutil.rmtree(tmpdir, ignore_errors=True) 22 | if os.path.exists(reproducer): 23 | os.remove(reproducer) 24 | os.environ["TRITON_CACHE_DIR"] = tmpdir 25 | os.environ["TRITON_REPRODUCER_PATH"] = reproducer 26 | triton_[(1, )]() 27 | foundPipeline = "" 28 | with open(reproducer, 'r') as f: 29 | line = f.read() 30 | if 'pipeline:' in line: 31 | foundPipeline = line 32 | if 0 == len(foundPipeline): 33 | raise Exception("Failed to find pipeline info in reproducer file.") 34 | 35 | ttgir_to_llvm_pass = re.compile("convert-triton-.*gpu-to-llvm") 36 | if not ttgir_to_llvm_pass.search(foundPipeline): 37 | raise Exception("Failed to find triton passes in pipeline") 38 | # cleanup 39 | if os.path.exists(tmpdir): 40 | shutil.rmtree(tmpdir, ignore_errors=True) 41 | if os.path.exists(reproducer): 42 | os.remove(reproducer) 43 | -------------------------------------------------------------------------------- /python/test/unit/operators/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | 4 | def pytest_configure(config): 5 | config.addinivalue_line("markers", "interpreter: indicate whether interpreter supports the test") 6 | -------------------------------------------------------------------------------- /python/test/unit/operators/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | import triton 5 | import triton.ops 6 | 7 | 8 | @pytest.mark.parametrize("M, N, dtype, mode", [ # 9 | (M, N, dtype, mode) 10 | for M in [1024, 821] 11 | for N in [512, 857, 1871, 2089, 8573, 31000] 12 | for dtype in ['float16', 'float32'] 13 | for mode in ['forward', 'backward'] 14 | ]) 15 | def test_op(M, N, dtype, mode): 16 | capability = torch.cuda.get_device_capability() 17 | if capability[0] < 8 and dtype == "bfloat16": 18 | pytest.skip("Only test bfloat16 on devices with sm >= 80") 19 | dtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[dtype] 20 | # create inputs 21 | x = torch.randn(M, N, dtype=dtype, device='cuda', requires_grad=True) 22 | idx = 4 + torch.ones(M, dtype=torch.int64, device='cuda') 23 | # forward pass 24 | tt_y = triton.ops.cross_entropy(x, idx) 25 | th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx) 26 | if mode == 'forward': 27 | torch.testing.assert_close(th_y, tt_y) 28 | # backward pass 29 | elif mode == 'backward': 30 | dy = torch.randn_like(tt_y) 31 | # triton backward 32 | tt_y.backward(dy) 33 | tt_dx = x.grad.clone() 34 | # torch backward 35 | x.grad = None 36 | 
th_y.backward(dy) 37 | th_dx = x.grad.clone() 38 | if dtype == torch.float16: 39 | torch.testing.assert_close(th_dx, tt_dx, rtol=0.001, atol=0.001) 40 | else: 41 | torch.testing.assert_close(th_dx, tt_dx) 42 | -------------------------------------------------------------------------------- /python/test/unit/runtime/test_driver.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import triton 4 | 5 | 6 | def test_is_lazy(): 7 | from importlib import reload 8 | reload(sys.modules["triton.runtime.driver"]) 9 | reload(sys.modules["triton.runtime"]) 10 | mod = sys.modules[triton.runtime.driver.__module__] 11 | assert isinstance(triton.runtime.driver.active, getattr(mod, "LazyProxy")) 12 | assert triton.runtime.driver.active._obj is None 13 | utils = triton.runtime.driver.active.utils # noqa: F841 14 | assert issubclass(triton.runtime.driver.active._obj.__class__, getattr(triton.backends.driver, "DriverBase")) 15 | -------------------------------------------------------------------------------- /python/test/unit/runtime/test_jit.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pytest 3 | import torch 4 | 5 | import triton 6 | import triton.language as tl 7 | 8 | 9 | def test_pre_call_hooks(): 10 | 11 | @triton.jit 12 | def add_kernel( 13 | in_ptr0, 14 | in_ptr1, 15 | out_ptr, 16 | n_elements, 17 | BLOCK_SIZE: "tl.constexpr", 18 | ): 19 | pid = tl.program_id(axis=0) 20 | block_start = pid * BLOCK_SIZE 21 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 22 | mask = offsets < n_elements 23 | x = tl.load(in_ptr0 + offsets, mask=mask) 24 | y = tl.load(in_ptr1 + offsets, mask=mask) 25 | output = x + y 26 | tl.store(out_ptr + offsets, output, mask=mask) 27 | 28 | class MyTensor(torch.Tensor): 29 | pass 30 | 31 | def my_hook(*args, **kwargs): 32 | for arg in itertools.chain(args, kwargs.values()): 33 | if isinstance(arg, MyTensor): 34 | raise Exception("MyTensor is not allowed") 35 | 36 | add_kernel.add_pre_run_hook(my_hook) 37 | 38 | x = torch.randn(4).cuda() 39 | y = MyTensor(x) 40 | out = torch.zeros_like(x) 41 | with pytest.raises(Exception): 42 | add_kernel[(4, )](x, y, out, 4, 4) 43 | -------------------------------------------------------------------------------- /python/triton/_C/include: -------------------------------------------------------------------------------- 1 | ../../../include/ -------------------------------------------------------------------------------- /python/triton/__init__.py: -------------------------------------------------------------------------------- 1 | """isort:skip_file""" 2 | __version__ = '3.0.0' 3 | 4 | # --------------------------------------- 5 | # Note: import order is significant here. 6 | 7 | # submodules 8 | from .runtime import ( 9 | autotune, 10 | Config, 11 | heuristics, 12 | JITFunction, 13 | KernelInterface, 14 | reinterpret, 15 | TensorWrapper, 16 | OutOfResources, 17 | InterpreterError, 18 | MockTensor, 19 | ) 20 | from .runtime.jit import jit 21 | from .compiler import compile, CompilationError 22 | from .errors import TritonError 23 | 24 | from . import language 25 | from . import testing 26 | from . 
import tools 27 | 28 | __all__ = [ 29 | "autotune", 30 | "cdiv", 31 | "CompilationError", 32 | "compile", 33 | "Config", 34 | "heuristics", 35 | "impl", 36 | "InterpreterError", 37 | "jit", 38 | "JITFunction", 39 | "KernelInterface", 40 | "language", 41 | "MockTensor", 42 | "next_power_of_2", 43 | "ops", 44 | "OutOfResources", 45 | "reinterpret", 46 | "runtime", 47 | "TensorWrapper", 48 | "TritonError", 49 | "testing", 50 | "tools", 51 | ] 52 | 53 | # ------------------------------------- 54 | # misc. utilities that don't fit well 55 | # into any specific module 56 | # ------------------------------------- 57 | 58 | 59 | def cdiv(x: int, y: int): 60 | return (x + y - 1) // y 61 | 62 | 63 | def next_power_of_2(n: int): 64 | """Return the smallest power of 2 greater than or equal to n""" 65 | n -= 1 66 | n |= n >> 1 67 | n |= n >> 2 68 | n |= n >> 4 69 | n |= n >> 8 70 | n |= n >> 16 71 | n |= n >> 32 72 | n += 1 73 | return n 74 | -------------------------------------------------------------------------------- /python/triton/backends/driver.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractclassmethod 2 | 3 | 4 | class DriverBase(metaclass=ABCMeta): 5 | 6 | @abstractclassmethod 7 | def is_active(self): 8 | pass 9 | 10 | @abstractmethod 11 | def get_current_target(self): 12 | pass 13 | 14 | def __init__(self) -> None: 15 | pass 16 | 17 | 18 | class GPUDriver(DriverBase): 19 | 20 | def __init__(self): 21 | # TODO: support other frameworks than torch 22 | import torch 23 | self.get_device_capability = torch.cuda.get_device_capability 24 | try: 25 | from torch._C import _cuda_getCurrentRawStream 26 | self.get_current_stream = _cuda_getCurrentRawStream 27 | except ImportError: 28 | self.get_current_stream = lambda idx: torch.cuda.current_stream(idx).cuda_stream 29 | self.get_current_device = torch.cuda.current_device 30 | self.set_current_device = torch.cuda.set_device 31 | 32 | # TODO: remove once TMA is cleaned up 33 | def assemble_tensormap_to_arg(self, tensormaps_info, args): 34 | return args 35 | 36 | 37 | class CPUDriverBase(DriverBase): 38 | 39 | def __init__(self): 40 | # Right now, we just provide dummy functions. 41 | # TODO: Consider better engineering the code only intended for GPU in jit.py. 
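        # With these stubs, any concrete subclass (one that also implements
        # the abstract is_active/get_current_target methods) makes the
        # GPU-oriented launch path degenerate to harmless defaults:
        # capability (0, 0), stream 0, device 0, and a no-op
        # set_current_device.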
42 | self.get_device_capability = lambda idx: (0, 0) 43 | self.get_current_stream = lambda idx: 0 44 | self.get_current_device = lambda: 0 45 | self.set_current_device = lambda idx: None 46 | -------------------------------------------------------------------------------- /python/triton/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .compiler import CompiledKernel, ASTSource, compile, AttrsDescriptor, make_backend, LazyDict 2 | from .errors import CompilationError 3 | 4 | __all__ = ["compile", "make_backend", "ASTSource", "AttrsDescriptor", "CompiledKernel", "CompilationError", "LazyDict"] 5 | -------------------------------------------------------------------------------- /python/triton/compiler/make_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/triton/compiler/make_launcher.py -------------------------------------------------------------------------------- /python/triton/errors.py: -------------------------------------------------------------------------------- 1 | """Base class for all errors raised by Triton""" 2 | 3 | 4 | class TritonError(Exception): 5 | ... 6 | -------------------------------------------------------------------------------- /python/triton/language/extra/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cuda 2 | 3 | __all__ = ['cuda'] 4 | -------------------------------------------------------------------------------- /python/triton/language/extra/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | from . import libdevice 2 | 3 | from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80) 4 | 5 | __all__ = [ 6 | "libdevice", "globaltimer", "num_threads", "num_warps", "smid", "convert_custom_float8_sm70", 7 | "convert_custom_float8_sm80" 8 | ] 9 | -------------------------------------------------------------------------------- /python/triton/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # from .conv import _conv, conv 2 | from . 
import blocksparse 3 | from .cross_entropy import _cross_entropy, cross_entropy 4 | from .flash_attention import attention 5 | from .matmul import _matmul, get_higher_dtype, matmul 6 | 7 | __all__ = ["blocksparse", "_cross_entropy", "cross_entropy", "_matmul", "matmul", "attention", "get_higher_dtype"] 8 | -------------------------------------------------------------------------------- /python/triton/ops/blocksparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .matmul import matmul 2 | from .softmax import softmax 3 | 4 | __all__ = [ 5 | "matmul", 6 | "softmax", 7 | ] 8 | -------------------------------------------------------------------------------- /python/triton/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics) 2 | from .cache import RedisRemoteCacheBackend, RemoteCacheBackend 3 | from .driver import driver 4 | from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret 5 | from .errors import OutOfResources, InterpreterError 6 | 7 | __all__ = [ 8 | "autotune", 9 | "Autotuner", 10 | "Config", 11 | "driver", 12 | "Heuristics", 13 | "heuristics", 14 | "InterpreterError", 15 | "JITFunction", 16 | "KernelInterface", 17 | "MockTensor", 18 | "OutOfResources", 19 | "RedisRemoteCacheBackend", 20 | "reinterpret", 21 | "RemoteCacheBackend", 22 | "TensorWrapper", 23 | ] 24 | -------------------------------------------------------------------------------- /python/triton/runtime/errors.py: -------------------------------------------------------------------------------- 1 | from ..errors import TritonError 2 | from typing import Optional 3 | 4 | 5 | class InterpreterError(TritonError): 6 | 7 | def __init__(self, error_message: Optional[str] = None): 8 | self.error_message = error_message 9 | 10 | def __str__(self) -> str: 11 | return self.error_message or "" 12 | 13 | 14 | class OutOfResources(TritonError): 15 | 16 | def __init__(self, required, limit, name): 17 | self.required = required 18 | self.limit = limit 19 | self.name = name 20 | 21 | def __str__(self) -> str: 22 | return f"out of resource: {self.name}, Required: {self.required}, Hardware limit: {self.limit}. Reducing block sizes or `num_stages` may help." 
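    # A typical consumer catches this around a kernel launch and retries with
    # a smaller configuration, e.g. (a sketch; `kernel`, `grid`, `args` and
    # `retry_with_smaller_config` are placeholders, not real API):
    #
    #   try:
    #       kernel[grid](*args, num_warps=8)
    #   except OutOfResources as e:
    #       retry_with_smaller_config(e.required, e.limit, e.name)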
23 | 24 | def __reduce__(self): 25 | # this is necessary to make OutOfResources picklable 26 | return (type(self), (self.required, self.limit, self.name)) 27 | -------------------------------------------------------------------------------- /python/triton/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/triton/tools/__init__.py -------------------------------------------------------------------------------- /python/triton/tools/compile.h: -------------------------------------------------------------------------------- 1 | #ifndef TT_KERNEL_INCLUDES 2 | #define TT_KERNEL_INCLUDES 3 | 4 | #include <cuda.h> 5 | #include <inttypes.h> 6 | #include <stdint.h> 7 | #include <stdio.h> 8 | 9 | #endif 10 | 11 | void unload_{kernel_name}(void); 12 | void load_{kernel_name}(void); 13 | // tt-linker: {kernel_name}:{full_signature}:{algo_info} 14 | CUresult{_placeholder} {kernel_name}(CUstream stream, {signature}); 15 | -------------------------------------------------------------------------------- /python/tutorials/README.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. 5 | 6 | To install the dependencies for the tutorials: 7 | 8 | .. code-block:: bash 9 | 10 | cd triton 11 | pip install -e './python[tutorials]' 12 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(lib) 2 | 3 | llvm_canonicalize_cmake_booleans( 4 | MLIR_ENABLE_BINDINGS_PYTHON 5 | ) 6 | 7 | configure_lit_site_cfg( 8 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in 9 | ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py 10 | MAIN_CONFIG 11 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py 12 | ) 13 | 14 | set(TRITON_TEST_DEPENDS 15 | triton-opt 16 | ) 17 | 18 | set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck") 19 | set(LIT_ARGS "-Dfilecheck=${FILECHECK_PATH}") 20 | add_lit_testsuite(check-triton-lit-tests "Running the triton regression tests" 21 | ${CMAKE_CURRENT_BINARY_DIR} 22 | ARGS ${LIT_ARGS} 23 | DEPENDS ${TRITON_TEST_DEPENDS} 24 | ) 25 | 26 | set_target_properties(check-triton-lit-tests PROPERTIES FOLDER "Tests") 27 | 28 | add_lit_testsuites(TRITON-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS}) 29 | -------------------------------------------------------------------------------- /test/Conversion/amd/decompose-unsupported-conversions.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --split-input-file --decompose-unsupported-amd-conversions | FileCheck %s 2 | 3 | // CHECK: #[[BLOCKED:.+]] = #triton_gpu.blocked<{{.*}}> 4 | // CHECK: #[[WMMA:.+]] = #triton_gpu.amd_wmma<{{.*}}> 5 | // CHECK: #[[SHARED:.+]] = #triton_gpu.shared<{{.*}}> 6 | #mma = #triton_gpu.amd_wmma<{warpsPerCTA = [2, 2]}> 7 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 8 | tt.func @wmma_to_wmma_dot_op(%arg0: tensor<16x16xf16, #mma>) { 9 | // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<16x16xf16, #[[WMMA]]> -> 
tensor<16x16xf16, #[[BLOCKED]]> 10 | // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<16x16xf16, #[[SHARED]]> 11 | // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[WMMA]]}>> 12 | %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> 13 | tt.return 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /test/Conversion/amd/fp_to_fp.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm | FileCheck %s 2 | 3 | // CHECK-LABEL: f16_to_f32 4 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> 5 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | tt.func @f16_to_f32(%arg0: tensor<8x8xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>) { 7 | // CHECK-COUNT-8: llvm.inline_asm asm_dialect {{.*}}v_cvt_f32_f16 {{.*}}: (f16) -> f32 8 | %0 = tt.fp_to_fp %arg0 : tensor<8x8xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> -> tensor<8x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> 9 | tt.return 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /test/Conversion/divide-by-0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm --cse | FileCheck %s 2 | 3 | // CHECK-LABEL: dont_divide_0 4 | // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32 5 | // CHECK-NOT: llvm.urem %{{.*}}, %[[C0]] 6 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> 7 | #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 8]}> 8 | module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 9 | tt.func public @dont_divide_0() attributes {noinline = false} { 10 | %zero = arith.constant dense<0.000000e+00> : tensor<16x1xf32, #mma> 11 | %cvt = triton_gpu.convert_layout %zero : tensor<16x1xf32, #mma> -> tensor<16x1xf32, #blocked> 12 | tt.return 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/Conversion/tritongpu_to_llvm_volta.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=compute-capability=70 2>&1 | FileCheck %s 2 | 3 | #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> 4 | // CHECK-LABEL: clamp 5 | module attributes {"triton_gpu.compute-capability" = 70 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | tt.func public @clamp(%x : tensor<1024xf32, #blocked>, %limit 
: tensor<1024xf32, #blocked>) attributes {noinline = false} { 7 | %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> 8 | %neg_limit = arith.subf %cst, %limit : tensor<1024xf32, #blocked> 9 | 10 | // CHECK: llvm.fcmp "une" %[[REG:[a-zA-Z0-9]+]], %[[REG]] 11 | // CHECK-NEXT: llvm.intr.maxnum 12 | // CHECK-NEXT: llvm.intr.minnum 13 | // CHECK-NEXT: llvm.mlir.constant 14 | // CHECK-NEXT: llvm.select 15 | %12 = tt.clampf %x, %neg_limit, %limit, propagateNan = all : tensor<1024xf32, #blocked> 16 | tt.return 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /test/LLVMIR/break-phi-struct.ll: -------------------------------------------------------------------------------- 1 | ; RUN: triton-llvm-opt -break-struct-phi-nodes %s | FileCheck %s 2 | 3 | ; CHECK-LABEL: struct 4 | define {i32, i32} @struct(i1 %c) { 5 | ; CHECK: br i1 %{{.*}}, label [[TRUE:%.*]], label [[FALSE:%.*]] 6 | br i1 %c, label %true, label %false 7 | 8 | true: 9 | %s.1 = insertvalue {i32, i32} undef, i32 20, 0 10 | %s.2 = insertvalue {i32, i32} %s.1, i32 200, 1 11 | 12 | ; CHECK-DAG: [[E0:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 13 | ; CHECK-DAG: [[E1:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 14 | ; CHECK: br 15 | br label %exit 16 | 17 | false: 18 | %s.3 = insertvalue {i32, i32} undef, i32 30, 0 19 | %s.4 = insertvalue {i32, i32} %s.3, i32 300, 1 20 | ; CHECK-DAG: [[E2:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 21 | ; CHECK-DAG: [[E3:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 22 | ; CHECK: br 23 | br label %exit 24 | 25 | exit: 26 | ; CHECK-DAG: [[PHI0:%.*]] = phi i32 [ [[E0]], [[TRUE]] ], [ [[E2]], [[FALSE]] ] 27 | ; CHECK-DAG: [[PHI1:%.*]] = phi i32 [ [[E1]], [[TRUE]] ], [ [[E3]], [[FALSE]] ] 28 | ; CHECK: [[S0:%.*]] = insertvalue { i32, i32 } undef, i32 [[PHI0]], 0 29 | ; CHECK: [[S1:%.*]] = insertvalue { i32, i32 } [[S0]], i32 [[PHI1]], 1 30 | ; CHECK: ret { i32, i32 } [[S1]] 31 | %r = phi {i32, i32} [ %s.2, %true], [ %s.4, %false ] 32 | ret {i32, i32} %r 33 | } 34 | -------------------------------------------------------------------------------- /test/NVGPU/test_cga.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --convert-nv-gpu-to-llvm | FileCheck %s 2 | #SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> 3 | module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 2 : i32} { 4 | tt.func @test_mbarrier() { 5 | %ptr = llvm.mlir.zero : !llvm.ptr<3> 6 | 7 | // CHECK: llvm.inline_asm 8 | %v = nvgpu.cluster_id 9 | llvm.store %v, %ptr : i32, !llvm.ptr<3> 10 | 11 | tt.return 12 | } 13 | } // end module 14 | -------------------------------------------------------------------------------- /test/Triton/reproducer.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --verify-diagnostics --dump-pass-pipeline --run-reproducer %s 2>&1 | FileCheck %s 2 | 3 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 4 | tt.func public @triton__() attributes {noinline = false} { 5 | tt.return 6 | } 7 | } 8 | 9 | {-# 10 | external_resources: { 11 | mlir_reproducer: { 12 | pipeline: 
"builtin.module(any(convert-scf-to-cf,convert-index-to-llvm{index-bitwidth=0},convert-triton-gpu-to-llvm{compute-capability=90},convert-nv-gpu-to-llvm,convert-arith-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=true test-convergence=false top-down=true},cse,symbol-dce,enable-line-info))", 13 | disable_threading: false, 14 | verify_each: false 15 | } 16 | } 17 | #-} 18 | 19 | // CHECK: Pass Manager with 20 | // CHECK-NEXT: convert-triton-gpu-to-llvm 21 | -------------------------------------------------------------------------------- /test/Triton/verify-make-range.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s --verify-diagnostics 2 | 3 | tt.func public @i64_tensor() { 4 | // expected-error @+1 {{i32 elements}} 5 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16xi64> 6 | tt.return 7 | } 8 | 9 | // ----- 10 | tt.func public @i32_scalar() { 11 | // expected-error @+1 {{invalid kind of type}} 12 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : i32 13 | tt.return 14 | } 15 | 16 | // ----- 17 | tt.func public @_2d_tensor() { 18 | // expected-error @+1 {{must be a 1D tensor}} 19 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16x1xi32> 20 | tt.return 21 | } 22 | 23 | // ----- 24 | tt.func public @bad_start_end() { 25 | // expected-error @+1 {{start must be less than or equal to end}} 26 | %a = tt.make_range { start = 0 : i32, end = -16 : i32 } : tensor<16xi32> 27 | tt.return 28 | } 29 | 30 | // ----- 31 | tt.func public @bad_num_elems() { 32 | // expected-error @+1 {{number of elements}} 33 | %a = tt.make_range { start = 0 : i32, end = 32 : i32 } : tensor<16xi32> 34 | tt.return 35 | } 36 | -------------------------------------------------------------------------------- /test/TritonGPU/ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s | FileCheck %s 2 | 3 | // CHECK: #[[$WMMA:.*]] = #triton_gpu.amd_wmma 4 | #blocked = #triton_gpu.blocked<{sizePerThread = [2, 2], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> 5 | 6 | module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 7 | // CHECK-LABEL: wmma_layout 8 | tt.func @wmma_layout(%0: tensor<16x16xf16, #blocked>) { 9 | %1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>> 10 | // CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #{{.+}}> -> tensor<16x16xf16, #[[$WMMA]]> 11 | tt.return 12 | } 13 | 14 | // CHECK-LABEL: wmma_dot_op_layout 15 | tt.func @wmma_dot_op_layout(%0: tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>>) { 16 | %1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>}>> 17 | // CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #{{.+}}}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[$WMMA]]}>> 18 | tt.return 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
/test/TritonGPU/reduce-data-duplication.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file -tritongpu-reduce-data-duplication | FileCheck %s 2 | 3 | // CHECK: #[[SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false} 4 | // CHECK: apply_swizzle 5 | // CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[SHARED]]> 6 | 7 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> 8 | #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}> 9 | module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 10 | tt.func @apply_swizzle(%arg0: tensor<16x256xf16, #blocked>) { 11 | %0 = triton_gpu.convert_layout %arg0 : tensor<16x256xf16, #blocked> -> tensor<16x256xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> 12 | tt.return 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/TritonGPU/tritongpu_ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s | triton-opt | FileCheck %s 2 | 3 | #shared0 = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], hasLeadingOffset=true}> 4 | 5 | module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | // CHECK-LABEL: memdesc 7 | // CHECK-SAME: !tt.memdesc<1x64x16xf16, #{{.+}}> 8 | tt.func @memdesc(%d : !tt.memdesc<1x64x16xf16, #shared0>) { 9 | tt.return 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TritonTestAnalysis 2 | TestAlias.cpp 3 | TestAxisInfo.cpp 4 | TestAllocation.cpp 5 | TestMembar.cpp 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRPass 9 | ${triton_libs} 10 | ) 11 | -------------------------------------------------------------------------------- /test/lib/Analysis/TestAxisInfo.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/Pass/Pass.h" 2 | #include "triton/Analysis/AxisInfo.h" 3 | #include "triton/Analysis/Utility.h" 4 | 5 | using namespace mlir; 6 | using namespace mlir::triton; 7 | 8 | namespace { 9 | 10 | struct TestAxisInfoPass 11 | : public PassWrapper<TestAxisInfoPass, OperationPass<ModuleOp>> { 12 | 13 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAxisInfoPass); 14 | 15 | StringRef getArgument() const final { return "test-print-alignment"; } 16 | StringRef getDescription() const final { 17 | return "print the result of the alignment analysis pass"; 18 | } 19 | 20 | void runOnOperation() override { 21 | Operation *operation = getOperation(); 22 | ModuleOp moduleOp = cast<ModuleOp>(operation); 23 | ModuleAxisInfoAnalysis moduleAxisInfoAnalysis(moduleOp); 24 | moduleOp.walk([&](FuncOp funcOp) { 25 | auto &os = llvm::errs(); 26 | auto opName = SymbolTable::getSymbolName(funcOp).getValue().str(); 27 | os << "@" << opName << "\n"; 28 | funcOp.walk([&](Operation *op) { 29 | if (op->getNumResults() < 1) 30 | return; 31 |
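// The loop below prints each SSA result followed by its axis-info lattice
// (contiguity, divisibility, and constancy per dimension); this is the
// output the `test-print-alignment` FileCheck tests match against.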
for (Value result : op->getResults()) { 32 | result.print(os); 33 | os << " => "; 34 | auto *axisInfo = moduleAxisInfoAnalysis.getAxisInfo(result); 35 | if (axisInfo) 36 | axisInfo->print(os); 37 | os << "\n"; 38 | } 39 | }); 40 | }); 41 | } 42 | }; 43 | 44 | } // namespace 45 | 46 | namespace mlir { 47 | namespace test { 48 | void registerTestAlignmentPass() { PassRegistration<TestAxisInfoPass>(); } 49 | } // namespace test 50 | } // namespace mlir 51 | -------------------------------------------------------------------------------- /test/lib/Analysis/TestMembar.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" 2 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" 3 | #include "mlir/IR/Dialect.h" 4 | #include "mlir/Pass/Pass.h" 5 | #include "mlir/Transforms/DialectConversion.h" 6 | #include "triton/Analysis/Allocation.h" 7 | #include "triton/Analysis/Membar.h" 8 | 9 | using namespace mlir; 10 | 11 | namespace { 12 | 13 | struct TestMembarPass 14 | : public PassWrapper<TestMembarPass, OperationPass<ModuleOp>> { 15 | 16 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass); 17 | 18 | StringRef getArgument() const final { return "test-print-membar"; } 19 | StringRef getDescription() const final { 20 | return "print the result of the membar analysis pass"; 21 | } 22 | 23 | void runOnOperation() override { 24 | Operation *operation = getOperation(); 25 | ModuleOp moduleOp = cast<ModuleOp>(operation); 26 | // Print all ops after membar pass 27 | ModuleAllocation allocation(moduleOp); 28 | ModuleMembarAnalysis membarPass(&allocation); 29 | membarPass.run(); 30 | } 31 | }; 32 | 33 | } // namespace 34 | 35 | namespace mlir { 36 | namespace test { 37 | void registerTestMembarPass() { PassRegistration<TestMembarPass>(); } 38 | } // namespace test 39 | } // namespace mlir 40 | -------------------------------------------------------------------------------- /test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | -------------------------------------------------------------------------------- /test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | @LIT_SITE_CFG_IN_HEADER@ 2 | 3 | import sys 4 | 5 | config.triton_obj_root = "@TRITON_BINARY_DIR@" 6 | config.llvm_src_root = "@LLVM_SOURCE_DIR@" 7 | config.llvm_obj_root = "@LLVM_BINARY_DIR@" 8 | config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" 9 | config.llvm_lib_dir = "@LLVM_LIBS_DIR@" 10 | config.llvm_shlib_dir = "@SHLIBDIR@" 11 | config.llvm_shlib_ext = "@SHLIBEXT@" 12 | config.llvm_exe_ext = "@EXEEXT@" 13 | config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" 14 | config.mlir_binary_dir = "@MLIR_BINARY_DIR@" 15 | config.python_executable = "@Python3_EXECUTABLE@" 16 | config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ 17 | 18 | 19 | import lit.llvm 20 | lit.llvm.initialize(lit_config, config) 21 | 22 | # Let the main config do the real work 23 | lit_config.load_config(config, "@TRITON_SOURCE_DIR@/test/lit.cfg.py") 24 | -------------------------------------------------------------------------------- /third_party/amd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonAMD ${CMAKE_CURRENT_SOURCE_DIR}/python/triton_amd.cc
LINK_LIBS TritonAMDGPUToLLVM TritonAMDGPUTransforms) 7 | endif() 8 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/amd_hip_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 8 | of the Software, and to permit persons to whom the Software is furnished to do 9 | so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 24 | #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 25 | 26 | #if defined(__clang__) && defined(__HIP__) 27 | #define __HIP_CLANG_ONLY__ 1 28 | #else 29 | #define __HIP_CLANG_ONLY__ 0 30 | #endif 31 | 32 | #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 33 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/concepts.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | namespace hip_impl // Documentation only. 26 | { 27 | #define requires(...) 
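// Note: this header is documentation only. `requires(...)` expands to
// nothing, so concept-style annotations in the HIP headers compile away on
// toolchains without C++20 concepts.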
28 | 29 | #define FunctionalProcedure typename 30 | } // namespace hip_impl 31 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/grid_launch.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "grid_launch.h" 4 | #include "hc.hpp" 5 | 6 | class grid_launch_parm_cxx : public grid_launch_parm 7 | { 8 | public: 9 | grid_launch_parm_cxx() = default; 10 | 11 | // customized serialization: don't need av and cf in kernel 12 | __attribute__((annotate("serialize"))) 13 | void __cxxamp_serialize(Kalmar::Serialize& s) const { 14 | s.Append(sizeof(int), &grid_dim.x); 15 | s.Append(sizeof(int), &grid_dim.y); 16 | s.Append(sizeof(int), &grid_dim.z); 17 | s.Append(sizeof(int), &group_dim.x); 18 | s.Append(sizeof(int), &group_dim.y); 19 | s.Append(sizeof(int), &group_dim.z); 20 | } 21 | 22 | __attribute__((annotate("user_deserialize"))) 23 | grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z, 24 | int group_dim_x, int group_dim_y, int group_dim_z) { 25 | grid_dim.x = grid_dim_x; 26 | grid_dim.y = grid_dim_y; 27 | grid_dim.z = grid_dim_z; 28 | group_dim.x = group_dim_x; 29 | group_dim.y = group_dim_y; 30 | group_dim.z = group_dim_z; 31 | } 32 | }; 33 | 34 | 35 | extern inline void grid_launch_init(grid_launch_parm *lp) { 36 | lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1; 37 | 38 | lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1; 39 | 40 | lp->dynamic_group_mem_bytes = 0; 41 | 42 | lp->barrier_bit = barrier_bit_queue_default; 43 | lp->launch_fence = -1; 44 | 45 | // TODO - set to NULL? 46 | static hc::accelerator_view av = hc::accelerator().get_default_view(); 47 | lp->av = &av; 48 | lp->cf = NULL; 49 | } 50 | 51 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/grid_launch_GGL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | #pragma once 23 | 24 | #if GENERIC_GRID_LAUNCH == 1 25 | #include "macro_based_grid_launch.hpp" 26 | #endif // GENERIC_GRID_LAUNCH -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_bf16.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_HIP_BF16_H 24 | #define HIP_INCLUDE_HIP_HIP_BF16_H 25 | 26 | #include 27 | 28 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 29 | #include 30 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 31 | #include 32 | #else 33 | #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); 34 | #endif 35 | 36 | #endif // HIP_INCLUDE_HIP_HIP_BF16_H 37 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_fp16.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_HIP_FP16_H 24 | #define HIP_INCLUDE_HIP_HIP_FP16_H 25 | 26 | #include 27 | 28 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 29 | #include 30 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 31 | #include "cuda_fp16.h" 32 | #else 33 | #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); 34 | #endif 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_gl_interop.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | #ifndef HIP_GL_INTEROP_H 23 | #define HIP_GL_INTEROP_H 24 | 25 | #include 26 | 27 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 28 | #include "hip/amd_detail/amd_hip_gl_interop.h" 29 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 30 | #include "hip/nvidia_detail/nvidia_hip_gl_interop.h" 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_hcc.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_HCC_H 21 | #define HIP_INCLUDE_HIP_HIP_HCC_H 22 | #warning "hip/hip_hcc.h is deprecated, please use hip/hip_ext.h" 23 | #include "hip/hip_ext.h" 24 | #endif // #ifdef HIP_INCLUDE_HIP_HIP_HCC_H 25 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_profile.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H 21 | #define HIP_INCLUDE_HIP_HIP_PROFILE_H 22 | 23 | #define HIP_SCOPED_MARKER(markerName, group) 24 | #define HIP_BEGIN_MARKER(markerName, group) 25 | #define HIP_END_MARKER() 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_texture_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | 24 | #ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 25 | #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 26 | 27 | #include 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_version.h: -------------------------------------------------------------------------------- 1 | // Auto-generated by cmake 2 | 3 | #ifndef HIP_VERSION_H 4 | #define HIP_VERSION_H 5 | 6 | #define HIP_VERSION_MAJOR 6 7 | #define HIP_VERSION_MINOR 0 8 | #define HIP_VERSION_PATCH 32830 9 | #define HIP_VERSION_GITHASH "d62f6a171" 10 | #define HIP_VERSION_BUILD_ID 0 11 | #define HIP_VERSION_BUILD_NAME "" 12 | #define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH) 13 | 14 | #define __HIP_HAS_GET_PCH 1 15 | 16 | #endif 17 | 18 | -------------------------------------------------------------------------------- /third_party/amd/backend/lib/cuda2gcn.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/cuda2gcn.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ockl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/ockl.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ocml.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/ocml.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/opencl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/opencl.bc -------------------------------------------------------------------------------- /third_party/amd/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonAMDGPUToLLVM) 2 | add_subdirectory(TritonAMDGPUTransforms) 3 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonAMDGPUToLLVM) 3 | add_public_tablegen_target(TritonAMDGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONAMDGPU_CONVERSION_PASSES_H 2 | #define TRITONAMDGPU_CONVERSION_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | 
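// This header follows the standard MLIR tablegen pass layout: GEN_PASS_DECL
// pulls pass declarations out of the generated Passes.h.inc (built from
// Passes.td), and GEN_PASS_REGISTRATION, included further down, emits the
// registration hooks.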
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "mlir/Pass/Pass.h" 7 | #include "mlir/Transforms/DialectConversion.h" 8 | 9 | #include 10 | 11 | namespace mlir { 12 | 13 | class ModuleOp; 14 | template class OperationPass; 15 | 16 | namespace triton { 17 | 18 | #define GEN_PASS_DECL 19 | #include "TritonAMDGPUToLLVM/Passes.h.inc" 20 | 21 | namespace AMD { 22 | std::unique_ptr> 23 | createDecomposeUnsupportedConversionsPass(); 24 | 25 | } // namespace AMD 26 | 27 | std::unique_ptr> createConvertTritonAMDGPUToLLVMPass(); 28 | std::unique_ptr> 29 | createConvertTritonAMDGPUToLLVMPass(int32_t computeCapability); 30 | #define GEN_PASS_REGISTRATION 31 | #include "TritonAMDGPUToLLVM/Passes.h.inc" 32 | 33 | } // namespace triton 34 | 35 | } // namespace mlir 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONAMDGPU_CONVERSION_PASSES 2 | #define TRITONAMDGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def DecomposeUnsupportedAMDConversions : Pass<"decompose-unsupported-amd-conversions", "mlir::ModuleOp"> { 7 | let summary = "Decompose conversions that are not supported by TritonGPU -> LLVM"; 8 | let constructor = "mlir::triton::AMD::createDecomposeUnsupportedConversionsPass()"; 9 | } 10 | 11 | def ConvertTritonAMDGPUToLLVM : Pass<"convert-triton-amdgpu-to-llvm", "mlir::ModuleOp"> { 12 | let summary = "Convert TritonGPU to LLVM"; 13 | let description = [{ 14 | 15 | }]; 16 | let constructor = "mlir::triton::createConvertTritonAMDGPUToLLVMPass()"; 17 | 18 | let dependentDialects = ["mlir::arith::ArithDialect", 19 | "mlir::math::MathDialect", 20 | "mlir::gpu::GPUDialect", 21 | "mlir::scf::SCFDialect", 22 | "mlir::LLVM::LLVMDialect", 23 | "mlir::tensor::TensorDialect", 24 | "mlir::triton::TritonDialect", 25 | "mlir::triton::gpu::TritonGPUDialect", 26 | "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect", 27 | "mlir::ROCDL::ROCDLDialect", 28 | "mlir::NVVM::NVVMDialect"]; 29 | 30 | let options = [ 31 | Option<"computeCapability", "compute-capability", 32 | "int32_t", /*default*/"80", 33 | "device compute capability">, 34 | ]; 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonAMDGPU) 3 | add_public_tablegen_target(TritonAMDGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONAMDGPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONAMDGPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" 6 | 7 | namespace mlir { 8 | 9 | std::unique_ptr 10 | createTritonAMDGPUPipelinePass(int numStages = 3, int numWarps = 4, 11 | int numCTAs = 1, int computeCapability = 80); 12 | 13 | std::unique_ptr createTritonAMDGPUStreamPipelinePass(); 14 | 15 | std::unique_ptr 16 | createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(), 17 | int matrixInstructionSize = 0, 18 | int kpack = 
1); 19 | 20 | std::unique_ptr createTritonAMDGPUPrefetchPass(); 21 | 22 | std::unique_ptr createTritonAMDGPUCanonicalizeLoopsPass(); 23 | 24 | std::unique_ptr createTritonAMDGPUCoalescePass(); 25 | 26 | std::unique_ptr createTritonAMDGPUReorderInstructionsPass(); 27 | 28 | std::unique_ptr createTritonAMDGPURemoveLayoutConversionsPass(); 29 | 30 | std::unique_ptr createTritonAMDGPUVerifier(); 31 | 32 | std::unique_ptr createTritonAMDGPUOptimizeDotOperandsPass(); 33 | 34 | std::unique_ptr createTritonAMDGPUOptimizeEpiloguePass(); 35 | 36 | /// Generate the code for registering passes. 37 | #define GEN_PASS_REGISTRATION 38 | #include "TritonAMDGPUTransforms/Passes.h.inc" 39 | 40 | } // namespace mlir 41 | #endif 42 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect. 4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /third_party/amd/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonAMDGPUToLLVM) 2 | add_subdirectory(TritonAMDGPUTransforms) 3 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp 3 | ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp 4 | ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp 5 | ConvertLayoutOpToLLVM.cpp 6 | DotOpToLLVM/MFMA.cpp 7 | DotOpToLLVM/WMMA.cpp 8 | DotOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | LoadStoreOpToLLVM.cpp 11 | GCNAsmFormat.cpp 12 | TritonGPUToLLVM.cpp 13 | Utility.cpp 14 | TargetInfo.cpp 15 | DecomposeUnsupportedConversions.cpp 16 | SPMDOpToLLVM.cpp 17 | 18 | DEPENDS 19 | TritonAMDGPUConversionPassIncGen 20 | 21 | LINK_LIBS PUBLIC 22 | TritonGPUToLLVM 23 | ) 24 | 25 | target_compile_definitions(TritonAMDGPUToLLVM PUBLIC USE_ROCM) 26 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/SPMDOpToLLVM.cpp: 
-------------------------------------------------------------------------------- 1 | #include "PatternTritonGPUOpToLLVM.h" 2 | #include "Utility.h" 3 | 4 | using namespace mlir; 5 | 6 | namespace { 7 | 8 | struct GetNumProgramsOpConversion 9 | : public ConvertOpToLLVMPattern<triton::GetNumProgramsOp> { 10 | using ConvertOpToLLVMPattern< 11 | triton::GetNumProgramsOp>::ConvertOpToLLVMPattern; 12 | 13 | LogicalResult 14 | matchAndRewrite(triton::GetNumProgramsOp op, OpAdaptor adaptor, 15 | ConversionPatternRewriter &rewriter) const override { 16 | static constexpr mlir::gpu::Dimension dims[] = {mlir::gpu::Dimension::x, 17 | mlir::gpu::Dimension::y, 18 | mlir::gpu::Dimension::z}; 19 | Location loc = op->getLoc(); 20 | assert(op.getAxisAsInt() < 3); 21 | Value blockId = 22 | rewriter.create<::mlir::gpu::GridDimOp>(loc, dims[op.getAxisAsInt()]); 23 | rewriter.replaceOpWithNewOp<arith::TruncIOp>(op, i32_ty, blockId); 24 | return success(); 25 | } 26 | }; 27 | 28 | } // namespace 29 | 30 | void mlir::triton::AMD::populateSPMDOpToLLVMPattern( 31 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 32 | PatternBenefit benefit) { 33 | patterns.add<GetNumProgramsOpConversion>(typeConverter, benefit); 34 | } 35 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONAMDGPU_TO_LLVM_UTILITY_H 2 | #define TRITON_CONVERSION_TRITONAMDGPU_TO_LLVM_UTILITY_H 3 | 4 | #include "TritonAMDGPUToLLVM/GCNAsmFormat.h" 5 | 6 | #include "mlir/Conversion/LLVMCommon/Pattern.h" 7 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 8 | #include "triton/Analysis/Utility.h" 9 | #include "triton/Conversion/MLIRTypes.h" 10 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 11 | namespace mlir::LLVM::AMD { 12 | 13 | Value shuffleXor(Location loc, ConversionPatternRewriter &rewriter, Value val, 14 | int i); 15 | Value shuffleUp(Location loc, ConversionPatternRewriter &rewriter, Value val, 16 | int i); 17 | Value shuffleIdx(Location loc, ConversionPatternRewriter &rewriter, Value val, 18 | int i); 19 | Value shuffleIdx(Location loc, ConversionPatternRewriter &rewriter, Value val, 20 | Value i); 21 | 22 | Value llGetPid(Location loc, ConversionPatternRewriter &rewriter, 23 | ModuleOp moduleOp, int axis); 24 | } // namespace mlir::LLVM::AMD 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUTransforms 2 | AccelerateAMDMatmul.cpp 3 | OptimizeEpilogue.cpp 4 | RemoveLayoutConversions.cpp 5 | ReorderInstructions.cpp 6 | StreamPipeline.cpp 7 | MfmaGroup.cpp 8 | 9 | DEPENDS 10 | TritonAMDGPUTransformsIncGen 11 | ) 12 | 13 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include) 14 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include) 15 | target_compile_definitions(TritonAMDGPUTransforms PUBLIC USE_ROCM) 16 | -------------------------------------------------------------------------------- /third_party/cpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(TRITON_BUILD_PYTHON_MODULE) 2 | add_triton_plugin(TritonCPU ${CMAKE_CURRENT_SOURCE_DIR}/triton_cpu.cc LINK_LIBS TritonCPUToLLVM) 3 | endif() 4 |
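Each backend is built as a Triton plugin: its CMakeLists registers a shared target via add_triton_plugin, and the plugin exposes a pybind11 entry point that hands C++ passes to the Python frontend. The CPU backend's actual entry point follows in the next file; as a general sketch of that shape (the module and pass names here are illustrative, not a real API):

#include <memory>
#include <pybind11/pybind11.h>
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"

namespace py = pybind11;

// Hypothetical pass-creation hook standing in for a backend's real one.
std::unique_ptr<mlir::Pass> createMyBackendLoweringPass();

void init_triton_mybackend(py::module &&m) {
  auto passes = m.def_submodule("passes");
  // Expose the pass so Python-side compilation pipelines can schedule it.
  passes.def("add_lowering", [](mlir::PassManager &pm) {
    pm.addPass(createMyBackendLoweringPass());
  });
}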
-------------------------------------------------------------------------------- /third_party/cpu/triton_cpu.cc: -------------------------------------------------------------------------------- 1 | #include "mlir/Pass/Pass.h" 2 | #include "mlir/Pass/PassManager.h" 3 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h" 4 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 5 | #include "llvm/IR/Constants.h" 6 | #include "llvm/Support/TargetSelect.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #include <pybind11/pybind11.h> 12 | 13 | namespace py = pybind11; 14 | 15 | void init_triton_cpu_passes_ttcpuir(py::module &&m) { 16 | using namespace mlir::triton; 17 | m.def("add_to_llvmir", [](mlir::PassManager &pm) { 18 | pm.addPass(mlir::triton::createConvertTritonCPUToLLVMPass()); 19 | }); 20 | } 21 | 22 | void init_triton_cpu(py::module &&m) { 23 | auto passes = m.def_submodule("passes"); 24 | init_triton_cpu_passes_ttcpuir(passes.def_submodule("ttcpuir")); 25 | 26 | m.def("load_dialects", [](mlir::MLIRContext &context) { 27 | mlir::DialectRegistry registry; 28 | registry.insert<mlir::triton::cpu::TritonCPUDialect>(); 29 | context.appendDialectRegistry(registry); 30 | context.loadAllAvailableDialects(); 31 | }); 32 | } 33 | -------------------------------------------------------------------------------- /third_party/nvidia/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonNVIDIA ${CMAKE_CURRENT_SOURCE_DIR}/triton_nvidia.cc LINK_LIBS TritonNVIDIAGPUToLLVM NVGPUToLLVM) 7 | endif() 8 | -------------------------------------------------------------------------------- /third_party/nvidia/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/nvidia/backend/__init__.py -------------------------------------------------------------------------------- /third_party/nvidia/backend/lib/libdevice.10.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/nvidia/backend/lib/libdevice.10.bc -------------------------------------------------------------------------------- /third_party/nvidia/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonNVIDIAGPUToLLVM) 2 | add_subdirectory(NVGPUToLLVM) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name NVGPUToLLVM) 3 | add_public_tablegen_target(NVGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 2 | #define TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 3 | 4 | #include <memory> 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template <typename T> class OperationPass; 10 | 11 | 
namespace triton { 12 | 13 | std::unique_ptr<OperationPass<ModuleOp>> createConvertNVGPUToLLVMPass(); 14 | 15 | } // namespace triton 16 | 17 | } // namespace mlir 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES_H 2 | #define NVGPU_CONVERSION_PASSES_H 3 | 4 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h" 7 | 8 | namespace mlir { 9 | namespace triton { 10 | 11 | #define GEN_PASS_REGISTRATION 12 | #include "nvidia/include/NVGPUToLLVM/Passes.h.inc" 13 | 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES 2 | #define NVGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | 7 | def ConvertNVGPUToLLVM : Pass<"convert-nv-gpu-to-llvm", "mlir::ModuleOp"> { 8 | let summary = "Convert NVGPU to LLVM"; 9 | let description = [{ 10 | 11 | }]; 12 | let constructor = "mlir::triton::createConvertNVGPUToLLVMPass()"; 13 | 14 | let dependentDialects = ["mlir::arith::ArithDialect", 15 | "mlir::LLVM::LLVMDialect", 16 | "mlir::NVVM::NVVMDialect", 17 | "mlir::triton::nvgpu::NVGPUDialect"]; 18 | } 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonNVIDIAGPUToLLVM) 3 | add_public_tablegen_target(TritonNVIDIAGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | 8 | #include <memory> 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template <typename T> class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 19 | 20 | namespace NVIDIA { 21 | std::unique_ptr<OperationPass<ModuleOp>> 22 | createDecomposeUnsupportedConversionsPass(); 23 | 24 | } // namespace NVIDIA 25 | 26 | std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonGPUToLLVMPass(); 27 | std::unique_ptr<OperationPass<ModuleOp>> 28 | createConvertTritonGPUToLLVMPass(int32_t computeCapability); 29 | 30 | #define GEN_PASS_REGISTRATION 31 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 32 | 33 | } // namespace triton 34 | 35 | } // namespace mlir 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_PASSES 2 | #define TRITONGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | 
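// Each def below records the pass's command-line name and anchor op
// ("mlir::ModuleOp"), the C++ constructor to invoke, the dialects it may
// create, and any options (e.g. compute-capability, defaulting to 80).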
def DecomposeUnsupportedNVIDIAConversions : Pass<"decompose-unsupported-nvidia-conversions", "mlir::ModuleOp"> { 7 | let summary = "Decompose conversions that are not supported by TritonGPU -> LLVM"; 8 | let constructor = "mlir::triton::NVIDIA::createDecomposeUnsupportedConversionsPass()"; 9 | } 10 | 11 | def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"> { 12 | let summary = "Convert TritonGPU to LLVM"; 13 | let description = [{ 14 | 15 | }]; 16 | let constructor = "mlir::triton::createConvertTritonGPUToLLVMPass()"; 17 | 18 | let dependentDialects = ["mlir::arith::ArithDialect", 19 | "mlir::math::MathDialect", 20 | "mlir::gpu::GPUDialect", 21 | "mlir::scf::SCFDialect", 22 | "mlir::LLVM::LLVMDialect", 23 | "mlir::tensor::TensorDialect", 24 | "mlir::triton::TritonDialect", 25 | "mlir::triton::gpu::TritonGPUDialect", 26 | "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect", 27 | "mlir::NVVM::NVVMDialect"]; 28 | 29 | let options = [ 30 | Option<"computeCapability", "compute-capability", 31 | "int32_t", /*default*/"80", 32 | "device compute capability">, 33 | ]; 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonNVIDIAGPUToLLVM) 2 | add_subdirectory(NVGPUToLLVM) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUToLLVM 2 | NVGPUToLLVMPass.cpp 3 | 4 | DEPENDS 5 | NVGPUConversionPassIncGen 6 | ) 7 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNVIDIAGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandMMAv1.cpp 3 | ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp 4 | ConvertLayoutOpToLLVM.cpp 5 | DotOpToLLVM/MMAv1.cpp 6 | DotOpToLLVM/MMAv2.cpp 7 | DotOpToLLVM/WGMMA.cpp 8 | DotOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | LoadStoreOpToLLVM.cpp 11 | BarrierOpToLLVM.cpp 12 | TritonGPUToLLVM.cpp 13 | DecomposeUnsupportedConversions.cpp 14 | SPMDOpToLLVM.cpp 15 | TensorPtrOpsToLLVM.cpp 16 | ClusterOpsToLLVM.cpp 17 | PTXAsmFormat.cpp 18 | Utility.cpp 19 | TargetInfo.cpp 20 | 21 | DEPENDS 22 | TritonNVIDIAGPUConversionPassIncGen 23 | NVGPUAttrDefsIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | TritonGPUToLLVM 27 | ) 28 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp: -------------------------------------------------------------------------------- 1 | #include "TritonNVIDIAGPUToLLVM/Passes.h" 2 | #include "mlir/Pass/Pass.h" 3 | #include "triton/Analysis/Utility.h" 4 | #include "triton/Conversion/TritonGPUToLLVM/Patterns.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 7 | 8 | using namespace mlir; 9 | 10 | namespace mlir { 11 | namespace triton { 12 | #define GEN_PASS_DEF_DECOMPOSEUNSUPPORTEDNVIDIACONVERSIONS 13 | #include "TritonNVIDIAGPUToLLVM/Passes.h.inc" 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | namespace { 18 | struct DecomposeUnsupportedConversions 19 | : public 
mlir::triton::impl::DecomposeUnsupportedNVIDIAConversionsBase< 20 | DecomposeUnsupportedConversions> { 21 | void runOnOperation() override { 22 | ModuleOp mod = getOperation(); 23 | triton::gpu::decomposeSplatOpToSharedLayoutConversion(mod); 24 | triton::gpu::decomposeTensorCoreToDotLayoutConversion< 25 | triton::gpu::NvidiaMmaEncodingAttr>(mod, isMmaToDotShortcut); 26 | triton::gpu::decomposeBlockedToDotLayoutConversion(mod); 27 | } 28 | }; 29 | } // namespace 30 | 31 | namespace mlir::triton::NVIDIA { 32 | 33 | std::unique_ptr<OperationPass<ModuleOp>> 34 | createDecomposeUnsupportedConversionsPass() { 35 | return std::make_unique<DecomposeUnsupportedConversions>(); 36 | } 37 | 38 | } // namespace mlir::triton::NVIDIA 39 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "PatternTritonGPUOpToLLVM.h" 2 | #include "Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetNumProgramsOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetNumProgramsOp> { 11 | using ConvertOpToLLVMPattern< 12 | triton::GetNumProgramsOp>::ConvertOpToLLVMPattern; 13 | 14 | LogicalResult 15 | matchAndRewrite(triton::GetNumProgramsOp op, OpAdaptor adaptor, 16 | ConversionPatternRewriter &rewriter) const override { 17 | // It is not easy to get the compute capability here, so we use numCTAs to 18 | // decide the semantic of GetNumProgramsOp. If numCTAs = 1, then 19 | // GetNumProgramsOp is converted to "%nctaid", otherwise it is converted to 20 | // "%nclusterid". 21 | auto moduleOp = op->getParentOfType<ModuleOp>(); 22 | assert(moduleOp && "Parent ModuleOp not found for GetNumProgramsOp"); 23 | int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(moduleOp); 24 | 25 | Location loc = op->getLoc(); 26 | assert(op.getAxisAsInt() < 3); 27 | std::string sreg = numCTAs == 1 ? "%nctaid." 
: "%nclusterid."; 28 | sreg.append(1, 'x' + op.getAxisAsInt()); // 0 -> 'x', 1 -> 'y', 2 -> 'z' 29 | 30 | Value numPrograms = LLVM::NVIDIA::getSRegValue(rewriter, loc, sreg); 31 | rewriter.replaceOp(op, numPrograms); 32 | return success(); 33 | } 34 | }; 35 | 36 | } // namespace 37 | 38 | void mlir::triton::NVIDIA::populateSPMDOpToLLVMPattern( 39 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 40 | PatternBenefit benefit) { 41 | patterns.add(typeConverter, benefit); 42 | } 43 | -------------------------------------------------------------------------------- /third_party/proton/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | proton.egg-info 3 | proton/_C/libproton.so 4 | 5 | *.hatchet 6 | -------------------------------------------------------------------------------- /third_party/proton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(Proton CXX) 2 | 3 | set(PROTON_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/csrc) 4 | set(PROTON_EXTERN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/extern) 5 | file(GLOB_RECURSE PROTON_SRC ${PROTON_SRC_DIR}/lib/*.cpp) 6 | add_library(proton SHARED ${PROTON_SRC} ${PROTON_SRC_DIR}/${PROJECT_NAME}.cpp) 7 | 8 | find_package(CUDAToolkit) 9 | 10 | if(${CUDAToolkit_VERSION_MAJOR} VERSION_LESS 11) 11 | message(FATAL_ERROR "CUDA 11 or higher is required") 12 | endif() 13 | 14 | # Try to find CUPTI from the default include dir, if not found, search the 15 | # EXTRAS dir 16 | find_path( 17 | CUPTI_INCLUDE_DIR 18 | NAMES cupti.h 19 | HINTS ${CUDAToolkit_INCLUDE_DIRS} 20 | PATH_SUFFIXES include) 21 | 22 | if(NOT CUPTI_INCLUDE_DIR) 23 | find_path( 24 | CUPTI_INCLUDE_DIR 25 | NAMES cupti.h 26 | HINTS ${CUDAToolkit_ROOT}/extras/CUPTI 27 | PATH_SUFFIXES include) 28 | endif() 29 | 30 | # Check if CUPTI was found 31 | if(NOT CUPTI_INCLUDE_DIR) 32 | message(FATAL_ERROR "CUPTI include directory not found: CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") 33 | else() 34 | message(STATUS "Found CUPTI include directory: ${CUPTI_INCLUDE_DIR}") 35 | endif() 36 | 37 | include_directories(${PYBIND11_INCLUDE_DIR}) 38 | include_directories(${JSON_INCLUDE_DIR}) 39 | include_directories(${PROTON_SRC_DIR}/include) 40 | include_directories(${PROTON_EXTERN_DIR}) 41 | 42 | if(PYTHON_INCLUDE_DIRS) 43 | include_directories(${PYTHON_INCLUDE_DIRS}) 44 | else() 45 | find_package(Python3 REQUIRED Interpreter Development) 46 | include_directories(${Python3_INCLUDE_DIRS}) 47 | endif() 48 | 49 | include_directories(${CUDAToolkit_INCLUDE_DIRS}) 50 | include_directories(${CUPTI_INCLUDE_DIR}) 51 | target_link_libraries(proton ${Python_LIBRARIES}) 52 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Python.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_PYTHON_H_ 2 | #define PROTON_CONTEXT_PYTHON_H_ 3 | 4 | #include "Context.h" 5 | 6 | namespace proton { 7 | 8 | /// Unwind the Python stack and early return a list of contexts. 
9 | class PythonContextSource : public ContextSource { 10 | public: 11 | std::vector<Context> getContexts() override; 12 | }; 13 | 14 | } // namespace proton 15 | 16 | #endif // PROTON_CONTEXT_PYTHON_H_ 17 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Shadow.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_SHADOW_H_ 2 | #define PROTON_CONTEXT_SHADOW_H_ 3 | 4 | #include "Context.h" 5 | #include <vector> 6 | 7 | namespace proton { 8 | 9 | /// Incrementally build a list of contexts by shadowing the stack with 10 | /// user-defined scopes. 11 | class ShadowContextSource : public ContextSource, public ScopeInterface { 12 | public: 13 | ShadowContextSource() = default; 14 | 15 | std::vector<Context> getContexts() override { return contextStack; } 16 | 17 | void enterScope(const Scope &scope) override; 18 | 19 | void exitScope(const Scope &scope) override; 20 | 21 | private: 22 | std::vector<Context> contextStack; 23 | }; 24 | 25 | } // namespace proton 26 | 27 | #endif // PROTON_CONTEXT_SHADOW_H_ 28 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Data/Data.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DATA_DATA_H_ 2 | #define PROTON_DATA_DATA_H_ 3 | 4 | #include "Context/Context.h" 5 | #include "Metric.h" 6 | #include <map> 7 | #include <memory> 8 | #include <shared_mutex> 9 | #include <string> 10 | 11 | namespace proton { 12 | 13 | enum class OutputFormat { Hatchet, Count }; 14 | 15 | class Data : public InternalOpInterface { 16 | public: 17 | Data(const std::string &path, ContextSource *contextSource = nullptr) 18 | : path(path), contextSource(contextSource) {} 19 | virtual ~Data() = default; 20 | 21 | /// Add a single metric to the data. 22 | /// [MT] The implementation must be thread-safe. 23 | virtual void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) = 0; 24 | 25 | /// Add multiple metrics to the data. 26 | /// [MT] The implementation must be thread-safe. 27 | virtual void 28 | addMetrics(size_t scopeId, 29 | const std::map &metrics) = 0; 30 | 31 | /// Dump the data to the given output format. 32 | /// [MT] Thread-safe. 33 | void dump(OutputFormat outputFormat); 34 | 35 | protected: 36 | /// The actual implementation of the dump operation. 37 | /// [MT] Thread-safe. 
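/// Called by dump() with the shared mutex already held (see Data.cpp), so
/// overrides need not re-acquire it.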
38 | virtual void doDump(std::ostream &os, OutputFormat outputFormat) const = 0; 39 | 40 | protected: 41 | mutable std::shared_mutex mutex; 42 | 43 | const std::string path{}; 44 | ContextSource *contextSource{}; 45 | }; 46 | 47 | OutputFormat parseOutputFormat(const std::string &outputFormat); 48 | 49 | const std::string outputFormatToString(OutputFormat outputFormat); 50 | 51 | } // namespace proton 52 | 53 | #endif // PROTON_DATA_DATA_H_ 54 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Data/TraceData.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DATA_TRACE_DATA_H_ 2 | #define PROTON_DATA_TRACE_DATA_H_ 3 | 4 | #include "Data.h" 5 | 6 | namespace proton { 7 | 8 | class TraceData : public Data { 9 | public: 10 | using Data::Data; 11 | 12 | void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) override; 13 | 14 | void 15 | addMetrics(size_t scopeId, 16 | const std::map &metrics) override; 17 | 18 | protected: 19 | void startOp(const Scope &scope) override final; 20 | 21 | void stopOp(const Scope &scope) override final; 22 | 23 | private: 24 | void doDump(std::ostream &os, OutputFormat outputFormat) const override; 25 | }; 26 | 27 | } // namespace proton 28 | 29 | #endif // PROTON_DATA_TRACE_DATA_H_ 30 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/GPU/Cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_GPU_CUDA_H_ 2 | #define PROTON_DRIVER_GPU_CUDA_H_ 3 | 4 | #include <cuda.h> 5 | 6 | namespace proton { 7 | 8 | namespace cuda { 9 | 10 | template <bool CheckSuccess> CUresult init(int flags); 11 | 12 | template <bool CheckSuccess> CUresult ctxSynchronize(); 13 | 14 | template <bool CheckSuccess> CUresult ctxGetCurrent(CUcontext *pctx); 15 | 16 | template <bool CheckSuccess> 17 | CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); 18 | 19 | template <bool CheckSuccess> CUresult deviceGet(CUdevice *device, int ordinal); 20 | 21 | } // namespace cuda 22 | 23 | } // namespace proton 24 | 25 | #endif // PROTON_DRIVER_GPU_CUDA_H_ 26 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Proton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_H_ 2 | #define PROTON_H_ 3 | 4 | #include "Context/Context.h" 5 | #include "Data/Data.h" 6 | #include "Data/Metric.h" 7 | #include "Session/Session.h" 8 | 9 | #endif // PROTON_H_ 10 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Errors.h: -------------------------------------------------------------------------------- 1 | #include <stdexcept> 2 | 3 | namespace proton { 4 | 5 | class NotImplemented : public std::logic_error { 6 | public: 7 | NotImplemented() : std::logic_error("Not yet implemented"){}; 8 | }; 9 | 10 | } // namespace proton 11 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Singleton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_SINGLETON_H_ 2 | #define PROTON_UTILITY_SINGLETON_H_ 3 | 4 | namespace proton { 5 | 6 | template <typename T> class Singleton { 7 | public: 8 | Singleton(const Singleton &) = delete; 9 | Singleton &operator=(const Singleton &) = delete; 10 | 11 | static T &instance() { 12 | static T _; 13 | return _; 14 | } 15 | 16 | protected: 17 | 
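// CRTP-style usage: a class opts in by deriving from Singleton<Itself>, e.g.
//   class SessionManager : public Singleton<SessionManager> { ... };
// and clients call SessionManager::instance(). The function-local static in
// instance() makes initialization thread-safe since C++11.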
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Data/TraceData.h:
--------------------------------------------------------------------------------
#ifndef PROTON_DATA_TRACE_DATA_H_
#define PROTON_DATA_TRACE_DATA_H_

#include "Data.h"

namespace proton {

class TraceData : public Data {
public:
  using Data::Data;

  void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) override;

  void
  addMetrics(size_t scopeId,
             const std::map<std::string, MetricValueType> &metrics) override;

protected:
  void startOp(const Scope &scope) override final;

  void stopOp(const Scope &scope) override final;

private:
  void doDump(std::ostream &os, OutputFormat outputFormat) const override;
};

} // namespace proton

#endif // PROTON_DATA_TRACE_DATA_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Driver/GPU/Cuda.h:
--------------------------------------------------------------------------------
#ifndef PROTON_DRIVER_GPU_CUDA_H_
#define PROTON_DRIVER_GPU_CUDA_H_

#include <cuda.h>

namespace proton {

namespace cuda {

template <bool CheckSuccess> CUresult init(int flags);

template <bool CheckSuccess> CUresult ctxSynchronize();

template <bool CheckSuccess> CUresult ctxGetCurrent(CUcontext *pctx);

template <bool CheckSuccess>
CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);

template <bool CheckSuccess> CUresult deviceGet(CUdevice *device, int ordinal);

} // namespace cuda

} // namespace proton

#endif // PROTON_DRIVER_GPU_CUDA_H_
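These stubs resolve against a lazily loaded libcuda.so through the dispatch
machinery in Driver/Dispatch.h (see the DEFINE_DISPATCH expansions in Cuda.cpp
further down). A minimal call-site sketch; the boolean template parameter
controls whether the wrapper checks the returned CUresult, and its name,
CheckSuccess, is an assumption here:

#include "Driver/GPU/Cuda.h"

void probeDevice() {
  proton::cuda::init<true>(/*flags=*/0); // <true>: fail loudly on error
  CUdevice device;
  proton::cuda::deviceGet<true>(&device, /*ordinal=*/0);
  int ccMajor = 0;
  proton::cuda::deviceGetAttribute<true>(
      &ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
}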
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Proton.h:
--------------------------------------------------------------------------------
#ifndef PROTON_H_
#define PROTON_H_

#include "Context/Context.h"
#include "Data/Data.h"
#include "Data/Metric.h"
#include "Session/Session.h"

#endif // PROTON_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Errors.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_ERRORS_H_
#define PROTON_UTILITY_ERRORS_H_

#include <stdexcept>

namespace proton {

class NotImplemented : public std::logic_error {
public:
  NotImplemented() : std::logic_error("Not yet implemented") {}
};

} // namespace proton

#endif // PROTON_UTILITY_ERRORS_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Singleton.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_SINGLETON_H_
#define PROTON_UTILITY_SINGLETON_H_

namespace proton {

template <typename T> class Singleton {
public:
  Singleton(const Singleton &) = delete;
  Singleton &operator=(const Singleton &) = delete;

  static T &instance() {
    static T _;
    return _;
  }

protected:
  Singleton() = default;
};

} // namespace proton

#endif // PROTON_UTILITY_SINGLETON_H_
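The class above is the usual Meyers singleton behind a templated base: copying
is deleted, construction is left to subclasses, and instance() relies on the
thread-safe initialization of function-local statics (C++11 and later). A
hypothetical subclass to show the intended use:

#include "Utility/Singleton.h"

class KernelRegistry : public proton::Singleton<KernelRegistry> {
  // Grant the base's instance() access to the private constructor.
  friend class proton::Singleton<KernelRegistry>;

public:
  void setCount(int n) { count = n; }
  int getCount() const { return count; }

private:
  KernelRegistry() = default;
  int count = 0;
};

// Every call site shares one lazily created instance:
//   KernelRegistry::instance().setCount(42);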
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/String.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_STRING_H_
#define PROTON_UTILITY_STRING_H_

#include <cctype>
#include <string>

namespace proton {

inline std::string toLower(const std::string &str) {
  std::string lower;
  for (auto c : str) {
    lower += tolower(c);
  }
  return lower;
}

} // namespace proton

#endif // PROTON_UTILITY_STRING_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Traits.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_TRAITS_H_
#define PROTON_UTILITY_TRAITS_H_

#include <tuple>
#include <type_traits>

namespace proton {
template <typename T, typename... Ts>
struct is_one_of : std::disjunction<std::is_same<T, Ts>...> {};
} // namespace proton

#endif
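Both helpers are one-liners to use: is_one_of resolves entirely at compile
time, and toLower is what lets parseOutputFormat in Data.cpp below accept
format names case-insensitively.

#include "Utility/String.h"
#include "Utility/Traits.h"

static_assert(proton::is_one_of<int, float, int, double>::value,
              "int appears in the list");
static_assert(!proton::is_one_of<char, float, int, double>::value,
              "char does not appear in the list");

// proton::toLower("Hatchet") == "hatchet"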
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Context.cpp:
--------------------------------------------------------------------------------
#include "Context/Context.h"

namespace proton {

std::atomic<size_t> Scope::scopeIdCounter{1};

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Python.cpp:
--------------------------------------------------------------------------------
#include "Context/Python.h"
#include "pybind11/pybind11.h"
#include <Python.h>

namespace proton {

namespace {

std::string UnpackPyobject(PyObject *pyObject) {
  if (PyBytes_Check(pyObject)) {
    size_t size = PyBytes_GET_SIZE(pyObject);
    return std::string(PyBytes_AS_STRING(pyObject), size);
  }
  if (PyUnicode_Check(pyObject)) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    Py_ssize_t size;
    const char *data = PyUnicode_AsUTF8AndSize(pyObject, &size);
    if (!data) {
      return "";
    }
    return std::string(data, (size_t)size);
  }
  return "";
}

} // namespace

std::vector<Context> PythonContextSource::getContexts() {
  pybind11::gil_scoped_acquire gil;

  PyFrameObject *frame = PyEval_GetFrame();
  Py_XINCREF(frame);

  std::vector<Context> contexts;
  while (frame != nullptr) {
    PyCodeObject *f_code = PyFrame_GetCode(frame);
    size_t lineno = PyFrame_GetLineNumber(frame);
    std::string file = UnpackPyobject(f_code->co_filename);
    std::string function = UnpackPyobject(f_code->co_name);
    // PyFrame_GetCode returns a strong reference; release it once read.
    Py_DECREF(f_code);
    auto pythonFrame = file + ":" + function + "@" + std::to_string(lineno);
    contexts.push_back(Context(pythonFrame));
    auto newFrame = PyFrame_GetBack(frame);
    Py_DECREF(frame);
    frame = newFrame;
  }
  return contexts;
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Shadow.cpp:
--------------------------------------------------------------------------------
#include "Context/Shadow.h"

#include <stdexcept>

namespace proton {

void ShadowContextSource::enterScope(const Scope &scope) {
  contextStack.push_back(scope);
}

void ShadowContextSource::exitScope(const Scope &scope) {
  if (contextStack.empty()) {
    throw std::runtime_error("Context stack is empty");
  }
  if (contextStack.back() != scope) {
    throw std::runtime_error("Context stack is not balanced");
  }
  contextStack.pop_back();
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/Data.cpp:
--------------------------------------------------------------------------------
#include "Data/Data.h"
#include "Utility/String.h"

#include <fstream>
#include <iostream>
#include <memory>

#include <stdexcept>

namespace proton {

void Data::dump(OutputFormat outputFormat) {
  std::shared_lock lock(mutex);

  std::unique_ptr<std::ostream> out;
  if (path.empty() || path == "-") {
    out.reset(new std::ostream(std::cout.rdbuf())); // Redirecting to cout
  } else {
    out.reset(new std::ofstream(
        path + "." +
        outputFormatToString(outputFormat))); // Opening a file for output
  }
  doDump(*out, outputFormat);
}

OutputFormat parseOutputFormat(const std::string &outputFormat) {
  if (toLower(outputFormat) == "hatchet") {
    return OutputFormat::Hatchet;
  }
  throw std::runtime_error("Unknown output format: " + outputFormat);
}

const std::string outputFormatToString(OutputFormat outputFormat) {
  if (outputFormat == OutputFormat::Hatchet) {
    return "hatchet";
  }
  throw std::runtime_error("Unknown output format: " +
                           std::to_string(static_cast<int>(outputFormat)));
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/TraceData.cpp:
--------------------------------------------------------------------------------
#include "Data/TraceData.h"
#include "Utility/Errors.h"

#include <stdexcept>

namespace proton {

void TraceData::startOp(const Scope &scope) { throw NotImplemented(); }

void TraceData::stopOp(const Scope &scope) { throw NotImplemented(); }

void TraceData::addMetric(size_t scopeId, std::shared_ptr<Metric> metric) {
  throw NotImplemented();
}

void TraceData::addMetrics(
    size_t scopeId, const std::map<std::string, MetricValueType> &metrics) {
  throw NotImplemented();
}

void TraceData::doDump(std::ostream &os, OutputFormat outputFormat) const {
  throw NotImplemented();
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Driver/GPU/Cuda.cpp:
--------------------------------------------------------------------------------
#include "Driver/GPU/Cuda.h"
#include "Driver/Dispatch.h"

namespace proton {

namespace cuda {

struct ExternLibCuda : public ExternLibBase {
  using RetType = CUresult;
  static constexpr const char *name = "libcuda.so";
  static constexpr RetType success = CUDA_SUCCESS;
  static void *lib;
};

void *ExternLibCuda::lib = nullptr;

DEFINE_DISPATCH(ExternLibCuda, init, cuInit, int)

DEFINE_DISPATCH(ExternLibCuda, ctxSynchronize, cuCtxSynchronize)

DEFINE_DISPATCH(ExternLibCuda, ctxGetCurrent, cuCtxGetCurrent, CUcontext *)

DEFINE_DISPATCH(ExternLibCuda, deviceGet, cuDeviceGet, CUdevice *, int)

DEFINE_DISPATCH(ExternLibCuda, deviceGetAttribute, cuDeviceGetAttribute, int *,
                CUdevice_attribute, CUdevice)

} // namespace cuda

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/proton/_C/include:
--------------------------------------------------------------------------------
../../csrc/include/
--------------------------------------------------------------------------------
/third_party/proton/proton/__init__.py:
--------------------------------------------------------------------------------
# flake8: noqa
from .scope import scope, enter_scope, exit_scope
from .profile import (
    start,
    activate,
    deactivate,
    finalize,
    profile,
    DEFAULT_PROFILE_NAME,
)
--------------------------------------------------------------------------------
/third_party/proton/proton/flags.py:
--------------------------------------------------------------------------------
"""
This file contains the global flags used in the proton package.
"""

# Whether to enable profiling. Default is False.
profiling_on = False


def set_profiling_on():
    global profiling_on
    profiling_on = True


def set_profiling_off():
    global profiling_on
    profiling_on = False


def get_profiling_on():
    return profiling_on
--------------------------------------------------------------------------------
/third_party/proton/proton/hook.py:
--------------------------------------------------------------------------------
from .scope import enter_scope, exit_scope
from triton.compiler import CompiledKernel, LazyDict

COMPUTE_METADATA_SCOPE_NAME = "__proton_launch_metadata"


class TritonHook:
    metrics = ["flops8", "flops16", "flops32", "flops64", "bytes"]

    @staticmethod
    def enter(metadata: LazyDict) -> None:
        enter_scope(COMPUTE_METADATA_SCOPE_NAME)
        metadata = metadata.get()
        exit_scope()
        fn_metrics = {k: metadata[k] for k in TritonHook.metrics if k in metadata}
        enter_scope(metadata["name"], triton_op=True, metrics=fn_metrics)

    @staticmethod
    def exit(metadata: LazyDict) -> None:
        exit_scope(triton_op=True)


def register_triton_hook() -> None:
    if CompiledKernel.launch_enter_hook is None:
        CompiledKernel.launch_enter_hook = TritonHook.enter
        CompiledKernel.launch_exit_hook = TritonHook.exit


def unregister_triton_hook() -> None:
    if CompiledKernel.launch_enter_hook == TritonHook.enter:
        CompiledKernel.launch_enter_hook = None
        CompiledKernel.launch_exit_hook = None
--------------------------------------------------------------------------------
/third_party/proton/test/test_lib.py:
--------------------------------------------------------------------------------
import triton._C.libproton.proton as libproton
import tempfile
import pathlib


def test_record():
    id0 = libproton.record_scope()
    id1 = libproton.record_scope()
    assert id1 == id0 + 1


def test_scope():
    id0 = libproton.record_scope()
    libproton.enter_scope(id0, "zero")
    id1 = libproton.record_scope()
    libproton.enter_scope(id1, "one")
    libproton.exit_scope(id1, "one")
    libproton.exit_scope(id0, "zero")


def test_op():
    id0 = libproton.record_scope()
    libproton.enter_op(id0, "zero")
    libproton.exit_op(id0, "zero")


def test_session():
    with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f:
        session_id = libproton.start(f.name.split(".")[0], "cupti", "shadow", "tree")
        libproton.deactivate(session_id)
        libproton.activate(session_id)
        libproton.finalize(session_id, "hatchet")
        libproton.finalize_all("hatchet")
        assert pathlib.Path(f.name).exists()


def test_add_metrics():
    with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f:
        libproton.start(f.name.split(".")[0], "cupti", "shadow", "tree")
        id1 = libproton.record_scope()
        libproton.enter_scope(id1, "one")
        libproton.add_metrics(id1, {"a": 1.0, "b": 2.0})
        libproton.exit_scope(id1, "one")
        libproton.finalize_all("hatchet")
        assert pathlib.Path(f.name).exists()
--------------------------------------------------------------------------------
/third_party/proton/test/test_viewer.py:
--------------------------------------------------------------------------------
import triton.profiler as proton
import subprocess


def test_help():
    # Only check if the viewer can be invoked
    ret = subprocess.check_call(["proton-viewer", "-h"])
    assert ret == 0
--------------------------------------------------------------------------------
/unittest/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestTritonAnalysis
  SRCS UtilityTest.cpp
  LIBS
  TritonAnalysis
  TritonIR
  TritonGPUIR
)
--------------------------------------------------------------------------------
/unittest/Analysis/UtilityTest.cpp:
--------------------------------------------------------------------------------
//===- UtilityTest.cpp - Tests for Utility --------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "triton/Dialect/Triton/IR/Utility.h"
#include <gtest/gtest.h>

namespace mlir {

TEST(Analysis, reorder) {
  SmallVector<int64_t> shape({10, 20, 30});
  {
    SmallVector<int32_t> order({2, 1, 0});
    auto reordered = triton::applyPermutation(shape, order);
    EXPECT_EQ(reordered[0], 30);
    EXPECT_EQ(reordered[1], 20);
    EXPECT_EQ(reordered[2], 10);
  }
  {
    SmallVector<int32_t> order({1, 0, 2});
    auto reordered = triton::applyPermutation(shape, order);
    EXPECT_EQ(reordered[0], 20);
    EXPECT_EQ(reordered[1], 10);
    EXPECT_EQ(reordered[2], 30);
  }
}

} // namespace mlir
--------------------------------------------------------------------------------
/unittest/CMakeLists.txt:
--------------------------------------------------------------------------------
include (${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)

include(GoogleTest)
enable_testing()

get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)

function(add_triton_ut)
  set(options)
  set(oneValueArgs NAME)
  set(multiValueArgs SRCS LIBS DEFS)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  add_test(NAME ${__NAME}
           COMMAND ${__NAME})
  add_executable(
    ${__NAME}
    ${__SRCS})
  target_link_libraries(
    ${__NAME}
    PRIVATE
    GTest::gtest_main
    ${triton_libs}
    ${dialect_libs}
    ${conversion_libs}
    gmock
    ${__LIBS})

  target_compile_options(${__NAME} PRIVATE -fno-rtti)

  target_compile_definitions(${__NAME} PRIVATE ${__DEFS})

  # Without the TEST_DISCOVERY_TIMEOUT, the tests randomly time out on my mac
  # laptop. I think the issue may be that the very first time you run a program
  # it's a bit slow.
  gtest_discover_tests(${__NAME} PROPERTIES TEST_DISCOVERY_TIMEOUT 60)
endfunction()

add_subdirectory(Analysis)
add_subdirectory(Conversion)
add_subdirectory(Dialect)
--------------------------------------------------------------------------------
/unittest/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(TritonGPUToLLVM)
--------------------------------------------------------------------------------
/unittest/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestPtxAsmFormat
  SRCS PTXAsmFormatTest.cpp
  LIBS TritonGPUToLLVM TritonNVIDIAGPUToLLVM
)

add_triton_ut(
  NAME TestEmitIndicesNvidia
  SRCS EmitIndicesTest.cpp DumpLayout.cpp
  LIBS TritonGPUIR TritonNvidiaGPUIR TritonNVIDIAGPUToLLVM
  DEFS NVIDIA_TARGET=1
)

add_triton_ut(
  NAME TestEmitIndicesAMD
  SRCS EmitIndicesTest.cpp DumpLayout.cpp
  LIBS TritonGPUIR TritonAMDGPUToLLVM
  DEFS AMD_TARGET=1
)
--------------------------------------------------------------------------------
/unittest/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(TritonGPU)
--------------------------------------------------------------------------------
/unittest/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestSwizzling
  SRCS SwizzleTest.cpp
  LIBS TritonGPUIR TritonNvidiaGPUIR
)
add_triton_ut(
  NAME Dialect
  SRCS DialectTest.cpp
  LIBS TritonGPUIR
)
--------------------------------------------------------------------------------
/unittest/googletest.cmake:
--------------------------------------------------------------------------------
include(FetchContent)

set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")

if(GOOGLETEST_DIR)
  set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()

FetchContent_Declare(
  googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG release-1.12.1
)

FetchContent_GetProperties(googletest)

if(NOT googletest_POPULATED)
  FetchContent_Populate(googletest)
  if (MSVC)
    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
  endif()
  add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
--------------------------------------------------------------------------------
/utils/nightly.pypirc:
--------------------------------------------------------------------------------
[distutils]
Index-servers =
    Triton-Nightly

[Triton-Nightly]
Repository = https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/upload/
--------------------------------------------------------------------------------