├── .clang-format
├── .editorconfig
├── .flake8
├── .git-blame-ignore-revs
├── .github
├── CODEOWNERS
└── workflows
│ ├── documentation.yml
│ ├── integration-tests.yml
│ ├── llvm-build.yml
│ ├── llvm-build
│ └── Dockerfile
│ ├── test-backends.yml
│ ├── torch-inductor-tests.yml
│ ├── torch-inductor
│ └── scripts
│ │ ├── check_acc.py
│ │ ├── check_perf.py
│ │ ├── common.sh
│ │ ├── install_torchinductor.sh
│ │ ├── install_triton.sh
│ │ ├── run_torchinductor_acc.sh
│ │ └── run_torchinductor_perf.sh
│ └── wheels.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bin
├── CMakeLists.txt
├── RegisterTritonDialects.h
├── triton-llvm-opt.cpp
├── triton-lsp.cpp
├── triton-opt.cpp
└── triton-reduce.cpp
├── cmake
├── FindLLVM.cmake
├── llvm-hash.txt
├── nvidia-toolchain-version.txt
└── pybind11-version.txt
├── docs
├── Makefile
├── _templates
│ └── versions.html
├── backend
│ ├── ldmatrixOperand0.svg
│ └── ldmatrixOperand1.svg
├── conf.py
├── getting-started
│ ├── installation.rst
│ └── tutorials
│ │ ├── grouped_vs_row_major_ordering.png
│ │ ├── parallel_reduction.png
│ │ └── random_bits.png
├── index.rst
├── meetups
│ ├── 01-24-2024
│ │ └── notes.md
│ ├── 02-20-2024
│ │ ├── Proton.pdf
│ │ └── notes.md
│ ├── 04-02-2024
│ │ └── notes.md
│ ├── 07-18-2023
│ │ └── notes.md
│ ├── 08-22-2023
│ │ ├── amd-update.pdf
│ │ ├── intel-xpu-update.pptx
│ │ └── notes.md
│ ├── 10-25-2023
│ │ ├── intel-xpu-update.pdf
│ │ ├── notes.md
│ │ └── triton-shared.pptx
│ ├── 12-13-2023
│ │ └── notes.md
│ └── dev-meetup-2023.md
├── programming-guide
│ ├── chapter-1
│ │ ├── cuda-parallel-matmul.png
│ │ ├── introduction.rst
│ │ └── triton-parallel-matmul.png
│ └── chapter-2
│ │ ├── halide-iteration.png
│ │ ├── polyhedral-iteration.png
│ │ └── related-work.rst
└── python-api
│ ├── triton.language.rst
│ ├── triton.rst
│ └── triton.testing.rst
├── include
├── CMakeLists.txt
└── triton
│ ├── Analysis
│ ├── Alias.h
│ ├── Allocation.h
│ ├── AxisInfo.h
│ ├── Membar.h
│ └── Utility.h
│ ├── CMakeLists.txt
│ ├── Conversion
│ ├── CMakeLists.txt
│ ├── MLIRTypes.h
│ ├── TritonCPUToLLVM
│ │ ├── CMakeLists.txt
│ │ ├── Passes.h
│ │ ├── Passes.td
│ │ ├── PatternTritonCPUOpToLLVM.h
│ │ ├── TypeConverter.h
│ │ └── Utility.h
│ ├── TritonGPUToLLVM
│ │ ├── AsmFormat.h
│ │ ├── CMakeLists.txt
│ │ ├── ElementwiseOpToLLVMBase.h
│ │ ├── Passes.h
│ │ ├── Passes.td
│ │ ├── PatternTritonGPUOpToLLVM.h
│ │ ├── Patterns.h
│ │ ├── TargetInfoBase.h
│ │ ├── TypeConverter.h
│ │ └── Utility.h
│ ├── TritonToTritonCPU
│ │ ├── CMakeLists.txt
│ │ ├── Passes.h
│ │ ├── Passes.td
│ │ └── TritonToTritonCPUPass.h
│ └── TritonToTritonGPU
│ │ ├── CMakeLists.txt
│ │ ├── Passes.h
│ │ ├── Passes.td
│ │ └── TritonToTritonGPUPass.h
│ ├── Dialect
│ ├── CMakeLists.txt
│ ├── NVGPU
│ │ ├── CMakeLists.txt
│ │ └── IR
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.h
│ │ │ ├── NVGPUAttrDefs.td
│ │ │ ├── NVGPUDialect.td
│ │ │ └── NVGPUOps.td
│ ├── Triton
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.h
│ │ │ ├── Interfaces.h
│ │ │ ├── Traits.h
│ │ │ ├── TritonAttrDefs.td
│ │ │ ├── TritonDialect.td
│ │ │ ├── TritonInterfaces.td
│ │ │ ├── TritonOps.td
│ │ │ ├── TritonTypeInterfaces.td
│ │ │ ├── TritonTypes.td
│ │ │ ├── Types.h
│ │ │ └── Utility.h
│ │ └── Transforms
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Passes.h
│ │ │ └── Passes.td
│ ├── TritonCPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── Attributes.h
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.h
│ │ │ ├── TritonCPUAttrDefs.td
│ │ │ ├── TritonCPUDialect.td
│ │ │ ├── TritonCPUInterfaces.h
│ │ │ ├── TritonCPUOps.td
│ │ │ ├── TritonCPUTypes.td
│ │ │ └── Types.h
│ │ └── Transforms
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Passes.h
│ │ │ ├── Passes.td
│ │ │ └── TritonCPUConversion.h
│ ├── TritonGPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── Attributes.h
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.h
│ │ │ ├── TritonGPUAttrDefs.td
│ │ │ ├── TritonGPUDialect.td
│ │ │ ├── TritonGPUInterfaces.h
│ │ │ ├── TritonGPUOps.td
│ │ │ ├── TritonGPUTypes.td
│ │ │ └── Types.h
│ │ └── Transforms
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Passes.h
│ │ │ ├── Passes.td
│ │ │ ├── TritonGPUConversion.h
│ │ │ └── Utility.h
│ └── TritonNvidiaGPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ ├── CMakeLists.txt
│ │ ├── Dialect.h
│ │ ├── TritonNvidiaGPUAttrDefs.td
│ │ ├── TritonNvidiaGPUDialect.td
│ │ ├── TritonNvidiaGPUOps.td
│ │ ├── TritonNvidiaGPUTypes.td
│ │ └── Types.h
│ │ └── Transforms
│ │ ├── CMakeLists.txt
│ │ ├── Passes.h
│ │ └── Passes.td
│ ├── Target
│ ├── CMakeLists.txt
│ └── LLVMIR
│ │ ├── CMakeLists.txt
│ │ ├── Passes.h
│ │ └── Passes.td
│ └── Tools
│ └── Sys
│ ├── GetEnv.hpp
│ └── GetPlatform.hpp
├── lib
├── Analysis
│ ├── Alias.cpp
│ ├── Allocation.cpp
│ ├── AxisInfo.cpp
│ ├── CMakeLists.txt
│ ├── Membar.cpp
│ └── Utility.cpp
├── CMakeLists.txt
├── Conversion
│ ├── CMakeLists.txt
│ ├── TritonCPUToLLVM
│ │ ├── CMakeLists.txt
│ │ ├── ControlFlowOpToLLVM.cpp
│ │ ├── FuncOpToLLVM.cpp
│ │ ├── TritonCPUToLLVM.cpp
│ │ └── TypeConverter.cpp
│ ├── TritonGPUToLLVM
│ │ ├── AllocateSharedMemory.cpp
│ │ ├── AssertOpToLLVM.cpp
│ │ ├── CMakeLists.txt
│ │ ├── ControlFlowOpToLLVM.cpp
│ │ ├── ConvertLayoutOpToLLVM.cpp
│ │ ├── ConvertLayoutOpToLLVM
│ │ │ └── SharedToDotOperandFMA.cpp
│ │ ├── DecomposeUnsupportedConversions.cpp
│ │ ├── DotOpToLLVM
│ │ │ └── FMA.cpp
│ │ ├── ElementwiseOpToLLVM.cpp
│ │ ├── FuncOpToLLVM.cpp
│ │ ├── HistogramOpToLLVM.cpp
│ │ ├── MakeRangeOpToLLVM.cpp
│ │ ├── MemoryOpToLLVM.cpp
│ │ ├── PrintOpToLLVM.cpp
│ │ ├── ReduceOpToLLVM.cpp
│ │ ├── ReduceScanCommon.h
│ │ ├── SPMDOpToLLVM.cpp
│ │ ├── ScanOpToLLVM.cpp
│ │ ├── TypeConverter.cpp
│ │ ├── Utility.cpp
│ │ └── ViewOpToLLVM.cpp
│ ├── TritonToTritonCPU
│ │ ├── CMakeLists.txt
│ │ ├── TritonCPUConversion.cpp
│ │ └── TritonToTritonCPUPass.cpp
│ └── TritonToTritonGPU
│ │ ├── CMakeLists.txt
│ │ ├── TritonGPUConversion.cpp
│ │ └── TritonToTritonGPUPass.cpp
├── Dialect
│ ├── CMakeLists.txt
│ ├── NVGPU
│ │ ├── CMakeLists.txt
│ │ └── IR
│ │ │ ├── CMakeLists.txt
│ │ │ └── Dialect.cpp
│ ├── Triton
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.cpp
│ │ │ ├── Ops.cpp
│ │ │ ├── Traits.cpp
│ │ │ └── Types.cpp
│ │ └── Transforms
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Combine.cpp
│ │ │ ├── Combine.td
│ │ │ ├── ReorderBroadcast.cpp
│ │ │ └── RewriteTensorPointer.cpp
│ ├── TritonCPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.cpp
│ │ │ └── Types.cpp
│ │ └── Transforms
│ │ │ └── CMakeLists.txt
│ ├── TritonGPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Dialect.cpp
│ │ │ └── Types.cpp
│ │ └── Transforms
│ │ │ ├── AccelerateMatmul.cpp
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Coalesce.cpp
│ │ │ ├── F32DotTC.cpp
│ │ │ ├── OptimizeDotOperands.cpp
│ │ │ ├── OptimizeThreadLocality.cpp
│ │ │ ├── Pipeliner
│ │ │ ├── MatmulLoopPipeline.cpp
│ │ │ ├── OuterLoopPipeline.cpp
│ │ │ ├── PipelineExpander.cpp
│ │ │ ├── PipelineExpander.h
│ │ │ ├── PipeliningUtility.cpp
│ │ │ ├── PipeliningUtility.h
│ │ │ ├── Schedule.h
│ │ │ └── SoftwarePipeliner.cpp
│ │ │ ├── Prefetch.cpp
│ │ │ ├── ReduceDataDuplication.cpp
│ │ │ ├── RemoveLayoutConversions.cpp
│ │ │ ├── ReorderInstructions.cpp
│ │ │ └── Utility.cpp
│ └── TritonNvidiaGPU
│ │ ├── CMakeLists.txt
│ │ ├── IR
│ │ ├── CMakeLists.txt
│ │ ├── Dialect.cpp
│ │ ├── Ops.cpp
│ │ └── Types.cpp
│ │ └── Transforms
│ │ ├── CMakeLists.txt
│ │ ├── FenceInsertion.cpp
│ │ └── PlanCTA.cpp
└── Target
│ ├── CMakeLists.txt
│ └── LLVMIR
│ ├── CMakeLists.txt
│ ├── LLVMDIScope.cpp
│ ├── LLVMIRBreakPhiStruct.cpp
│ └── LLVMPasses.h
├── pyproject.toml
├── python
├── MANIFEST.in
├── examples
│ ├── copy_strided.py
│ └── empty.py
├── pyproject.toml
├── setup.py
├── src
│ ├── interpreter.cc
│ ├── ir.cc
│ ├── llvm.cc
│ ├── main.cc
│ ├── passes.cc
│ └── passes.h
├── test
│ ├── backend
│ │ ├── extension_backend.c
│ │ ├── test_device_backend.py
│ │ └── third_party_backends
│ │ │ ├── conftest.py
│ │ │ └── test_xpu_backend.py
│ ├── kernel_comparison
│ │ └── kernels.yml
│ ├── regression
│ │ ├── test_cast_matmul.py
│ │ ├── test_functional_regressions.py
│ │ └── test_performance.py
│ └── unit
│ │ ├── conftest.py
│ │ ├── hopper
│ │ ├── __init__.py
│ │ ├── test_flashattention.py
│ │ ├── test_gemm.py
│ │ ├── test_gemm_fusion.py
│ │ ├── test_mixed_io.py
│ │ ├── test_persistent_warp_specialized_fused-attention.py
│ │ ├── test_persistent_warp_specialized_gemm.py
│ │ └── test_tma_store_gemm.py
│ │ ├── language
│ │ ├── assert_helper.py
│ │ ├── conftest.py
│ │ ├── print_helper.py
│ │ ├── test_annotations.py
│ │ ├── test_block_pointer.py
│ │ ├── test_compile_errors.py
│ │ ├── test_conversions.py
│ │ ├── test_core.py
│ │ ├── test_decorator.py
│ │ ├── test_line_info.py
│ │ ├── test_random.py
│ │ ├── test_reproducer.py
│ │ ├── test_standard.py
│ │ └── test_subprocess.py
│ │ ├── operators
│ │ ├── conftest.py
│ │ ├── test_blocksparse.py
│ │ ├── test_cross_entropy.py
│ │ ├── test_flash_attention.py
│ │ ├── test_inductor.py
│ │ └── test_matmul.py
│ │ ├── runtime
│ │ ├── test_autotuner.py
│ │ ├── test_bindings.py
│ │ ├── test_cache.py
│ │ ├── test_driver.py
│ │ ├── test_jit.py
│ │ ├── test_launch.py
│ │ └── test_subproc.py
│ │ └── tools
│ │ └── test_aot.py
├── triton
│ ├── _C
│ │ └── include
│ ├── __init__.py
│ ├── backends
│ │ ├── __init__.py
│ │ ├── compiler.py
│ │ └── driver.py
│ ├── compiler
│ │ ├── __init__.py
│ │ ├── code_generator.py
│ │ ├── compiler.py
│ │ ├── errors.py
│ │ └── make_launcher.py
│ ├── errors.py
│ ├── language
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── extra
│ │ │ ├── __init__.py
│ │ │ └── cuda
│ │ │ │ ├── __init__.py
│ │ │ │ ├── libdevice.py
│ │ │ │ └── utils.py
│ │ ├── math.py
│ │ ├── random.py
│ │ ├── semantic.py
│ │ └── standard.py
│ ├── ops
│ │ ├── __init__.py
│ │ ├── blocksparse
│ │ │ ├── __init__.py
│ │ │ ├── matmul.py
│ │ │ └── softmax.py
│ │ ├── cross_entropy.py
│ │ ├── flash_attention.py
│ │ ├── matmul.py
│ │ └── matmul_perf_model.py
│ ├── runtime
│ │ ├── __init__.py
│ │ ├── autotuner.py
│ │ ├── build.py
│ │ ├── cache.py
│ │ ├── driver.py
│ │ ├── errors.py
│ │ ├── interpreter.py
│ │ └── jit.py
│ ├── testing.py
│ └── tools
│ │ ├── __init__.py
│ │ ├── build_extern.py
│ │ ├── compile.c
│ │ ├── compile.h
│ │ ├── compile.py
│ │ ├── disasm.py
│ │ └── link.py
└── tutorials
│ ├── 01-vector-add.py
│ ├── 02-fused-softmax.py
│ ├── 03-matrix-multiplication.py
│ ├── 04-low-memory-dropout.py
│ ├── 05-layer-norm.py
│ ├── 06-fused-attention.py
│ ├── 07-extern-functions.py
│ ├── 08-grouped-gemm.py
│ └── README.rst
├── test
├── Analysis
│ ├── test-alias.mlir
│ ├── test-alignment.mlir
│ ├── test-allocation.mlir
│ └── test-membar.mlir
├── CMakeLists.txt
├── Conversion
│ ├── amd
│ │ ├── decompose-unsupported-conversions.mlir
│ │ ├── fp_to_fp.mlir
│ │ ├── load_store.mlir
│ │ └── tritongpu_wmma_dot_to_llvm.mlir
│ ├── dedup-by-constancy.mlir
│ ├── divide-by-0.mlir
│ ├── triton_to_tritongpu.mlir
│ ├── tritongpu_to_llvm.mlir
│ ├── tritongpu_to_llvm_hopper.mlir
│ └── tritongpu_to_llvm_volta.mlir
├── LLVMIR
│ └── break-phi-struct.ll
├── NVGPU
│ ├── test_cga.mlir
│ └── test_wgmma.mlir
├── Triton
│ ├── canonicalize.mlir
│ ├── combine.mlir
│ ├── invalid.mlir
│ ├── ops.mlir
│ ├── reorder-broadcast.mlir
│ ├── reproducer.mlir
│ ├── rewrite-tensor-pointer.mlir
│ ├── vecadd.mlir
│ └── verify-make-range.mlir
├── TritonGPU
│ ├── accelerate-matmul.mlir
│ ├── amd
│ │ ├── accelerate-amd-matmul-wmma.mlir
│ │ └── amd-reorder-instructions.mlir
│ ├── atomic-cas.mlir
│ ├── canonicalize.mlir
│ ├── coalesce.mlir
│ ├── combine.mlir
│ ├── dot-operands.mlir
│ ├── fence-inserstion.mlir
│ ├── invalid.mlir
│ ├── loop-pipeline-hopper.mlir
│ ├── loop-pipeline.mlir
│ ├── matmul.mlir
│ ├── ops.mlir
│ ├── optimize-locality.mlir
│ ├── optimize_epilogue.mlir
│ ├── pipeline-hopper-remove-wait.mlir
│ ├── prefetch.mlir
│ ├── reduce-data-duplication.mlir
│ ├── reorder-instructions.mlir
│ ├── tritongpu_ops.mlir
│ └── verify-blocked-layout.mlir
├── lib
│ ├── Analysis
│ │ ├── CMakeLists.txt
│ │ ├── TestAlias.cpp
│ │ ├── TestAllocation.cpp
│ │ ├── TestAxisInfo.cpp
│ │ └── TestMembar.cpp
│ └── CMakeLists.txt
├── lit.cfg.py
└── lit.site.cfg.py.in
├── third_party
├── amd
│ ├── CMakeLists.txt
│ ├── backend
│ │ ├── compiler.py
│ │ ├── driver.c
│ │ ├── driver.py
│ │ ├── include
│ │ │ └── hip
│ │ │ │ ├── amd_detail
│ │ │ │ ├── amd_channel_descriptor.h
│ │ │ │ ├── amd_device_functions.h
│ │ │ │ ├── amd_hip_atomic.h
│ │ │ │ ├── amd_hip_bf16.h
│ │ │ │ ├── amd_hip_bfloat16.h
│ │ │ │ ├── amd_hip_common.h
│ │ │ │ ├── amd_hip_complex.h
│ │ │ │ ├── amd_hip_cooperative_groups.h
│ │ │ │ ├── amd_hip_fp16.h
│ │ │ │ ├── amd_hip_gl_interop.h
│ │ │ │ ├── amd_hip_math_constants.h
│ │ │ │ ├── amd_hip_runtime.h
│ │ │ │ ├── amd_hip_runtime_pt_api.h
│ │ │ │ ├── amd_hip_unsafe_atomics.h
│ │ │ │ ├── amd_hip_vector_types.h
│ │ │ │ ├── amd_math_functions.h
│ │ │ │ ├── amd_surface_functions.h
│ │ │ │ ├── amd_warp_functions.h
│ │ │ │ ├── concepts.hpp
│ │ │ │ ├── device_library_decls.h
│ │ │ │ ├── functional_grid_launch.hpp
│ │ │ │ ├── grid_launch.h
│ │ │ │ ├── grid_launch.hpp
│ │ │ │ ├── grid_launch_GGL.hpp
│ │ │ │ ├── helpers.hpp
│ │ │ │ ├── hip_cooperative_groups_helper.h
│ │ │ │ ├── hip_fp16_gcc.h
│ │ │ │ ├── hip_fp16_math_fwd.h
│ │ │ │ ├── hip_ldg.h
│ │ │ │ ├── hip_prof_str.h
│ │ │ │ ├── hip_runtime_prof.h
│ │ │ │ ├── host_defines.h
│ │ │ │ ├── hsa_helpers.hpp
│ │ │ │ ├── macro_based_grid_launch.hpp
│ │ │ │ ├── math_fwd.h
│ │ │ │ ├── ockl_image.h
│ │ │ │ ├── program_state.hpp
│ │ │ │ ├── texture_fetch_functions.h
│ │ │ │ └── texture_indirect_functions.h
│ │ │ │ ├── channel_descriptor.h
│ │ │ │ ├── device_functions.h
│ │ │ │ ├── driver_types.h
│ │ │ │ ├── hip_bf16.h
│ │ │ │ ├── hip_bfloat16.h
│ │ │ │ ├── hip_common.h
│ │ │ │ ├── hip_complex.h
│ │ │ │ ├── hip_cooperative_groups.h
│ │ │ │ ├── hip_deprecated.h
│ │ │ │ ├── hip_ext.h
│ │ │ │ ├── hip_fp16.h
│ │ │ │ ├── hip_gl_interop.h
│ │ │ │ ├── hip_hcc.h
│ │ │ │ ├── hip_math_constants.h
│ │ │ │ ├── hip_profile.h
│ │ │ │ ├── hip_runtime.h
│ │ │ │ ├── hip_runtime_api.h
│ │ │ │ ├── hip_texture_types.h
│ │ │ │ ├── hip_vector_types.h
│ │ │ │ ├── hip_version.h
│ │ │ │ ├── hiprtc.h
│ │ │ │ ├── library_types.h
│ │ │ │ ├── math_functions.h
│ │ │ │ ├── surface_types.h
│ │ │ │ └── texture_types.h
│ │ └── lib
│ │ │ ├── cuda2gcn.bc
│ │ │ ├── ockl.bc
│ │ │ ├── ocml.bc
│ │ │ └── opencl.bc
│ ├── include
│ │ ├── CMakeLists.txt
│ │ ├── TritonAMDGPUToLLVM
│ │ │ ├── CMakeLists.txt
│ │ │ ├── GCNAsmFormat.h
│ │ │ ├── Passes.h
│ │ │ └── Passes.td
│ │ └── TritonAMDGPUTransforms
│ │ │ ├── CMakeLists.txt
│ │ │ ├── MfmaGroup.h
│ │ │ ├── Passes.h
│ │ │ ├── Passes.td
│ │ │ └── TritonGPUConversion.h
│ ├── lib
│ │ ├── CMakeLists.txt
│ │ ├── TritonAMDGPUToLLVM
│ │ │ ├── CMakeLists.txt
│ │ │ ├── ConvertLayoutOpToLLVM.cpp
│ │ │ ├── ConvertLayoutOpToLLVM
│ │ │ │ ├── SharedToDotOperandHelper.cpp
│ │ │ │ ├── SharedToDotOperandHelper.h
│ │ │ │ ├── SharedToDotOperandMFMA.cpp
│ │ │ │ └── SharedToDotOperandWMMA.cpp
│ │ │ ├── DecomposeUnsupportedConversions.cpp
│ │ │ ├── DotOpToLLVM.cpp
│ │ │ ├── DotOpToLLVM
│ │ │ │ ├── MFMA.cpp
│ │ │ │ └── WMMA.cpp
│ │ │ ├── ElementwiseOpToLLVM.cpp
│ │ │ ├── GCNAsmFormat.cpp
│ │ │ ├── LoadStoreOpToLLVM.cpp
│ │ │ ├── PatternTritonGPUOpToLLVM.h
│ │ │ ├── SPMDOpToLLVM.cpp
│ │ │ ├── TargetInfo.cpp
│ │ │ ├── TargetInfo.h
│ │ │ ├── TritonGPUToLLVM.cpp
│ │ │ ├── Utility.cpp
│ │ │ └── Utility.h
│ │ └── TritonAMDGPUTransforms
│ │ │ ├── AccelerateAMDMatmul.cpp
│ │ │ ├── CMakeLists.txt
│ │ │ ├── MfmaGroup.cpp
│ │ │ ├── OptimizeEpilogue.cpp
│ │ │ ├── RemoveLayoutConversions.cpp
│ │ │ ├── ReorderInstructions.cpp
│ │ │ └── StreamPipeline.cpp
│ └── python
│ │ └── triton_amd.cc
├── cpu
│ ├── CMakeLists.txt
│ ├── backend
│ │ ├── compiler.py
│ │ └── driver.py
│ └── triton_cpu.cc
├── nvidia
│ ├── CMakeLists.txt
│ ├── backend
│ │ ├── __init__.py
│ │ ├── compiler.py
│ │ ├── driver.c
│ │ ├── driver.py
│ │ ├── include
│ │ │ └── cuda.h
│ │ └── lib
│ │ │ └── libdevice.10.bc
│ ├── include
│ │ ├── CMakeLists.txt
│ │ ├── NVGPUToLLVM
│ │ │ ├── CMakeLists.txt
│ │ │ ├── NVGPUToLLVMPass.h
│ │ │ ├── Passes.h
│ │ │ └── Passes.td
│ │ └── TritonNVIDIAGPUToLLVM
│ │ │ ├── CMakeLists.txt
│ │ │ ├── PTXAsmFormat.h
│ │ │ ├── Passes.h
│ │ │ └── Passes.td
│ ├── lib
│ │ ├── CMakeLists.txt
│ │ ├── NVGPUToLLVM
│ │ │ ├── CMakeLists.txt
│ │ │ └── NVGPUToLLVMPass.cpp
│ │ └── TritonNVIDIAGPUToLLVM
│ │ │ ├── BarrierOpToLLVM.cpp
│ │ │ ├── CMakeLists.txt
│ │ │ ├── ClusterOpsToLLVM.cpp
│ │ │ ├── ConvertLayoutOpToLLVM.cpp
│ │ │ ├── ConvertLayoutOpToLLVM
│ │ │ ├── SharedToDotOperandMMAv1.cpp
│ │ │ └── SharedToDotOperandMMAv2.cpp
│ │ │ ├── DecomposeUnsupportedConversions.cpp
│ │ │ ├── DotOpToLLVM.cpp
│ │ │ ├── DotOpToLLVM
│ │ │ ├── MMAv1.cpp
│ │ │ ├── MMAv2.cpp
│ │ │ └── WGMMA.cpp
│ │ │ ├── ElementwiseOpToLLVM.cpp
│ │ │ ├── LoadStoreOpToLLVM.cpp
│ │ │ ├── PTXAsmFormat.cpp
│ │ │ ├── PatternTritonGPUOpToLLVM.h
│ │ │ ├── SPMDOpToLLVM.cpp
│ │ │ ├── TargetInfo.cpp
│ │ │ ├── TargetInfo.h
│ │ │ ├── TensorPtrOpsToLLVM.cpp
│ │ │ ├── TritonGPUToLLVM.cpp
│ │ │ ├── Utility.cpp
│ │ │ └── Utility.h
│ └── triton_nvidia.cc
└── proton
│ ├── .gitignore
│ ├── CMakeLists.txt
│ ├── README.md
│ ├── csrc
│ ├── Proton.cpp
│ ├── include
│ │ ├── Context
│ │ │ ├── Context.h
│ │ │ ├── Python.h
│ │ │ └── Shadow.h
│ │ ├── Data
│ │ │ ├── Data.h
│ │ │ ├── Metric.h
│ │ │ ├── TraceData.h
│ │ │ └── TreeData.h
│ │ ├── Driver
│ │ │ ├── Dispatch.h
│ │ │ └── GPU
│ │ │ │ ├── Cuda.h
│ │ │ │ └── Cupti.h
│ │ ├── Profiler
│ │ │ ├── CuptiProfiler.h
│ │ │ └── Profiler.h
│ │ ├── Proton.h
│ │ ├── Session
│ │ │ └── Session.h
│ │ └── Utility
│ │ │ ├── Errors.h
│ │ │ ├── Singleton.h
│ │ │ ├── String.h
│ │ │ └── Traits.h
│ └── lib
│ │ ├── Context
│ │ ├── Context.cpp
│ │ ├── Python.cpp
│ │ └── Shadow.cpp
│ │ ├── Data
│ │ ├── Data.cpp
│ │ ├── TraceData.cpp
│ │ └── TreeData.cpp
│ │ ├── Driver
│ │ └── GPU
│ │ │ ├── Cuda.cpp
│ │ │ └── Cupti.cpp
│ │ ├── Profiler
│ │ └── CuptiProfiler.cpp
│ │ └── Session
│ │ └── Session.cpp
│ ├── proton
│ ├── _C
│ │ └── include
│ ├── __init__.py
│ ├── flags.py
│ ├── hook.py
│ ├── profile.py
│ ├── scope.py
│ └── viewer.py
│ ├── test
│ ├── test_api.py
│ ├── test_lib.py
│ ├── test_profile.py
│ └── test_viewer.py
│ └── tutorials
│ ├── dynamic_net.py
│ └── matmul.py
├── unittest
├── Analysis
│ ├── CMakeLists.txt
│ └── UtilityTest.cpp
├── CMakeLists.txt
├── Conversion
│ ├── CMakeLists.txt
│ └── TritonGPUToLLVM
│ │ ├── CMakeLists.txt
│ │ ├── DumpLayout.cpp
│ │ ├── DumpLayout.h
│ │ ├── EmitIndicesTest.cpp
│ │ └── PTXAsmFormatTest.cpp
├── Dialect
│ ├── CMakeLists.txt
│ └── TritonGPU
│ │ ├── CMakeLists.txt
│ │ ├── DialectTest.cpp
│ │ └── SwizzleTest.cpp
└── googletest.cmake
└── utils
└── nightly.pypirc
/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: LLVM
2 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # https://editorconfig.org/
2 |
3 | root = true
4 |
5 | [*]
6 | charset = utf-8
7 | end_of_line = lf
8 | indent_style = space
9 | indent_size = 4
10 | trim_trailing_whitespace = true
11 | insert_final_newline = true
12 |
13 | [*.py]
14 | indent_size = 4
15 | src_paths=python
16 |
17 | [*.{yaml,yml}]
18 | indent_size = 2
19 |
20 | [*.md]
21 | indent_size = 2
22 | x-soft-wrap-text = true
23 |
24 | [*.rst]
25 | indent_size = 4
26 | x-soft-wrap-text = true
27 |
28 | [{CMakeLists.txt,*.cmake}]
29 | indent_size = 2
30 |
31 | [Makefile]
32 | indent_style = tab
33 |
34 | [*.{c,cc,cpp,h,hpp,cu,cuh}]
35 | indent_size = 2
36 |
37 | [*.mlir]
38 | indent_size = 2
39 |
40 | [*.td]
41 | indent_size = 4
42 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # W503 (linebreak occurred before binary operator) seems to be enabled by
3 | # default, even though it goes against pep8 and is incompatible with W504
4 | # (linebreak occurred *after* binary operator). Disable it.
5 | ignore = E501,E701,E731,W503
6 |
--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # Commits listed here are ignored by `git blame`. Add "big and uninteresting
2 | # changes" here. Don't forget that it has to be a separate commit (and, because
3 | # our automation squashes PRs, a separate PR)!
4 | #
5 | # Run the following command to teach your `git blame` to pick up this file.
6 | #
7 | #    $ git config blame.ignoreRevsFile .git-blame-ignore-revs
8 |
9 | 841a77d1b5961b43e1b64e5265bdfe52c133574d
10 | cb68a0d9d501657258ed9f7ad7610d0784c9be9a
11 | 03184de8b535bb24fb1f49cc1f5e008bcbaa73ef
12 | bc4a8e66da036fafc01b87ee9e210df7ee8fb738
13 | 846d6e7e77891706d179b20f27b1278ac3b9a9ac
14 | 0327b9d32db6d1d63d207ccab722bd45e00a6678
15 | df08301e76a56d9ab3f36ff00ab7133672baa8d3
16 | f88b01f558df06f010a869e01473253a5f5cd8db
17 | 312cf97e147e962562877026fd82c928cf6eaa30
18 | 53d868113a706988394134ca1f7f85cb3016cc81
19 | 539fbe5049570f29e73dc6843f984cd4913c5505
20 | 053af4e9f8f005e1bc3f8ac9bf285eaf0ac9bf72
21 | 5b36cb48ad9ce566dd24ff7183f207a1cb9358b5
22 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # These owners will be the default owners for everything in
2 | # the repo. Unless a later match takes precedence,
3 | # @global-owner1 and @global-owner2 will be requested for
4 | # review when someone opens a pull request.
5 | * @ptillet
6 |
7 | # --------
8 | # Analyses
9 | # --------
10 | # Alias analysis
11 | include/triton/Analysis/Alias.h @Jokeren
12 | lib/Analysis/Alias.cpp @Jokeren
13 | # Allocation analysis
14 | include/triton/Analysis/Allocation.h @Jokeren
15 | lib/Analysis/Allocation.cpp @Jokeren
16 | # Membar analysis
17 | include/triton/Analysis/Membar.h @Jokeren
18 | lib/Analysis/Membar.cpp @Jokeren
19 | # AxisInfo analysis
20 | include/triton/Analysis/AxisInfo.h @ptillet
21 | lib/Analysis/AxisInfo.cpp @ptillet
22 | # Utilities
23 | include/triton/Analysis/Utility.h @Jokeren
24 | lib/Analysis/Utility.cpp @Jokeren
25 |
26 | # ----------
27 | # Dialects
28 | # ----------
29 | # Pipeline pass
30 | lib/Dialect/TritonGPU/Transforms/Pipeline.cpp @ptillet
31 | # Prefetch pass
32 | lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @ptillet
33 | # Coalesce pass
34 | lib/Dialect/TritonGPU/Transforms/Coalesce.cpp @ptillet
35 | # Layout simplification pass
36 | lib/Dialect/TritonGPU/Transforms/Combine.cpp @ptillet
37 |
38 | # -----------
39 | # Conversions
40 | # -----------
41 | # TritonToTritonGPU
42 | include/triton/Conversion/TritonToTritonGPU/ @ptillet
43 | lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp @ptillet
44 |
45 | # -----------
46 | # third_party
47 | # -----------
48 | third_party/amd/ @antiagainst @zhanglx13
49 |
--------------------------------------------------------------------------------
/.github/workflows/torch-inductor-tests.yml:
--------------------------------------------------------------------------------
1 | name: Torchinductor
2 |
3 | on:
4 | workflow_run:
5 | workflows: ["Wheels"]
6 | types: [completed]
7 |
8 | permissions: read-all
9 |
10 | jobs:
11 | Runner-Preparation:
12 | runs-on: ubuntu-latest
13 | outputs:
14 | matrix: ${{ steps.set-matrix.outputs.matrix }}
15 | steps:
16 | - name: Prepare runner matrix
17 | id: set-matrix
18 |         run: |
19 |           echo 'matrix=[["self-hosted", "A100"]]' >> "$GITHUB_OUTPUT"  # ::set-output is deprecated/removed by GitHub Actions
20 |
21 | Integration-Tests:
22 | needs: Runner-Preparation
23 | timeout-minutes: 240 # 4 hours
24 | runs-on: ${{ matrix.runner }}
25 | strategy:
26 | matrix:
27 | runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix)}}
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v4
31 | - name: Packages
32 | run: |
33 | ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench
34 | - name: Environment
35 | run: |
36 | source /opt/torchinductor_venv/bin/activate
37 | ./.github/workflows/torch-inductor/scripts/install_triton.sh
38 | - name: Performance
39 | run: |
40 | ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench
41 |       # Takes too long to run
42 | #- name: Accuracy
43 | # run: |
44 | # ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh
45 |
--------------------------------------------------------------------------------
/.github/workflows/torch-inductor/scripts/check_acc.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import sys
3 |
4 | file_path = sys.argv[1]
5 | with open(file_path) as f:
6 | reader = csv.reader(f)
7 | for i, row in enumerate(reader):
8 | if i == 0:
9 | continue
10 | if row[3] != "pass":
11 | print(f"{row[1]} failed on device {row[0]} with batch size {row[2]}")
12 |
--------------------------------------------------------------------------------
/.github/workflows/torch-inductor/scripts/common.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | TEST_REPORTS_DIR=/opt/torchinductor_reports
4 | PYTORCH_DIR=/opt/pytorch
5 | MODELS=(timm_models huggingface torchbench)
6 |
7 | echo "$TEST_REPORTS_DIR"
8 | echo "$PYTORCH_DIR"
9 | echo "${MODELS[@]}"
10 |
--------------------------------------------------------------------------------
/.github/workflows/torch-inductor/scripts/install_triton.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # remember where we started
4 | ROOT="$(pwd)"
5 |
6 | # shellcheck source=/dev/null
7 | source /opt/torchinductor_venv/bin/activate
8 | # shellcheck source=/dev/null
9 | source ./.github/workflows/torch-inductor/scripts/common.sh
10 |
11 | # build our own triton
12 | cd python || exit
13 | pip3 install --pre pytorch-triton --extra-index-url https://download.pytorch.org/whl/nightly/cu118
14 | rm -rf build
15 | pip3 install -e .
16 | pip3 uninstall pytorch-triton -y
17 |
18 | # clean up cache
19 | rm -rf /tmp/torchinductor_root/
20 | rm -rf ~/.triton/cache
21 | rm -rf "$TEST_REPORTS_DIR"
22 |
23 | # go back to where we started
24 | cd "$ROOT" || exit
25 |
--------------------------------------------------------------------------------
/.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # remember where we started
4 | ROOT="$(pwd)"
5 | INDUCTOR="$ROOT"/.github/workflows/torch-inductor
6 | MODEL_SPEC=$1
7 |
8 | # shellcheck source=/dev/null
9 | source /opt/torchinductor_venv/bin/activate
10 | # shellcheck source=/dev/null
11 | source "$INDUCTOR"/scripts/common.sh
12 |
13 | cd "$PYTORCH_DIR" || exit
14 | TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc
15 | mkdir -p "$TEST_REPORTS_DIR"
16 |
17 | for model in "${MODELS[@]}"; do
18 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
19 | continue
20 | fi
21 | echo "Running accuracy test for $model"
22 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \
23 | --output "$TEST_REPORTS_DIR"/inference_"$model".csv
24 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --training --amp --device cuda \
25 | --output "$TEST_REPORTS_DIR"/training_"$model".csv
26 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --dynamic-shapes --device cuda \
27 | --output "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv
28 | done
29 |
30 | cd "$ROOT" || exit
31 | for model in "${MODELS[@]}"; do
32 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then
33 | continue
34 | fi
35 | echo "Checking accuracy test for $model"
36 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv
37 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv
38 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv
39 | done
40 |
41 | # go back to where we started
42 | cd "$ROOT" || exit
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Triton builds
2 | build/
3 | build-*/
4 |
5 | # Triton Python module builds
6 | python/build/
7 | python/triton.egg-info/
8 | python/triton/_C/libtriton.pyd
9 | python/triton/_C/libtriton.so
10 |
11 | # Backends copied from submodules
12 | python/triton/backends/
13 | !python/triton/backends/__init__.py
14 | !python/triton/backends/compiler.py
15 | !python/triton/backends/driver.py
16 |
17 | # Proton
18 | python/triton/profiler
19 |
20 | # Python caches
21 | __pycache__/
22 | *.py[cod]
23 | .pytest_cache
24 |
25 | # Environments
26 | .venv
27 | venv/
28 | venv.bak/
29 |
30 | # VS Code project files
31 | .vscode
32 | .vs
33 |
34 | # JetBrains project files
35 | .idea
36 | cmake-build-*
37 |
38 | # Third-party binaries
39 | cuobjdump
40 | nvdisasm
41 | ptxas
42 |
43 | # Docs
44 | docs/_build/
45 | docs/python-api/generated/
46 | docs/dialects/
47 | docs/getting-started/tutorials
48 | docs/sg_execution_times.rst
49 | !python/tutorials/*.py
50 | !python/tutorials/*.rst
51 |
52 | # clangd index. (".clangd" is a config file now, thus trailing slash)
53 | .clangd/
54 | .cache
55 | /compile_commands.json
56 | .vscode
57 | .vs
58 |
59 | # Vim
60 | *.swp
61 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: check-symlinks
6 | - id: destroyed-symlinks
7 | - id: trailing-whitespace
8 | - id: end-of-file-fixer
9 | - id: check-yaml
10 | - id: check-toml
11 | - id: check-ast
12 | - id: check-added-large-files
13 | - id: check-merge-conflict
14 | - id: check-executables-have-shebangs
15 | - id: check-shebang-scripts-are-executable
16 | - id: detect-private-key
17 | - id: debug-statements
18 |
19 | - repo: https://github.com/astral-sh/ruff-pre-commit
20 | rev: v0.1.3
21 | hooks:
22 | - id: ruff
23 | files: '^python/.*'
24 | args: ["--fix", "--line-length", "120"]
25 | stages: [commit, push, manual]
26 | exclude: |
27 | (?x)(
28 | ^python/triton/runtime/.*|
29 | ^test/|
30 | ^docs/conf.py$
31 | )
32 |
33 | - repo: https://github.com/google/yapf
34 | rev: be72557
35 | hooks:
36 | - id: yapf
37 | args: ["-p", "-i"]
38 | stages: [commit, push, manual]
39 | exclude: "python/test/unit/language/test_line_info.py"
40 |
41 | - repo: https://github.com/pre-commit/mirrors-clang-format
42 | rev: v16.0.6
43 | hooks:
44 | - id: clang-format
45 | stages: [commit, push, manual]
46 |
47 | exclude: |
48 | (?x)(
49 | ^include/triton/external/|
50 | ^third_party/amd/backend/include/hip/|
51 | ^third_party/amd/backend/lib/|
52 | ^third_party/nvidia/backend/include/cuda.h
53 | )
54 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018-2020 Philippe Tillet
3 | * Copyright 2020-2022 OpenAI
4 | *
5 | * Permission is hereby granted, free of charge, to any person obtaining
6 | * a copy of this software and associated documentation files
7 | * (the "Software"), to deal in the Software without restriction,
8 | * including without limitation the rights to use, copy, modify, merge,
9 | * publish, distribute, sublicense, and/or sell copies of the Software,
10 | * and to permit persons to whom the Software is furnished to do so,
11 | * subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be
14 | * included in all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | */
24 |
--------------------------------------------------------------------------------
/bin/triton-lsp.cpp:
--------------------------------------------------------------------------------
1 | #include "./RegisterTritonDialects.h"
2 |
3 | #include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h"
4 |
5 | int main(int argc, char **argv) {
6 | mlir::DialectRegistry registry;
7 | registerTritonDialects(registry);
8 |
9 | mlir::MLIRContext context(registry);
10 | return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));
11 | }
12 |
--------------------------------------------------------------------------------
/bin/triton-opt.cpp:
--------------------------------------------------------------------------------
1 | #include "./RegisterTritonDialects.h"
2 |
3 | #include "mlir/Tools/mlir-opt/MlirOptMain.h"
4 |
5 | int main(int argc, char **argv) {
6 | mlir::DialectRegistry registry;
7 | registerTritonDialects(registry);
8 |
9 | return mlir::asMainReturnCode(mlir::MlirOptMain(
10 | argc, argv, "Triton (GPU) optimizer driver\n", registry));
11 | }
12 |
--------------------------------------------------------------------------------
/bin/triton-reduce.cpp:
--------------------------------------------------------------------------------
1 | #include "./RegisterTritonDialects.h"
2 |
3 | #include "mlir/Tools/mlir-reduce/MlirReduceMain.h"
4 |
5 | int main(int argc, char **argv) {
6 | mlir::DialectRegistry registry;
7 | registerTritonDialects(registry);
8 |
9 | mlir::MLIRContext context(registry);
10 | return mlir::failed(mlir::mlirReduceMain(argc, argv, context));
11 | }
12 |
--------------------------------------------------------------------------------
/cmake/llvm-hash.txt:
--------------------------------------------------------------------------------
1 | ed4e505c219fe6c7464ea5a056e90d8cd94c7332
2 |
--------------------------------------------------------------------------------
/cmake/nvidia-toolchain-version.txt:
--------------------------------------------------------------------------------
1 | 12.4.99
2 |
--------------------------------------------------------------------------------
/cmake/pybind11-version.txt:
--------------------------------------------------------------------------------
1 | 2.11.1
2 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Triton
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_templates/versions.html:
--------------------------------------------------------------------------------
1 | {%- if current_version %}
2 |
3 |
4 | Other Versions
5 | v: {{ current_version.name }}
6 |
7 |
8 |
9 | {%- if versions.tags %}
10 |
11 | - Tags
12 | {%- for item in versions.tags %}
13 | - {{ item.name }}
14 | {%- endfor %}
15 |
16 | {%- endif %}
17 | {%- if versions.branches %}
18 |
19 | - Branches
20 | {%- for item in versions.branches %}
21 | - {{ item.name }}
22 | {%- endfor %}
23 |
24 | {%- endif %}
25 |
26 |
27 | {%- endif %}
28 |
--------------------------------------------------------------------------------
/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png
--------------------------------------------------------------------------------
/docs/getting-started/tutorials/parallel_reduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/parallel_reduction.png
--------------------------------------------------------------------------------
/docs/getting-started/tutorials/random_bits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/random_bits.png
--------------------------------------------------------------------------------
/docs/meetups/01-24-2024/notes.md:
--------------------------------------------------------------------------------
1 | #### Agenda:
2 |
3 | ##### Items:
4 | 1. 3rd party refactoring backend update.
5 | 2. AMD update about experience with refactored backend and new process.
6 | 3. Plan to restore the Intel XPU backend as third-party module.
7 | 4. Open discussion.
8 |
9 | ##### Minutes:
10 | Recording link [here](https://youtu.be/uRlqolhNbRk)
11 |
12 | 1. 3rd party refactoring backend update.
13 | - Backends are passes and IRs are shared by the backends to avoid divergence and duplications so that developers do not have to change the Triton source code
14 | - To discover backend forks in directories, put environment vars in setup.py.
15 | - Backends can link whatever library they want, they don’t need to copy paste Nvidia code.
16 | - Nvidia uses the same API as other backends, (refactoring of the C++ code is still remaining). No special casing for Nvidia code.
17 | - If Triton dependency is on top of the main branch then it will work for forks/branches.
18 | - Still remaining: LLVM IR conversion – reusable pattern rewriters update; Reduce complexity in statefulness in Triton GPU - inherit from base pattern
19 | 2. AMD update about experience with refactored backend and new process.
20 | - Skipped due to lack of time. Will be covered in February meetup
21 | 3. Plan to restore the Intel XPU backend as third-party module.
22 | - Prereqs to upstream – Will take into account the system HW and SW, with perf to be ~80% of Nvidia, to allow upstreaming.
23 | - Consider how useful it is for AI research to allow upstreaming – as it impacts maintenance cost of the backends.
24 | - Don’t have plans to upstream mobile backends
25 | - Intel will hold offline discussion with OpenAI for being in-tree.
26 |
--------------------------------------------------------------------------------
/docs/meetups/02-20-2024/Proton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/02-20-2024/Proton.pdf
--------------------------------------------------------------------------------
/docs/meetups/02-20-2024/notes.md:
--------------------------------------------------------------------------------
1 | #### Agenda:
2 |
3 | ##### Items:
4 | 1. Intel update
5 | 2. AMD update
6 | 3. Profiler update
7 | 4. We are in the process of transitioning to a pro slack plan, so everybody will be able to see history. Expect this to take a few more weeks.
8 | 5. We are still working on finalizing a document about our technical governance structure. Expect this to take a few more weeks too. 6. Open discussion.
9 |
10 | ##### Minutes:
11 | Recording link [here](https://youtu.be/JDQCdj18Snc)
12 |
13 | 1. Intel GPU integration with Triton and Pytorch:
14 | - No strong requirement from PyTorch for specific backends to be part of Triton official release.
15 | - Can use a separate branch/fork for CI/CD and testing.
16 | - Intel team will work with Pytorch offline to close.
17 | 2. AMD GPU backend update:
18 | - AMD team shared the refactored design for AMD backend.
19 | - The new design is modularized and reduces clutter and duplication in upstream Triton.
20 | - Further work needed for regression testing and secure runners.
21 | 3. Proton profiler update:
22 | - Keren from the OpenAI team presented a new profiler tool for Triton kernels, which supports multiple vendors, metrics, and formats.
23 | - Outlined the plan for open-sourcing, integrating, and extending the tool.
24 |
--------------------------------------------------------------------------------
/docs/meetups/08-22-2023/amd-update.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/amd-update.pdf
--------------------------------------------------------------------------------
/docs/meetups/08-22-2023/intel-xpu-update.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/intel-xpu-update.pptx
--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/intel-xpu-update.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/intel-xpu-update.pdf
--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/notes.md:
--------------------------------------------------------------------------------
1 | #### Agenda:
2 |
3 | ##### Items:
4 | 1. H100 updates
5 | 2. Triton-Shared layer updates
6 | 3. Intel update
7 | 4. Open discussion
8 |
9 | ##### Minutes:
10 | Recording link [here](https://youtu.be/KZAzpKx1ebI)
11 |
12 | 1. H100 updates
13 | - Enabled WGMMA by default, now any matmul can reuse it.
14 | - fp8 formats enabled – 1.3 Petaflops on dense matmul on H100 (gemm performance)
15 | - Enabled Flash Attention using wgmma, resulting in 450 teraflop on fwd pass and 250 on backward pass – still working on perf for flash attention
16 | - fp8 numbers with flash attention running in fp8 with matmul is tricky, because the fp8 layout is significantly different than what is returned by wgmma, still wip
17 |
18 | 2. Triton-Shared layer
19 | - Please refer to slides for more details
20 | - Created a repo where you can find the middle layer
21 | - Available as a plugin into triton
22 |
23 | 3. Intel Update
24 | - Please refer to slides for more details
25 |
--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/triton-shared.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/triton-shared.pptx
--------------------------------------------------------------------------------
/docs/meetups/12-13-2023/notes.md:
--------------------------------------------------------------------------------
1 | #### Agenda:
2 |
3 | ##### Items:
4 | 1. Refactoring plan for 3rd party backends
5 | 2. Front end refactoring (AMD)
6 | 3. Things like block pointers, ptr_analysis, mask_analysis can be used for GPUs, is there a plan to incrementally include components from Triton shared for GPU development.
7 |
8 | ##### Minutes:
9 | Recording link [here](https://youtu.be/Lo43DQYkOWM)
10 |
11 | 1. Refactoring plan for 3rd party backends
12 | - Refactoring to be completed by end of the year so that all GPU backends can be individual passes on Triton GPU IR instead of being completely out of tree. The goal is for users to get other GPUs besides Cuda when they install Triton. Non-GPU Triton IR expected to stay as is.
13 | 2. Front end refactoring (AMD)
14 | - Will work with Phil for AMD related refactoring. Will share more details in next meetup about where AMD has diverged from Triton GPU IR and in the codeflow.
15 | 3. Things like block pointers, ptr_analysis, mask_analysis can be used for GPUs, is there a plan to incrementally include components from Triton shared for GPU development.
16 | - Can look at it on a case by case basis.
17 |
--------------------------------------------------------------------------------
/docs/meetups/dev-meetup-2023.md:
--------------------------------------------------------------------------------
1 | The conference slides are available [here](https://drive.google.com/drive/folders/1yDFc4ElNN_GGhWDdMlM4wcm5uFEFFVQk?usp=sharing)
2 |
3 | The conference videos will be available [here](https://youtube.com/playlist?list=PLc_vA1r0qoiRZfUC3o4_yjj0FtWvodKAz&feature=shared) when ready.
4 |
5 | # Triton Developer Conference
6 | The Triton Developer Conference was held in a hybrid mode at the Microsoft Silicon Valley Campus in Mountain View, California. The conference was held on September 20th from 10am to 4pm, followed by a reception till 5:30 pm.
7 |
8 | Agenda for the conference:
9 |
10 | |Time |Title |Speaker
11 | |--------|-------|-------|
12 | |10:00 AM|Welcome|Kevin Scott (Microsoft)|
13 | |10:20 AM|The Triton Compiler: Past, Present and Future|Phil Tillet (OpenAI)|
14 | |11:00 AM|**Break**||
15 | |11:20 AM|Hopper support in Triton|Gustav Zhu (Nvidia)|
16 | |11:40 AM|Bringing Triton to AMD GPUs|Jason Furmanek, Lixun Zhang (AMD)|
17 | |12:00 PM|Intel XPU Backend for Triton|Eikan Wang (Intel)|
18 | |12:20 PM|Vectorization of Triton Kernels for Qualcomm Hexagon Backend|Javed Absar (Qualcomm)|
19 | |12:30 PM|**Lunch**||
20 | |1:40 PM |Triton for MTIA|Roman Levenstein et al, (Meta)|
21 | |2:00 PM |Using Triton IR for high-performance fusions in XLA|George Karpenkov (Google)|
22 | |2:20 PM |Triton for All: Triton as a device-independent language|Ian Bearman (Microsoft)|
23 | |2:40 PM|**Break**||
24 | |3:00 PM|PyTorch 2.0 and TorchInductor|Jason Ansel, Horace He (Meta)|
25 | |3:20 PM|Pallas: A JAX Kernel Language|Sharad Vikram (Google)|
26 | |3:40 PM|Writing Grouped GEMMs in Triton|Vinod Grover (Nvidia)|
27 | |4:00 PM|**Reception**||
28 |
--------------------------------------------------------------------------------
/docs/programming-guide/chapter-1/cuda-parallel-matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/cuda-parallel-matmul.png
--------------------------------------------------------------------------------
/docs/programming-guide/chapter-1/triton-parallel-matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/triton-parallel-matmul.png
--------------------------------------------------------------------------------
/docs/programming-guide/chapter-2/halide-iteration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/halide-iteration.png
--------------------------------------------------------------------------------
/docs/programming-guide/chapter-2/polyhedral-iteration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/polyhedral-iteration.png
--------------------------------------------------------------------------------
/docs/python-api/triton.rst:
--------------------------------------------------------------------------------
1 | triton
2 | ======
3 |
4 | .. currentmodule:: triton
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :nosignatures:
9 |
10 | jit
11 | autotune
12 | heuristics
13 | Config
14 |
--------------------------------------------------------------------------------
/docs/python-api/triton.testing.rst:
--------------------------------------------------------------------------------
1 | triton.testing
2 | ==============
3 |
4 | .. currentmodule:: triton.testing
5 |
6 | .. autosummary::
7 | :toctree: generated
8 | :nosignatures:
9 |
10 | Benchmark
11 | do_bench
12 | do_bench_cudagraph
13 | perf_report
14 |
--------------------------------------------------------------------------------
/include/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(triton)
2 |
--------------------------------------------------------------------------------
/include/triton/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Conversion)
2 | add_subdirectory(Dialect)
3 | add_subdirectory(Target)
4 |
--------------------------------------------------------------------------------
/include/triton/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonCPUToLLVM)
2 | add_subdirectory(TritonGPUToLLVM)
3 | add_subdirectory(TritonToTritonCPU)
4 | add_subdirectory(TritonToTritonGPU)
5 |
--------------------------------------------------------------------------------
/include/triton/Conversion/MLIRTypes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_MLIR_TYPES_H
2 | #define TRITON_CONVERSION_MLIR_TYPES_H
3 |
4 | #include "mlir/Transforms/DialectConversion.h"
5 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
6 |
7 | // This file redefines some common MLIR types for easy usage.
8 | namespace mlir {
9 | namespace triton {
10 | namespace type {
11 |
12 | // Integer types
13 | inline Type i32Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 32); }
14 | inline Type i16Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 16); }
15 | inline Type i8Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 8); }
16 | inline Type u32Ty(MLIRContext *ctx) {
17 | return IntegerType::get(ctx, 32, IntegerType::Unsigned);
18 | }
19 | inline Type u1Ty(MLIRContext *ctx) {
20 | return IntegerType::get(ctx, 1, IntegerType::Unsigned);
21 | }
22 |
23 | // Float types
24 | inline Type f16Ty(MLIRContext *ctx) { return FloatType::getF16(ctx); }
25 | inline Type f32Ty(MLIRContext *ctx) { return FloatType::getF32(ctx); }
26 | inline Type f64Ty(MLIRContext *ctx) { return FloatType::getF64(ctx); }
27 | inline Type bf16Ty(MLIRContext *ctx) { return FloatType::getBF16(ctx); }
28 |
29 | inline bool isFloat(Type type) {
30 | return type.isF32() || type.isF64() || type.isF16() || type.isF128();
31 | }
32 |
33 | inline bool isInt(Type type) { return type.isIntOrFloat() && !isFloat(type); }
34 |
35 | } // namespace type
36 | } // namespace triton
37 | } // namespace mlir
38 |
39 | #endif // TRITON_CONVERSION_MLIR_TYPES_H
40 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUToLLVM)
3 | add_public_tablegen_target(TritonCPUConversionPassIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/Passes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H
2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5 | #include "mlir/Pass/Pass.h"
6 | #include "mlir/Transforms/DialectConversion.h"
7 |
8 | #include <memory>
9 |
10 | namespace mlir {
11 |
12 | class ModuleOp;
13 | template <typename T> class OperationPass;
14 |
15 | namespace triton {
16 |
17 | #define GEN_PASS_DECL
18 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc"
19 |
20 | std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonCPUToLLVMPass();
21 |
22 | #define GEN_PASS_REGISTRATION
23 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc"
24 |
25 | } // namespace triton
26 |
27 | } // namespace mlir
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/Passes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_CONVERSION_PASSES
2 | #define TRITONCPU_CONVERSION_PASSES
3 |
4 | include "mlir/Pass/PassBase.td"
5 |
6 | def ConvertTritonCPUToLLVM : Pass<"convert-triton-cpu-to-llvm", "mlir::ModuleOp"> {
7 | let summary = "Convert TritonCPU to LLVM";
8 | let description = [{
9 |
10 | }];
11 | let constructor = "mlir::triton::createConvertTritonCPUToLLVMPass()";
12 |
13 | let dependentDialects = ["mlir::arith::ArithDialect",
14 | "mlir::LLVM::LLVMDialect",
15 | "mlir::math::MathDialect",
16 | "mlir::scf::SCFDialect",
17 | "mlir::tensor::TensorDialect",
18 | "mlir::triton::cpu::TritonCPUDialect",
19 | "mlir::triton::TritonDialect"];
20 |
21 | let options = [
22 | ];
23 | }
24 |
25 | #endif
26 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/PatternTritonCPUOpToLLVM.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H
2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5 | #include "triton/Dialect/TritonCPU/IR/Dialect.h"
6 |
7 | using namespace mlir;
8 | using namespace mlir::triton;
9 |
10 | namespace mlir {
11 | namespace triton {
12 | // Some populate* functions have name collisions with the ones for GPUs.
13 | namespace cpu {
14 |
15 | constexpr int patternBenefitDefault = 1;
16 | constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
17 | constexpr int patternBenefitClampOptimizedPattern = 20;
18 | constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
19 |
20 | void populateControlFlowOpToLLVMPattern(LLVMTypeConverter &typeConverter,
21 | RewritePatternSet &patterns,
22 | PatternBenefit benefit);
23 |
24 | void populateFuncOpConversionPattern(LLVMTypeConverter &typeConverter,
25 | RewritePatternSet &patterns,
26 | PatternBenefit benefit);
27 |
28 | void populatePrintOpToLLVMPattern(LLVMTypeConverter &typeConverter,
29 | RewritePatternSet &patterns,
30 | PatternBenefit benefit);
31 |
32 | } // namespace cpu
33 | } // namespace triton
34 | } // namespace mlir
35 |
36 | #endif
37 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/TypeConverter.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H
2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
6 | #include "triton/Conversion/MLIRTypes.h"
7 | #include "triton/Dialect/TritonCPU/IR/Types.h"
8 |
9 | using namespace mlir;
10 | using namespace mlir::triton;
11 |
12 | class TritonCPUToLLVMTypeConverter : public LLVMTypeConverter {
13 | public:
14 | using TypeConverter::convertType;
15 |
16 | TritonCPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option,
17 | const DataLayoutAnalysis *analysis = nullptr);
18 |
19 | Type convertTritonPointerType(triton::PointerType type);
20 | };
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonCPUToLLVM/Utility.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H
2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/Pattern.h"
5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
6 | #include "triton/Analysis/Utility.h"
7 | #include "triton/Conversion/MLIRTypes.h"
8 | #include "triton/Dialect/Triton/IR/Utility.h"
9 | #include "triton/Dialect/TritonCPU/IR/Dialect.h"
10 | #include "llvm/Support/ErrorHandling.h"
11 |
12 | using namespace mlir;
13 | using namespace mlir::triton;
14 |
15 | namespace mlir {
16 | namespace LLVM {
17 |
18 | // TODO: Not sure we need this for CPU backends.
19 | inline bool isKernel(FunctionOpInterface funcOp) {
20 | return funcOp.getVisibility() == SymbolTable::Visibility::Public;
21 | }
22 |
23 | } // namespace LLVM
24 | } // namespace mlir
25 |
26 | #endif
27 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
3 |
4 | #include "mlir/IR/Value.h"
5 | #include "triton/Dialect/Triton/IR/Dialect.h"
6 | #include "llvm/ADT/SmallVector.h"
7 | #include "llvm/ADT/StringExtras.h"
8 | #include "llvm/ADT/StringRef.h"
9 | #include <memory>
10 | #include <string>
11 |
12 | namespace mlir {
13 | class ConversionPatternRewriter;
14 | class Location;
15 |
16 | namespace triton {
17 | using llvm::StringRef;
18 |
19 | inline std::string strJoin(llvm::ArrayRef<std::string> strs,
20 | llvm::StringRef delimiter) {
21 | return llvm::join(strs.begin(), strs.end(), delimiter);
22 | }
23 |
24 | } // namespace triton
25 | } // namespace mlir
26 |
27 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
28 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonGPUToLLVM)
3 | add_public_tablegen_target(TritonGPUConversionPassIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/Passes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5 | #include "mlir/Pass/Pass.h"
6 | #include "mlir/Transforms/DialectConversion.h"
7 |
8 | #include <memory>
9 |
10 | namespace mlir {
11 |
12 | class ModuleOp;
13 | template <typename T> class OperationPass;
14 |
15 | namespace triton {
16 |
17 | #define GEN_PASS_DECL
18 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
19 |
20 | namespace gpu {
21 | std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass();
22 |
23 | } // namespace gpu
24 |
25 | #define GEN_PASS_REGISTRATION
26 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
27 |
28 | } // namespace triton
29 |
30 | } // namespace mlir
31 |
32 | #endif
33 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/Passes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCOMMONGPU_CONVERSION_PASSES
2 | #define TRITONCOMMONGPU_CONVERSION_PASSES
3 |
4 | include "mlir/Pass/PassBase.td"
5 |
6 | def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> {
7 | let summary = "Add metadata for shared memory allocation";
8 | let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()";
9 | }
10 |
11 | #endif
12 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/Patterns.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H
2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H
3 |
4 | #include <functional>
5 |
6 | namespace mlir {
7 | class ModuleOp;
8 | class RankedTensorType;
9 |
10 | namespace triton::gpu {
11 |
12 | /// Replaces `blocked -> dot_op` with `blocked -> shared -> dot_op` in the given
13 | /// |module| op because the codegen doesn't handle `blocked -> dot_op` directly.
14 | void decomposeBlockedToDotLayoutConversion(ModuleOp module);
15 |
16 | /// Replaces `splat -> shared` with `splat -> blocked -> shared` in the given
17 | /// |module| op.
18 | void decomposeSplatOpToSharedLayoutConversion(ModuleOp module);
19 |
20 | /// Replaces `mma/mfma -> dot_op` with `mma/mfma -> blocked -> dot_op` in the
21 | /// given |module| op, but bypass the decomposition if |shortcutFn| returns
22 | /// true.
23 | using ShortcutFn = std::function<bool(RankedTensorType, RankedTensorType)>;
24 | template <typename TensorCoreEncodingAttr>
25 | void decomposeTensorCoreToDotLayoutConversion(ModuleOp module,
26 | ShortcutFn shortcutFn);
27 |
28 | } // namespace triton::gpu
29 |
30 | } // namespace mlir
31 |
32 | #endif
33 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H
2 | #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H
3 |
4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
6 | #include "triton/Conversion/MLIRTypes.h"
7 | #include "triton/Dialect/TritonGPU/IR/Types.h"
8 |
9 | using namespace mlir;
10 | using namespace mlir::triton;
11 |
12 | class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter {
13 | public:
14 | using TypeConverter::convertType;
15 |
16 | TritonGPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option,
17 | const DataLayoutAnalysis *analysis = nullptr);
18 |
19 | Type getElementTypeForStruct(TensorOrMemDesc type);
20 | Type convertTritonPointerType(triton::PointerType type);
21 | Type convertTritonTensorType(RankedTensorType type);
22 | Type convertMemDescType(MemDescType type);
23 | Type convertAsyncToken(triton::gpu::AsyncTokenType type);
24 | };
25 |
26 | #endif
27 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonCPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonCPU)
3 | add_public_tablegen_target(TritonConversionToCPUPassIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonCPU/Passes.h:
--------------------------------------------------------------------------------
#ifndef TRITON_CONVERSION_TO_CPU_PASSES_H
#define TRITON_CONVERSION_TO_CPU_PASSES_H

#include "triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h"

namespace mlir {
namespace triton {

// Pull in the tablegen-generated registration hooks for the
// TritonToTritonCPU conversion passes declared in Passes.td.
#define GEN_PASS_REGISTRATION
#include "triton/Conversion/TritonToTritonCPU/Passes.h.inc"

} // namespace triton
} // namespace mlir

#endif
16 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonCPU/Passes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TO_CPU_PASSES
2 | #define TRITON_CONVERSION_TO_CPU_PASSES
3 |
4 | include "mlir/Pass/PassBase.td"
5 |
// Module-level conversion pass from the Triton dialect to the TritonCPU
// dialect, exposed on the command line as "convert-triton-to-tritoncpu".
def ConvertTritonToTritonCPU: Pass<"convert-triton-to-tritoncpu", "mlir::ModuleOp"> {
  let summary = "Convert Triton to TritonCPU";
  let description = [{

  }];
  let constructor = "mlir::triton::createConvertTritonToTritonCPUPass()";

  let dependentDialects = ["mlir::arith::ArithDialect",
                           "mlir::math::MathDialect",
                           "mlir::scf::SCFDialect",
                           "mlir::triton::cpu::TritonCPUDialect",
                           "mlir::triton::TritonDialect"];

  // No pass options yet.
  let options = [
  ];
}
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h:
--------------------------------------------------------------------------------
#ifndef TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H
#define TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H

#include <memory>

namespace mlir {

class ModuleOp;
template <typename T> class OperationPass;

namespace triton {

// Creates the module pass converting the Triton dialect to the TritonCPU
// dialect (registered as "convert-triton-to-tritoncpu" in Passes.td).
// Template arguments below were restored: the original text had lost every
// angle-bracketed span (e.g. "std::unique_ptr>", bare "#include").
std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonToTritonCPUPass();

} // namespace triton
} // namespace mlir

#endif
19 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
# Generate pass declarations (Passes.h.inc) for the TritonToTritonGPU passes.
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonGPU)
add_public_tablegen_target(TritonConversionToGPUPassIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/Passes.h:
--------------------------------------------------------------------------------
#ifndef TRITON_CONVERSION_TO_GPU_PASSES_H
#define TRITON_CONVERSION_TO_GPU_PASSES_H

#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h"

namespace mlir {
namespace triton {

// Pull in the tablegen-generated registration hooks for the
// TritonToTritonGPU conversion passes declared in Passes.td.
#define GEN_PASS_REGISTRATION
#include "triton/Conversion/TritonToTritonGPU/Passes.h.inc"

} // namespace triton
} // namespace mlir

#endif
16 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/Passes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES
2 | #define TRITON_CONVERSION_TO_GPU_PASSES
3 |
4 | include "mlir/Pass/PassBase.td"
5 |
// Module-level conversion pass from the Triton dialect to the TritonGPU
// dialect, exposed on the command line as "convert-triton-to-tritongpu".
// The options mirror the explicit-argument factory declared in
// TritonToTritonGPUPass.h.
def ConvertTritonToTritonGPU: Pass<"convert-triton-to-tritongpu", "mlir::ModuleOp"> {
  let summary = "Convert Triton to TritonGPU";
  let description = [{

  }];
  let constructor = "mlir::triton::createConvertTritonToTritonGPUPass()";

  let dependentDialects = ["mlir::arith::ArithDialect",
                           "mlir::math::MathDialect",
                           // TODO: Does this pass depend on SCF?
                           "mlir::scf::SCFDialect",
                           "mlir::triton::TritonDialect",
                           "mlir::triton::gpu::TritonGPUDialect"];

  let options = [
    Option<"numWarps", "num-warps",
           "int32_t", /*default*/"4",
           "number of warps">,

    Option<"threadsPerWarp", "threads-per-warp",
           "int32_t", /*default*/"32",
           "number of threads per warp">,
    Option<"numCTAs", "num-ctas",
           "int32_t", /*default*/"1",
           "number of ctas in a cga">,
    Option<"computeCapability", "compute-capability",
           "int32_t", /*default*/"80",
           "compute capability">
  ];
}
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h:
--------------------------------------------------------------------------------
#ifndef TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H
#define TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H

#include <memory>

namespace mlir {

class ModuleOp;
template <typename T> class OperationPass;

namespace triton {

// Names of the module attributes that carry the lowering parameters set by
// this conversion.
constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps";
constexpr static char AttrNumCTAsName[] = "triton_gpu.num-ctas";
constexpr static char AttrComputeCapabilityName[] =
    "triton_gpu.compute-capability";

constexpr static char AttrNumThreadsPerWarp[] = "triton_gpu.threads-per-warp";

// Create the pass with numWarps passed from cl::opt.
// (Template arguments restored: the original text had lost every
// angle-bracketed span, e.g. "std::unique_ptr>" and a bare "#include".)
std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonToTritonGPUPass();

// Create the pass with numWarps set explicitly.
std::unique_ptr<OperationPass<ModuleOp>>
createConvertTritonToTritonGPUPass(int numWarps, int threadsPerWarp = 32,
                                   int numCTAs = 1, int computeCapability = 80);

} // namespace triton
} // namespace mlir

#endif
32 |
--------------------------------------------------------------------------------
/include/triton/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sub-dialects that make up the Triton IR stack.
add_subdirectory(Triton)
add_subdirectory(TritonCPU)
add_subdirectory(TritonGPU)
add_subdirectory(TritonNvidiaGPU)
add_subdirectory(NVGPU)
6 |
--------------------------------------------------------------------------------
/include/triton/Dialect/NVGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(IR)
# Transforms directory is not built yet.
#add_subdirectory(Transforms)
3 |
--------------------------------------------------------------------------------
/include/triton/Dialect/NVGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

# Tablegen rules for the NVGPU dialect, its ops, enums, and LLVM-IR
# conversion patterns.
set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(NVGPUTableGen)

# Attribute definitions are generated separately.
set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td)
mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
add_public_tablegen_target(NVGPUAttrDefsIncGen)
19 |
--------------------------------------------------------------------------------
/include/triton/Dialect/NVGPU/IR/NVGPUAttrDefs.td:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining
4 | // a copy of this software and associated documentation files
5 | // (the "Software"), to deal in the Software without restriction,
6 | // including without limitation the rights to use, copy, modify, merge,
7 | // publish, distribute, sublicense, and/or sell copies of the Software,
8 | // and to permit persons to whom the Software is furnished to do so,
9 | // subject to the following conditions:
10 | //
11 | // The above copyright notice and this permission notice shall be
12 | // included in all copies or substantial portions of the Software.
13 | //
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
22 | #ifndef NVGPU_ATTRDEFS
23 | #define NVGPU_ATTRDEFS
24 |
25 | include "triton/Dialect/NVGPU/IR/NVGPUDialect.td"
26 | include "mlir/IR/AttrTypeBase.td"
27 |
// Base class for NVGPU attribute definitions, anchored to NVGPU_Dialect.
// NOTE(review): the template parameter list was reconstructed after
// angle-bracketed text was stripped from the original — confirm the exact
// parameters (e.g. whether an attrMnemonic parameter exists) upstream.
class NVGPU_Attr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
    : AttrDef<NVGPU_Dialect, name, traits, baseCppClass> {
}
32 |
33 | #endif
34 |
--------------------------------------------------------------------------------
/include/triton/Dialect/NVGPU/IR/NVGPUDialect.td:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining
4 | // a copy of this software and associated documentation files
5 | // (the "Software"), to deal in the Software without restriction,
6 | // including without limitation the rights to use, copy, modify, merge,
7 | // publish, distribute, sublicense, and/or sell copies of the Software,
8 | // and to permit persons to whom the Software is furnished to do so,
9 | // subject to the following conditions:
10 | //
11 | // The above copyright notice and this permission notice shall be
12 | // included in all copies or substantial portions of the Software.
13 | //
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
22 | #ifndef NVGPU_DIALECT
23 | #define NVGPU_DIALECT
24 |
25 | include "mlir/IR/OpBase.td"
26 |
// NVIDIA-specific dialect living in ::mlir::triton::nvgpu; depends on the
// LLVM dialect.
def NVGPU_Dialect : Dialect {
  let name = "nvgpu";
  let cppNamespace = "::mlir::triton::nvgpu";

  let description = [{
    NVGPU Dialect.
  }];

  let dependentDialects = [
    "mlir::LLVM::LLVMDialect"
  ];
}
39 |
40 | #endif
41 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/CMakeLists.txt:
--------------------------------------------------------------------------------
# Triton dialect: IR definitions and transformation passes.
add_subdirectory(IR)
add_subdirectory(Transforms)
3 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

# Tablegen rules for the Triton dialect: ops, dialect, types, and interfaces.
set(LLVM_TARGET_DEFINITIONS TritonOps.td)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc)

set(LLVM_TARGET_DEFINITIONS TritonDialect.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs)
add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc)

set(LLVM_TARGET_DEFINITIONS TritonTypes.td)
mlir_tablegen(Types.h.inc -gen-typedef-decls)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs)

set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td)
mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls)
mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs)

set(LLVM_TARGET_DEFINITIONS TritonTypeInterfaces.td)
mlir_tablegen(TritonTypeInterfaces.h.inc -gen-type-interface-decls)
mlir_tablegen(TritonTypeInterfaces.cpp.inc -gen-type-interface-defs)

add_public_tablegen_target(TritonTableGen)
28 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/Interfaces.h:
--------------------------------------------------------------------------------
#ifndef TRITON_IR_INTERFACES_H_
#define TRITON_IR_INTERFACES_H_

#include "mlir/IR/OpDefinition.h"

// NOTE(review): GET_TYPEDEF_CLASSES normally guards typedef declarations,
// yet the include below is the attr-interface file — confirm the macro is
// actually needed here.
#define GET_TYPEDEF_CLASSES
#include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc"

#endif // TRITON_IR_INTERFACES_H_
10 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/TritonDialect.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_DIALECT
2 | #define TRITON_DIALECT
3 |
4 | include "mlir/IR/OpBase.td"
5 |
// The core Triton dialect ("tt"), living in ::mlir::triton.
def Triton_Dialect : Dialect {
  let name = "tt";

  let cppNamespace = "::mlir::triton";

  let summary = "The Triton IR in MLIR";

  let description = [{
    Triton Dialect.

    Dependent Dialects:
      * Arith:
        * addf, addi, andi, cmpf, cmpi, divf, fptosi, ...
      * Math:
        * exp, sin, cos, log, ...
      * StructuredControlFlow:
        * for, if, while, yield, condition
      * ControlFlow:
        * br, cond_br
  }];

  let dependentDialects = [
    "arith::ArithDialect",
    "math::MathDialect",
    "scf::SCFDialect",
    "cf::ControlFlowDialect"
  ];

  // registerTypes() is implemented by the dialect to attach the generated
  // type definitions.
  let extraClassDeclaration = [{
    void registerTypes();
  }];

  let hasConstantMaterializer = 1;
  let useDefaultTypePrinterParser = 1;
  let usePropertiesForAttributes = 1;
}
42 |
43 | include "triton/Dialect/Triton/IR/TritonTypes.td"
44 |
45 |
46 | #endif // TRITON_DIALECT
47 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/TritonInterfaces.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_INTERFACES
2 | #define TRITON_INTERFACES
3 |
4 | include "mlir/IR/OpBase.td"
5 |
// Native op traits implemented in C++; attached to Triton ops to enforce
// shape/encoding invariants between operands and results.
def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">;
def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">;
def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">;
def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">;
def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">;
def SameLoadStoreOperandsAndResultShape : NativeOpTrait<"SameLoadStoreOperandsAndResultShape">;
def SameLoadStoreOperandsEncoding : NativeOpTrait<"SameLoadStoreOperandsEncoding">;
def SameLoadStoreOperandsAndResultEncoding : NativeOpTrait<"SameLoadStoreOperandsAndResultEncoding">;
14 |
15 | #endif // TRITON_INTERFACES
16 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_TYPE_INTERFACES
2 | #define TRITON_TYPE_INTERFACES
3 |
4 | include "mlir/IR/OpBase.td"
5 |
6 | // Interface dynamically attached to RankedTensorType and MemDescType.
// Interface dynamically attached to RankedTensorType and MemDescType.
// (Restored the stripped element type in getShape's return type: it must be
// llvm::ArrayRef<int64_t> to match MLIR shaped-type APIs.)
def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> {
  let cppNamespace = "::mlir";
  let methods = [
    InterfaceMethod<"Returns the encoding of the tensor or memory descriptor",
                    "mlir::Attribute", "getEncoding", (ins)>,
    InterfaceMethod<"Returns element type",
                    "mlir::Type", "getElementType", (ins)>,
    InterfaceMethod<"Returns the type shape",
                    "llvm::ArrayRef<int64_t>", "getShape", (ins)>,
    InterfaceMethod<"Returns the tensor or buffer rank",
                    "int64_t", "getRank", (ins)>,
    InterfaceMethod<"Returns the element type bit width",
                    "int64_t", "getElementTypeBitWidth", (ins)>,
  ];
}
23 |
24 | #endif // TRITON_TYPE_INTERFACES
25 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/Types.h:
--------------------------------------------------------------------------------
#ifndef TRITON_IR_TYPES_H_
#define TRITON_IR_TYPES_H_

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/TypeSupport.h"
#include "mlir/IR/Types.h"

#define GET_TYPEDEF_CLASSES
#include "triton/Dialect/Triton/IR/Types.h.inc"

#include "triton/Dialect/Triton/IR/TritonTypeInterfaces.h.inc"

namespace mlir {

namespace triton {

// Type-classification helpers for Triton pointer/tensor types.
bool isTensorPointerType(Type type);

bool isTensorOrTensorPointerType(Type type);

// Bit width of the pointee of a (tensor-of-)pointer type.
unsigned getPointeeBitWidth(Type type);

Type getPointeeType(Type type);

Type getPointerType(Type type);

Type getElementTypeOfTensorPointerType(Type type);

// Helpers that build a type with the same shape as `type` but a different
// element type (i1, i32, or pointer-to-element).
Type getI1SameShape(Type type);

Type getI32SameShape(Type type);

Type getPointerTypeSameShape(Type type);

} // namespace triton

} // namespace mlir

#endif // TRITON_IR_TYPES_H_
40 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
# Generate pass declarations (Passes.h.inc) for the Triton dialect passes.
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton)
add_public_tablegen_target(TritonTransformsIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/Passes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_
2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_
3 |
4 | #include "mlir/Pass/Pass.h"
5 |
6 | namespace mlir {
7 | namespace triton {
8 |
9 | std::unique_ptr createCombineOpsPass();
10 |
11 | std::unique_ptr createReorderBroadcastPass();
12 | std::unique_ptr createRewriteTensorPointerPass();
13 |
14 | } // namespace triton
15 |
16 | #define GEN_PASS_REGISTRATION
17 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc"
18 |
19 | } // namespace mlir
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/CMakeLists.txt:
--------------------------------------------------------------------------------
# TritonCPU dialect: IR definitions and transformation passes.
add_subdirectory(IR)
add_subdirectory(Transforms)
3 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/Attributes.h:
--------------------------------------------------------------------------------
#ifndef TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_
#define TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_

#include "triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h"

// Pull in the tablegen-generated attribute class declarations.
#define GET_ATTRDEF_CLASSES
#include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.h.inc"

#endif // TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_
10 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

# Tablegen rules for the TritonCPU dialect, its ops, and its types.
set(LLVM_TARGET_DEFINITIONS TritonCPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_cpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_cpu)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_cpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_cpu)
add_mlir_doc(TritonCPUDialect TritonCPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonCPUOps TritonCPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonCPUTableGen)

# Attribute interfaces, attribute definitions, and enums.
set(LLVM_TARGET_DEFINITIONS TritonCPUAttrDefs.td)
mlir_tablegen(TritonCPUAttrInterfaces.h.inc -gen-attr-interface-decls)
mlir_tablegen(TritonCPUAttrInterfaces.cpp.inc -gen-attr-interface-defs)
mlir_tablegen(TritonCPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(TritonCPUAttrDefs.cpp.inc -gen-attrdef-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(TritonCPUAttrDefsIncGen)
21 | add_public_tablegen_target(TritonCPUAttrDefsIncGen)
22 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/Dialect.h:
--------------------------------------------------------------------------------
#ifndef TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_
#define TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_

#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dialect.h"

// TritonCPU depends on Triton
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonCPU/IR/Attributes.h"
#include "triton/Dialect/TritonCPU/IR/Dialect.h.inc"
#include "triton/Dialect/TritonCPU/IR/Types.h"

// Pull in the tablegen-generated op class declarations.
#define GET_OP_CLASSES
#include "triton/Dialect/TritonCPU/IR/Ops.h.inc"

#endif // TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_
18 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_ATTRDEFS
2 | #define TRITONCPU_ATTRDEFS
3 |
4 | include "mlir/IR/AttrTypeBase.td"
5 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td"
6 | include "triton/Dialect/Triton/IR/TritonInterfaces.td"
7 |
8 | //===----------------------------------------------------------------------===//
9 | // TritonCPU Attribute Definitions
10 | //===----------------------------------------------------------------------===//
11 | def TritonCPU_AttrTrait : AttrInterface<"TritonCPU_AttrTrait"> {
12 | let cppNamespace = "::mlir::triton::cpu";
13 | }
14 |
// Base class for TritonCPU attribute definitions; every derived attribute
// implements the TritonCPU_AttrTrait interface defined above.
// NOTE(review): the template parameter list and AttrDef arguments were
// reconstructed after angle-bracketed text was stripped; the shape follows
// the analogous TritonGPU_Attr class — confirm against upstream.
class TritonCPU_Attr<string name, string attrMnemonic, list<Trait> traits = [],
                     Dialect dialect = TritonCPU_Dialect,
                     string baseCppClass = "::mlir::Attribute">
    : AttrDef<dialect, name, !listconcat([TritonCPU_AttrTrait], traits),
              baseCppClass> {

  let description = [{
    WIP...
  }];
}
24 |
25 | #endif
26 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/TritonCPUDialect.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_DIALECT
2 | #define TRITONCPU_DIALECT
3 |
4 | include "mlir/IR/OpBase.td"
5 |
// CPU-lowering dialect ("triton_cpu") in ::mlir::triton::cpu; depends on
// the Triton and Tensor dialects.
def TritonCPU_Dialect : Dialect {
  let name = "triton_cpu";

  let cppNamespace = "::mlir::triton::cpu";

  let hasOperationAttrVerify = 1;

  let description = [{
    Triton CPU Dialect.
  }];

  let dependentDialects = [
    "triton::TritonDialect",
    "tensor::TensorDialect",
  ];

  // registerTypes() is implemented by the dialect to attach the generated
  // type definitions.
  let extraClassDeclaration = [{
    void registerTypes();
  }];

  let useDefaultTypePrinterParser = 1;
}
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h:
--------------------------------------------------------------------------------
#ifndef TRITON_CPU_DIALECT_INTERFACES_H
#define TRITON_CPU_DIALECT_INTERFACES_H

// Pull in the tablegen-generated attribute interface declarations.
#include "triton/Dialect/TritonCPU/IR/TritonCPUAttrInterfaces.h.inc"

#endif // TRITON_CPU_DIALECT_INTERFACES_H
7 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/TritonCPUOps.td:
--------------------------------------------------------------------------------
#ifndef TRITONCPU_OPS
#define TRITONCPU_OPS

// Umbrella file for TritonCPU op definitions; no ops are defined here yet —
// it only aggregates the dialect, type, and attribute includes.
include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td"
include "triton/Dialect/TritonCPU/IR/TritonCPUTypes.td"
include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td"
include "mlir/Dialect/Arith/IR/ArithBase.td"
include "triton/Dialect/Triton/IR/TritonTypes.td"
include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
include "mlir/IR/OpBase.td"

#endif
13 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/TritonCPUTypes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONCPU_TYPES
2 | #define TRITONCPU_TYPES
3 |
4 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td"
5 | include "mlir/IR/AttrTypeBase.td"
6 |
// Base class for TritonCPU type definitions; binds the tablegen name and
// assembly mnemonic to the triton_cpu dialect.  (Template parameters
// restored after angle-bracketed text was stripped; the three-argument
// shape matches the analogous TTG_TypeDef and its uses below.)
class TTC_TypeDef<string name, string _mnemonic, list<Trait> traits = []>
    : TypeDef<TritonCPU_Dialect, name, traits> {
  let mnemonic = _mnemonic;
}
11 |
// Token type carrying a single int32 discriminator; printed/parsed with a
// custom assembly format.
def TTC_TokenType : TTC_TypeDef<"Token", "token"> {
  let parameters = (ins "int32_t":$type);

  let builders = [
    TypeBuilder<(ins "unsigned":$type), [{
      return $_get($_ctxt, type);
    }]>
  ];

  let hasCustomAssemblyFormat = 1;

  let skipDefaultBuilders = 1;
}
25 |
26 | #endif
27 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/IR/Types.h:
--------------------------------------------------------------------------------
#ifndef TRITONCPU_IR_TYPES_H_
#define TRITONCPU_IR_TYPES_H_

#include "mlir/IR/TypeSupport.h"
#include "mlir/IR/Types.h"

// Pull in the tablegen-generated type class declarations.
#define GET_TYPEDEF_CLASSES
#include "triton/Dialect/TritonCPU/IR/Types.h.inc"

#endif // TRITONCPU_IR_TYPES_H_
11 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
# Generate pass declarations (Passes.h.inc) for the TritonCPU passes.
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonCPU)
add_public_tablegen_target(TritonCPUTransformsIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/Transforms/Passes.h:
--------------------------------------------------------------------------------
#ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_
#define TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_

#include "mlir/Pass/Pass.h"

namespace mlir {
namespace triton {
// No TritonCPU passes are declared yet; the namespace is reserved.
namespace cpu {} // namespace cpu
} // namespace triton

/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "triton/Dialect/TritonCPU/Transforms/Passes.h.inc"

} // namespace mlir
#endif
17 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/Transforms/Passes.td:
--------------------------------------------------------------------------------
#ifndef TRITONCPU_PASSES
#define TRITONCPU_PASSES

// Placeholder: no TritonCPU transformation passes are defined yet.
include "mlir/Pass/PassBase.td"

#endif
7 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonCPU/Transforms/TritonCPUConversion.h:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // Defines utilities to use while converting to the TritonCPU dialect.
4 | //
5 | //===----------------------------------------------------------------------===//
6 |
7 | #ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_
8 | #define TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_
9 |
10 | #include "mlir/Transforms/DialectConversion.h"
11 |
12 | namespace mlir {
13 |
// Type converter used while lowering into the TritonCPU dialect.
class TritonCPUTypeConverter : public TypeConverter {
public:
  TritonCPUTypeConverter(MLIRContext *context);

private:
  MLIRContext *context;
};

// ConversionTarget describing which ops are legal once conversion to
// TritonCPU is complete.
class TritonCPUConversionTarget : public ConversionTarget {

public:
  explicit TritonCPUConversionTarget(MLIRContext &ctx,
                                     TritonCPUTypeConverter &typeConverter);
};
28 |
29 | } // namespace mlir
30 |
31 | #endif // TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_
32 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
# TritonGPU dialect: IR definitions and transformation passes.
add_subdirectory(IR)
add_subdirectory(Transforms)
3 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/Attributes.h:
--------------------------------------------------------------------------------
#ifndef TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_
#define TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_

#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"

// Pull in the tablegen-generated attribute class declarations.
#define GET_ATTRDEF_CLASSES
#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc"

#endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_
10 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

# Tablegen rules for the TritonGPU dialect, its ops, and its types.
set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu)
mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu)
add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
add_public_tablegen_target(TritonGPUTableGen)

# Attribute interfaces, attribute definitions, and enums.
set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td)
mlir_tablegen(TritonGPUAttrInterfaces.h.inc -gen-attr-interface-decls)
mlir_tablegen(TritonGPUAttrInterfaces.cpp.inc -gen-attr-interface-defs)
mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(TritonGPUAttrDefsIncGen)
22 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h:
--------------------------------------------------------------------------------
#ifndef TRITON_GPU_DIALECT_INTERFACES_H
#define TRITON_GPU_DIALECT_INTERFACES_H

// Pull in the tablegen-generated attribute interface declarations.
#include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.h.inc"

#endif // TRITON_GPU_DIALECT_INTERFACES_H
7 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITONGPU_TYPES
2 | #define TRITONGPU_TYPES
3 |
4 | include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
5 | include "mlir/IR/AttrTypeBase.td"
6 |
// Base class for TritonGPU type definitions; binds the tablegen name and
// assembly mnemonic to the triton_gpu dialect.  (Template parameters
// restored after angle-bracketed text was stripped; the three-argument
// shape matches the surviving use TTG_TypeDef<"AsyncToken", "async.token", []>.)
class TTG_TypeDef<string name, string _mnemonic, list<Trait> traits = []>
    : TypeDef<TritonGPU_Dialect, name, traits> {
  let mnemonic = _mnemonic;
}
11 |
// Token type carrying a single int32 discriminator; printed/parsed with a
// custom assembly format.
def TTG_TokenType : TTG_TypeDef<"Token", "token"> {
  let parameters = (ins "int32_t":$type);

  let builders = [
    TypeBuilder<(ins "unsigned":$type), [{
      return $_get($_ctxt, type);
    }]>
  ];

  let hasCustomAssemblyFormat = 1;

  let skipDefaultBuilders = 1;
}

def TTG_AsyncToken : TTG_TypeDef<"AsyncToken",
                                 "async.token", []> {
  let summary = "async token type";
  let description = [{
    `ttg.async.token` is a type returned by an asynchronous operation.
    It is used to establish an SSA-based link between async operations
    and operations that group or synchronize the async operations.
  }];
}
35 |
36 | #endif
37 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/Types.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITONGPU_IR_TYPES_H_
2 | #define TRITONGPU_IR_TYPES_H_
3 |
4 | #include "mlir/IR/TypeSupport.h"
5 | #include "mlir/IR/Types.h"
6 |
7 | #define GET_TYPEDEF_CLASSES
8 | #include "triton/Dialect/TritonGPU/IR/Types.h.inc"
9 |
10 | #endif // TRITONGPU_IR_TYPES_H_
11 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU)
3 | add_public_tablegen_target(TritonGPUTransformsIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/Passes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_
2 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_
3 |
4 | #include "mlir/Pass/Pass.h"
5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
6 |
7 | namespace mlir {
8 | namespace triton {
9 | namespace gpu {
10 |
11 | std::unique_ptr<Pass> createPipelinePass(int numStages = 3, int numWarps = 4,
12 |                                          int numCTAs = 1,
13 |                                          int computeCapability = 80);
14 |
15 | std::unique_ptr<Pass> createAccelerateMatmulPass(int computeCapability = 80);
16 |
17 | std::unique_ptr<Pass> createF32DotTCPass();
18 |
19 | std::unique_ptr<Pass> createPrefetchPass();
20 |
21 | std::unique_ptr<Pass> createCoalescePass();
22 |
23 | std::unique_ptr<Pass> createReorderInstructionsPass();
24 |
25 | std::unique_ptr<Pass> createReduceDataDuplicationPass();
26 |
27 | std::unique_ptr<Pass> createRemoveLayoutConversionsPass();
28 |
29 | std::unique_ptr<Pass> createVerifier();
30 |
31 | std::unique_ptr<Pass> createOptimizeDotOperandsPass();
32 |
33 | std::unique_ptr<Pass> createOptimizeThreadLocalityPass();
34 |
35 | } // namespace gpu
36 | } // namespace triton
37 |
38 | /// Generate the code for registering passes.
39 | #define GEN_PASS_REGISTRATION
40 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
41 |
42 | } // namespace mlir
43 | #endif
44 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // Defines utilities to use while converting to the TritonGPU dialect.
4 | //
5 | //===----------------------------------------------------------------------===//
6 |
7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_
8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_
9 |
10 | #include "mlir/Transforms/DialectConversion.h"
11 |
12 | namespace mlir {
13 |
14 | class TritonGPUTypeConverter : public TypeConverter {
15 | public:
16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp,
17 | int numCTAs);
18 | int getNumWarps() const { return numWarps; }
19 | int getThreadsPerWarp() const { return threadsPerWarp; }
20 | int getNumCTAs() const { return numCTAs; }
21 |
22 | private:
23 | MLIRContext *context;
24 | int numWarps;
25 | int threadsPerWarp;
26 | int numCTAs;
27 | };
28 |
29 | class TritonGPUConversionTarget : public ConversionTarget {
30 |
31 | public:
32 | explicit TritonGPUConversionTarget(MLIRContext &ctx,
33 | TritonGPUTypeConverter &typeConverter);
34 | };
35 |
36 | } // namespace mlir
37 |
38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_
39 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
2 |
3 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td)
4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_nvidia_gpu)
5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_nvidia_gpu)
6 | mlir_tablegen(Ops.h.inc -gen-op-decls)
7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_nvidia_gpu)
9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_nvidia_gpu)
10 | add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc)
11 | add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc)
12 | add_public_tablegen_target(TritonNvidiaGPUTableGen)
13 |
14 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td)
15 | mlir_tablegen(TritonNvidiaGPUAttrDefs.h.inc -gen-attrdef-decls)
16 | mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs)
17 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
18 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
19 | add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen)
20 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining
4 | // a copy of this software and associated documentation files
5 | // (the "Software"), to deal in the Software without restriction,
6 | // including without limitation the rights to use, copy, modify, merge,
7 | // publish, distribute, sublicense, and/or sell copies of the Software,
8 | // and to permit persons to whom the Software is furnished to do so,
9 | // subject to the following conditions:
10 | //
11 | // The above copyright notice and this permission notice shall be
12 | // included in all copies or substantial portions of the Software.
13 | //
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
22 | #ifndef TRITONNVIDIAGPU_ATTRDEFS
23 | #define TRITONNVIDIAGPU_ATTRDEFS
24 |
25 | include "mlir/IR/AttrTypeBase.td"
26 | include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td"
27 | include "triton/Dialect/Triton/IR/TritonInterfaces.td"
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/IR/Types.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved.
3 | *
4 | * Permission is hereby granted, free of charge, to any person obtaining
5 | * a copy of this software and associated documentation files
6 | * (the "Software"), to deal in the Software without restriction,
7 | * including without limitation the rights to use, copy, modify, merge,
8 | * publish, distribute, sublicense, and/or sell copies of the Software,
9 | * and to permit persons to whom the Software is furnished to do so,
10 | * subject to the following conditions:
11 | *
12 | * The above copyright notice and this permission notice shall be
13 | * included in all copies or substantial portions of the Software.
14 | *
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | */
23 |
24 | #ifndef TRITONNVIDIAGPU_IR_TYPES_H_
25 | #define TRITONNVIDIAGPU_IR_TYPES_H_
26 |
27 | #include "mlir/IR/TypeSupport.h"
28 | #include "mlir/IR/Types.h"
29 |
30 | #define GET_TYPEDEF_CLASSES
31 | #include "triton/Dialect/TritonNvidiaGPU/IR/Types.h.inc"
32 |
33 | #endif // TRITONNVIDIAGPU_IR_TYPES_H_
34 |
--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonNvidiaGPU)
3 | add_public_tablegen_target(TritonNvidiaGPUTransformsIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Target/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(LLVMIR)
2 |
--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name LLVMIR)
3 | add_public_tablegen_target(LLVMIRIncGen)
4 |
--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/Passes.h:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_TARGET_LLVM_IR_PASSES_H
2 | #define TRITON_TARGET_LLVM_IR_PASSES_H
3 |
4 | #include "mlir/Pass/Pass.h"
5 |
6 | namespace mlir {
7 |
8 | /// Create a pass to add DIScope
9 | std::unique_ptr<Pass> createLLVMDIScopePass();
10 |
11 | /// Generate the code for registering conversion passes.
12 | #define GEN_PASS_REGISTRATION
13 | #include "triton/Target/LLVMIR/Passes.h.inc"
14 |
15 | } // namespace mlir
16 |
17 | #endif // TRITON_TARGET_LLVM_IR_PASSES_H
18 |
--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/Passes.td:
--------------------------------------------------------------------------------
1 | #ifndef TRITON_TARGET_LLVMIR_PASSES
2 | #define TRITON_TARGET_LLVMIR_PASSES
3 |
4 | include "mlir/Pass/PassBase.td"
5 |
6 | def LLVMDIScope: Pass<"enable-line-info", "mlir::ModuleOp"> {
7 | let summary = "Materialize LLVM line info";
8 | let description = [{
9 | This pass materializes line mapping information for LLVM IR dialect operations.
10 | }];
11 |
12 | let constructor = "mlir::createLLVMDIScopePass()";
13 | }
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/include/triton/Tools/Sys/GetPlatform.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
3 | *
4 | * This file is part of ISAAC.
5 | *
6 | * ISAAC is free software; you can redistribute it and/or
7 | * modify it under the terms of the GNU Lesser General Public
8 | * License as published by the Free Software Foundation; either
9 | * version 2.1 of the License, or (at your option) any later version.
10 | *
11 | * This library is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 | * Lesser General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU Lesser General Public
17 | * License along with this library; if not, write to the Free Software
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 | * MA 02110-1301 USA
20 | */
21 |
22 | #ifndef TDL_TOOLS_SYS_GETPLATFORM_HPP
23 | #define TDL_TOOLS_SYS_GETPLATFORM_HPP
24 |
25 | #include
26 | #include
27 | #include
28 | #include