├── .clang-format ├── .editorconfig ├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── CODEOWNERS └── workflows │ ├── documentation.yml │ ├── integration-tests.yml │ ├── llvm-build.yml │ ├── llvm-build │ └── Dockerfile │ ├── test-backends.yml │ ├── torch-inductor-tests.yml │ ├── torch-inductor │ └── scripts │ │ ├── check_acc.py │ │ ├── check_perf.py │ │ ├── common.sh │ │ ├── install_torchinductor.sh │ │ ├── install_triton.sh │ │ ├── run_torchinductor_acc.sh │ │ └── run_torchinductor_perf.sh │ └── wheels.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bin ├── CMakeLists.txt ├── RegisterTritonDialects.h ├── triton-llvm-opt.cpp ├── triton-lsp.cpp ├── triton-opt.cpp └── triton-reduce.cpp ├── cmake ├── FindLLVM.cmake ├── llvm-hash.txt ├── nvidia-toolchain-version.txt └── pybind11-version.txt ├── docs ├── Makefile ├── _templates │ └── versions.html ├── backend │ ├── ldmatrixOperand0.svg │ └── ldmatrixOperand1.svg ├── conf.py ├── getting-started │ ├── installation.rst │ └── tutorials │ │ ├── grouped_vs_row_major_ordering.png │ │ ├── parallel_reduction.png │ │ └── random_bits.png ├── index.rst ├── meetups │ ├── 01-24-2024 │ │ └── notes.md │ ├── 02-20-2024 │ │ ├── Proton.pdf │ │ └── notes.md │ ├── 04-02-2024 │ │ └── notes.md │ ├── 07-18-2023 │ │ └── notes.md │ ├── 08-22-2023 │ │ ├── amd-update.pdf │ │ ├── intel-xpu-update.pptx │ │ └── notes.md │ ├── 10-25-2023 │ │ ├── intel-xpu-update.pdf │ │ ├── notes.md │ │ └── triton-shared.pptx │ ├── 12-13-2023 │ │ └── notes.md │ └── dev-meetup-2023.md ├── programming-guide │ ├── chapter-1 │ │ ├── cuda-parallel-matmul.png │ │ ├── introduction.rst │ │ └── triton-parallel-matmul.png │ └── chapter-2 │ │ ├── halide-iteration.png │ │ ├── polyhedral-iteration.png │ │ └── related-work.rst └── python-api │ ├── triton.language.rst │ ├── triton.rst │ └── triton.testing.rst ├── include ├── CMakeLists.txt └── triton │ ├── Analysis │ ├── Alias.h │ ├── Allocation.h │ ├── AxisInfo.h │ ├── Membar.h │ └── Utility.h │ ├── CMakeLists.txt │ ├── Conversion │ ├── CMakeLists.txt │ ├── MLIRTypes.h │ ├── TritonCPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── PatternTritonCPUOpToLLVM.h │ │ ├── TypeConverter.h │ │ └── Utility.h │ ├── TritonGPUToLLVM │ │ ├── AsmFormat.h │ │ ├── CMakeLists.txt │ │ ├── ElementwiseOpToLLVMBase.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── PatternTritonGPUOpToLLVM.h │ │ ├── Patterns.h │ │ ├── TargetInfoBase.h │ │ ├── TypeConverter.h │ │ └── Utility.h │ ├── TritonToTritonCPU │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TritonToTritonCPUPass.h │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TritonToTritonGPUPass.h │ ├── Dialect │ ├── CMakeLists.txt │ ├── NVGPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── NVGPUAttrDefs.td │ │ │ ├── NVGPUDialect.td │ │ │ └── NVGPUOps.td │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── Interfaces.h │ │ │ ├── Traits.h │ │ │ ├── TritonAttrDefs.td │ │ │ ├── TritonDialect.td │ │ │ ├── TritonInterfaces.td │ │ │ ├── TritonOps.td │ │ │ ├── TritonTypeInterfaces.td │ │ │ ├── TritonTypes.td │ │ │ ├── Types.h │ │ │ └── Utility.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── TritonCPUAttrDefs.td │ │ │ ├── 
TritonCPUDialect.td │ │ │ ├── TritonCPUInterfaces.h │ │ │ ├── TritonCPUOps.td │ │ │ ├── TritonCPUTypes.td │ │ │ └── Types.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── TritonCPUConversion.h │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── TritonGPUAttrDefs.td │ │ │ ├── TritonGPUDialect.td │ │ │ ├── TritonGPUInterfaces.h │ │ │ ├── TritonGPUOps.td │ │ │ ├── TritonGPUTypes.td │ │ │ └── Types.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TritonGPUConversion.h │ │ │ └── Utility.h │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.h │ │ ├── TritonNvidiaGPUAttrDefs.td │ │ ├── TritonNvidiaGPUDialect.td │ │ ├── TritonNvidiaGPUOps.td │ │ ├── TritonNvidiaGPUTypes.td │ │ └── Types.h │ │ └── Transforms │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ └── Passes.td │ ├── Target │ ├── CMakeLists.txt │ └── LLVMIR │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ └── Passes.td │ └── Tools │ └── Sys │ ├── GetEnv.hpp │ └── GetPlatform.hpp ├── lib ├── Analysis │ ├── Alias.cpp │ ├── Allocation.cpp │ ├── AxisInfo.cpp │ ├── CMakeLists.txt │ ├── Membar.cpp │ └── Utility.cpp ├── CMakeLists.txt ├── Conversion │ ├── CMakeLists.txt │ ├── TritonCPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── ControlFlowOpToLLVM.cpp │ │ ├── FuncOpToLLVM.cpp │ │ ├── TritonCPUToLLVM.cpp │ │ └── TypeConverter.cpp │ ├── TritonGPUToLLVM │ │ ├── AllocateSharedMemory.cpp │ │ ├── AssertOpToLLVM.cpp │ │ ├── CMakeLists.txt │ │ ├── ControlFlowOpToLLVM.cpp │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ ├── ConvertLayoutOpToLLVM │ │ │ └── SharedToDotOperandFMA.cpp │ │ ├── DecomposeUnsupportedConversions.cpp │ │ ├── DotOpToLLVM │ │ │ └── FMA.cpp │ │ ├── ElementwiseOpToLLVM.cpp │ │ ├── FuncOpToLLVM.cpp │ │ ├── HistogramOpToLLVM.cpp │ │ ├── MakeRangeOpToLLVM.cpp │ │ ├── MemoryOpToLLVM.cpp │ │ ├── PrintOpToLLVM.cpp │ │ ├── ReduceOpToLLVM.cpp │ │ ├── ReduceScanCommon.h │ │ ├── SPMDOpToLLVM.cpp │ │ ├── ScanOpToLLVM.cpp │ │ ├── TypeConverter.cpp │ │ ├── Utility.cpp │ │ └── ViewOpToLLVM.cpp │ ├── TritonToTritonCPU │ │ ├── CMakeLists.txt │ │ ├── TritonCPUConversion.cpp │ │ └── TritonToTritonCPUPass.cpp │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── TritonGPUConversion.cpp │ │ └── TritonToTritonGPUPass.cpp ├── Dialect │ ├── CMakeLists.txt │ ├── NVGPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ └── Dialect.cpp │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ ├── Ops.cpp │ │ │ ├── Traits.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Combine.cpp │ │ │ ├── Combine.td │ │ │ ├── ReorderBroadcast.cpp │ │ │ └── RewriteTensorPointer.cpp │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ └── CMakeLists.txt │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── AccelerateMatmul.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── Coalesce.cpp │ │ │ ├── F32DotTC.cpp │ │ │ ├── OptimizeDotOperands.cpp │ │ │ ├── OptimizeThreadLocality.cpp │ │ │ ├── Pipeliner │ │ │ ├── MatmulLoopPipeline.cpp │ │ │ ├── OuterLoopPipeline.cpp │ │ │ ├── PipelineExpander.cpp │ │ │ ├── PipelineExpander.h │ │ │ ├── PipeliningUtility.cpp │ │ │ ├── PipeliningUtility.h │ │ │ ├── Schedule.h │ │ │ └── SoftwarePipeliner.cpp │ │ │ ├── 
Prefetch.cpp │ │ │ ├── ReduceDataDuplication.cpp │ │ │ ├── RemoveLayoutConversions.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ └── Utility.cpp │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.cpp │ │ ├── Ops.cpp │ │ └── Types.cpp │ │ └── Transforms │ │ ├── CMakeLists.txt │ │ ├── FenceInsertion.cpp │ │ └── PlanCTA.cpp └── Target │ ├── CMakeLists.txt │ └── LLVMIR │ ├── CMakeLists.txt │ ├── LLVMDIScope.cpp │ ├── LLVMIRBreakPhiStruct.cpp │ └── LLVMPasses.h ├── pyproject.toml ├── python ├── MANIFEST.in ├── examples │ ├── copy_strided.py │ └── empty.py ├── pyproject.toml ├── setup.py ├── src │ ├── interpreter.cc │ ├── ir.cc │ ├── llvm.cc │ ├── main.cc │ ├── passes.cc │ └── passes.h ├── test │ ├── backend │ │ ├── extension_backend.c │ │ ├── test_device_backend.py │ │ └── third_party_backends │ │ │ ├── conftest.py │ │ │ └── test_xpu_backend.py │ ├── kernel_comparison │ │ └── kernels.yml │ ├── regression │ │ ├── test_cast_matmul.py │ │ ├── test_functional_regressions.py │ │ └── test_performance.py │ └── unit │ │ ├── conftest.py │ │ ├── hopper │ │ ├── __init__.py │ │ ├── test_flashattention.py │ │ ├── test_gemm.py │ │ ├── test_gemm_fusion.py │ │ ├── test_mixed_io.py │ │ ├── test_persistent_warp_specialized_fused-attention.py │ │ ├── test_persistent_warp_specialized_gemm.py │ │ └── test_tma_store_gemm.py │ │ ├── language │ │ ├── assert_helper.py │ │ ├── conftest.py │ │ ├── print_helper.py │ │ ├── test_annotations.py │ │ ├── test_block_pointer.py │ │ ├── test_compile_errors.py │ │ ├── test_conversions.py │ │ ├── test_core.py │ │ ├── test_decorator.py │ │ ├── test_line_info.py │ │ ├── test_random.py │ │ ├── test_reproducer.py │ │ ├── test_standard.py │ │ └── test_subprocess.py │ │ ├── operators │ │ ├── conftest.py │ │ ├── test_blocksparse.py │ │ ├── test_cross_entropy.py │ │ ├── test_flash_attention.py │ │ ├── test_inductor.py │ │ └── test_matmul.py │ │ ├── runtime │ │ ├── test_autotuner.py │ │ ├── test_bindings.py │ │ ├── test_cache.py │ │ ├── test_driver.py │ │ ├── test_jit.py │ │ ├── test_launch.py │ │ └── test_subproc.py │ │ └── tools │ │ └── test_aot.py ├── triton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── backends │ │ ├── __init__.py │ │ ├── compiler.py │ │ └── driver.py │ ├── compiler │ │ ├── __init__.py │ │ ├── code_generator.py │ │ ├── compiler.py │ │ ├── errors.py │ │ └── make_launcher.py │ ├── errors.py │ ├── language │ │ ├── __init__.py │ │ ├── core.py │ │ ├── extra │ │ │ ├── __init__.py │ │ │ └── cuda │ │ │ │ ├── __init__.py │ │ │ │ ├── libdevice.py │ │ │ │ └── utils.py │ │ ├── math.py │ │ ├── random.py │ │ ├── semantic.py │ │ └── standard.py │ ├── ops │ │ ├── __init__.py │ │ ├── blocksparse │ │ │ ├── __init__.py │ │ │ ├── matmul.py │ │ │ └── softmax.py │ │ ├── cross_entropy.py │ │ ├── flash_attention.py │ │ ├── matmul.py │ │ └── matmul_perf_model.py │ ├── runtime │ │ ├── __init__.py │ │ ├── autotuner.py │ │ ├── build.py │ │ ├── cache.py │ │ ├── driver.py │ │ ├── errors.py │ │ ├── interpreter.py │ │ └── jit.py │ ├── testing.py │ └── tools │ │ ├── __init__.py │ │ ├── build_extern.py │ │ ├── compile.c │ │ ├── compile.h │ │ ├── compile.py │ │ ├── disasm.py │ │ └── link.py └── tutorials │ ├── 01-vector-add.py │ ├── 02-fused-softmax.py │ ├── 03-matrix-multiplication.py │ ├── 04-low-memory-dropout.py │ ├── 05-layer-norm.py │ ├── 06-fused-attention.py │ ├── 07-extern-functions.py │ ├── 08-grouped-gemm.py │ └── README.rst ├── test ├── Analysis │ ├── test-alias.mlir │ ├── test-alignment.mlir │ ├── test-allocation.mlir │ └── test-membar.mlir ├── 
CMakeLists.txt ├── Conversion │ ├── amd │ │ ├── decompose-unsupported-conversions.mlir │ │ ├── fp_to_fp.mlir │ │ ├── load_store.mlir │ │ └── tritongpu_wmma_dot_to_llvm.mlir │ ├── dedup-by-constancy.mlir │ ├── divide-by-0.mlir │ ├── triton_to_tritongpu.mlir │ ├── tritongpu_to_llvm.mlir │ ├── tritongpu_to_llvm_hopper.mlir │ └── tritongpu_to_llvm_volta.mlir ├── LLVMIR │ └── break-phi-struct.ll ├── NVGPU │ ├── test_cga.mlir │ └── test_wgmma.mlir ├── Triton │ ├── canonicalize.mlir │ ├── combine.mlir │ ├── invalid.mlir │ ├── ops.mlir │ ├── reorder-broadcast.mlir │ ├── reproducer.mlir │ ├── rewrite-tensor-pointer.mlir │ ├── vecadd.mlir │ └── verify-make-range.mlir ├── TritonGPU │ ├── accelerate-matmul.mlir │ ├── amd │ │ ├── accelerate-amd-matmul-wmma.mlir │ │ └── amd-reorder-instructions.mlir │ ├── atomic-cas.mlir │ ├── canonicalize.mlir │ ├── coalesce.mlir │ ├── combine.mlir │ ├── dot-operands.mlir │ ├── fence-inserstion.mlir │ ├── invalid.mlir │ ├── loop-pipeline-hopper.mlir │ ├── loop-pipeline.mlir │ ├── matmul.mlir │ ├── ops.mlir │ ├── optimize-locality.mlir │ ├── optimize_epilogue.mlir │ ├── pipeline-hopper-remove-wait.mlir │ ├── prefetch.mlir │ ├── reduce-data-duplication.mlir │ ├── reorder-instructions.mlir │ ├── tritongpu_ops.mlir │ └── verify-blocked-layout.mlir ├── lib │ ├── Analysis │ │ ├── CMakeLists.txt │ │ ├── TestAlias.cpp │ │ ├── TestAllocation.cpp │ │ ├── TestAxisInfo.cpp │ │ └── TestMembar.cpp │ └── CMakeLists.txt ├── lit.cfg.py └── lit.site.cfg.py.in ├── third_party ├── amd │ ├── CMakeLists.txt │ ├── backend │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ ├── include │ │ │ └── hip │ │ │ │ ├── amd_detail │ │ │ │ ├── amd_channel_descriptor.h │ │ │ │ ├── amd_device_functions.h │ │ │ │ ├── amd_hip_atomic.h │ │ │ │ ├── amd_hip_bf16.h │ │ │ │ ├── amd_hip_bfloat16.h │ │ │ │ ├── amd_hip_common.h │ │ │ │ ├── amd_hip_complex.h │ │ │ │ ├── amd_hip_cooperative_groups.h │ │ │ │ ├── amd_hip_fp16.h │ │ │ │ ├── amd_hip_gl_interop.h │ │ │ │ ├── amd_hip_math_constants.h │ │ │ │ ├── amd_hip_runtime.h │ │ │ │ ├── amd_hip_runtime_pt_api.h │ │ │ │ ├── amd_hip_unsafe_atomics.h │ │ │ │ ├── amd_hip_vector_types.h │ │ │ │ ├── amd_math_functions.h │ │ │ │ ├── amd_surface_functions.h │ │ │ │ ├── amd_warp_functions.h │ │ │ │ ├── concepts.hpp │ │ │ │ ├── device_library_decls.h │ │ │ │ ├── functional_grid_launch.hpp │ │ │ │ ├── grid_launch.h │ │ │ │ ├── grid_launch.hpp │ │ │ │ ├── grid_launch_GGL.hpp │ │ │ │ ├── helpers.hpp │ │ │ │ ├── hip_cooperative_groups_helper.h │ │ │ │ ├── hip_fp16_gcc.h │ │ │ │ ├── hip_fp16_math_fwd.h │ │ │ │ ├── hip_ldg.h │ │ │ │ ├── hip_prof_str.h │ │ │ │ ├── hip_runtime_prof.h │ │ │ │ ├── host_defines.h │ │ │ │ ├── hsa_helpers.hpp │ │ │ │ ├── macro_based_grid_launch.hpp │ │ │ │ ├── math_fwd.h │ │ │ │ ├── ockl_image.h │ │ │ │ ├── program_state.hpp │ │ │ │ ├── texture_fetch_functions.h │ │ │ │ └── texture_indirect_functions.h │ │ │ │ ├── channel_descriptor.h │ │ │ │ ├── device_functions.h │ │ │ │ ├── driver_types.h │ │ │ │ ├── hip_bf16.h │ │ │ │ ├── hip_bfloat16.h │ │ │ │ ├── hip_common.h │ │ │ │ ├── hip_complex.h │ │ │ │ ├── hip_cooperative_groups.h │ │ │ │ ├── hip_deprecated.h │ │ │ │ ├── hip_ext.h │ │ │ │ ├── hip_fp16.h │ │ │ │ ├── hip_gl_interop.h │ │ │ │ ├── hip_hcc.h │ │ │ │ ├── hip_math_constants.h │ │ │ │ ├── hip_profile.h │ │ │ │ ├── hip_runtime.h │ │ │ │ ├── hip_runtime_api.h │ │ │ │ ├── hip_texture_types.h │ │ │ │ ├── hip_vector_types.h │ │ │ │ ├── hip_version.h │ │ │ │ ├── hiprtc.h │ │ │ │ ├── library_types.h │ │ │ │ ├── math_functions.h │ │ │ │ ├── 
surface_types.h │ │ │ │ └── texture_types.h │ │ └── lib │ │ │ ├── cuda2gcn.bc │ │ │ ├── ockl.bc │ │ │ ├── ocml.bc │ │ │ └── opencl.bc │ ├── include │ │ ├── CMakeLists.txt │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── GCNAsmFormat.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ └── TritonAMDGPUTransforms │ │ │ ├── CMakeLists.txt │ │ │ ├── MfmaGroup.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── TritonGPUConversion.h │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM │ │ │ │ ├── SharedToDotOperandHelper.cpp │ │ │ │ ├── SharedToDotOperandHelper.h │ │ │ │ ├── SharedToDotOperandMFMA.cpp │ │ │ │ └── SharedToDotOperandWMMA.cpp │ │ │ ├── DecomposeUnsupportedConversions.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── DotOpToLLVM │ │ │ │ ├── MFMA.cpp │ │ │ │ └── WMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── GCNAsmFormat.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ │ └── TritonAMDGPUTransforms │ │ │ ├── AccelerateAMDMatmul.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── MfmaGroup.cpp │ │ │ ├── OptimizeEpilogue.cpp │ │ │ ├── RemoveLayoutConversions.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ └── StreamPipeline.cpp │ └── python │ │ └── triton_amd.cc ├── cpu │ ├── CMakeLists.txt │ ├── backend │ │ ├── compiler.py │ │ └── driver.py │ └── triton_cpu.cc ├── nvidia │ ├── CMakeLists.txt │ ├── backend │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ ├── include │ │ │ └── cuda.h │ │ └── lib │ │ │ └── libdevice.10.bc │ ├── include │ │ ├── CMakeLists.txt │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── NVGPUToLLVMPass.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ └── TritonNVIDIAGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── PTXAsmFormat.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ └── NVGPUToLLVMPass.cpp │ │ └── TritonNVIDIAGPUToLLVM │ │ │ ├── BarrierOpToLLVM.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── ClusterOpsToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM │ │ │ ├── SharedToDotOperandMMAv1.cpp │ │ │ └── SharedToDotOperandMMAv2.cpp │ │ │ ├── DecomposeUnsupportedConversions.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── DotOpToLLVM │ │ │ ├── MMAv1.cpp │ │ │ ├── MMAv2.cpp │ │ │ └── WGMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ │ │ ├── PTXAsmFormat.cpp │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TensorPtrOpsToLLVM.cpp │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ └── triton_nvidia.cc └── proton │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── csrc │ ├── Proton.cpp │ ├── include │ │ ├── Context │ │ │ ├── Context.h │ │ │ ├── Python.h │ │ │ └── Shadow.h │ │ ├── Data │ │ │ ├── Data.h │ │ │ ├── Metric.h │ │ │ ├── TraceData.h │ │ │ └── TreeData.h │ │ ├── Driver │ │ │ ├── Dispatch.h │ │ │ └── GPU │ │ │ │ ├── Cuda.h │ │ │ │ └── Cupti.h │ │ ├── Profiler │ │ │ ├── CuptiProfiler.h │ │ │ └── Profiler.h │ │ ├── Proton.h │ │ ├── Session │ │ │ └── Session.h │ │ └── Utility │ │ │ ├── Errors.h │ │ │ ├── Singleton.h │ │ │ ├── String.h │ │ │ └── Traits.h │ └── lib │ │ ├── Context │ │ ├── Context.cpp │ │ ├── Python.cpp │ │ └── Shadow.cpp │ │ ├── Data │ │ ├── Data.cpp │ │ ├── 
TraceData.cpp │ │ └── TreeData.cpp │ │ ├── Driver │ │ └── GPU │ │ │ ├── Cuda.cpp │ │ │ └── Cupti.cpp │ │ ├── Profiler │ │ └── CuptiProfiler.cpp │ │ └── Session │ │ └── Session.cpp │ ├── proton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── flags.py │ ├── hook.py │ ├── profile.py │ ├── scope.py │ └── viewer.py │ ├── test │ ├── test_api.py │ ├── test_lib.py │ ├── test_profile.py │ └── test_viewer.py │ └── tutorials │ ├── dynamic_net.py │ └── matmul.py ├── unittest ├── Analysis │ ├── CMakeLists.txt │ └── UtilityTest.cpp ├── CMakeLists.txt ├── Conversion │ ├── CMakeLists.txt │ └── TritonGPUToLLVM │ │ ├── CMakeLists.txt │ │ ├── DumpLayout.cpp │ │ ├── DumpLayout.h │ │ ├── EmitIndicesTest.cpp │ │ └── PTXAsmFormatTest.cpp ├── Dialect │ ├── CMakeLists.txt │ └── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── DialectTest.cpp │ │ └── SwizzleTest.cpp └── googletest.cmake └── utils └── nightly.pypirc /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_style = space 9 | indent_size = 4 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | 13 | [*.py] 14 | indent_size = 4 15 | src_paths=python 16 | 17 | [*.{yaml,yml}] 18 | indent_size = 2 19 | 20 | [*.md] 21 | indent_size = 2 22 | x-soft-wrap-text = true 23 | 24 | [*.rst] 25 | indent_size = 4 26 | x-soft-wrap-text = true 27 | 28 | [{CMakeLists.txt,*.cmake}] 29 | indent_size = 2 30 | 31 | [Makefile] 32 | indent_style = tab 33 | 34 | [*.{c,cc,cpp,h,hpp,cu,cuh}] 35 | indent_size = 2 36 | 37 | [*.mlir] 38 | indent_size = 2 39 | 40 | [*.td] 41 | indent_size = 4 42 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # W503 (linebreak occurred before binary operator) seems to be enabled by 3 | # default, even though it goes against pep8 and is incompatible with W504 4 | # (linebreak occurred *after* binary operator). Disable it. 5 | ignore = E501,E701,E731,W503 6 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Commits listed here are ignored by `git blame`. Add "big and uninteresting 2 | # changes" here. Don't forget that it has to be a separate commit (and, because 3 | # our automation squashes PRs, a separate PR)! 4 | # 5 | # Run the following command to teach your `git blame` to pick up this file.
6 | # 7 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs 8 | 9 | 841a77d1b5961b43e1b64e5265bdfe52c133574d 10 | cb68a0d9d501657258ed9f7ad7610d0784c9be9a 11 | 03184de8b535bb24fb1f49cc1f5e008bcbaa73ef 12 | bc4a8e66da036fafc01b87ee9e210df7ee8fb738 13 | 846d6e7e77891706d179b20f27b1278ac3b9a9ac 14 | 0327b9d32db6d1d63d207ccab722bd45e00a6678 15 | df08301e76a56d9ab3f36ff00ab7133672baa8d3 16 | f88b01f558df06f010a869e01473253a5f5cd8db 17 | 312cf97e147e962562877026fd82c928cf6eaa30 18 | 53d868113a706988394134ca1f7f85cb3016cc81 19 | 539fbe5049570f29e73dc6843f984cd4913c5505 20 | 053af4e9f8f005e1bc3f8ac9bf285eaf0ac9bf72 21 | 5b36cb48ad9ce566dd24ff7183f207a1cb9358b5 22 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @ptillet will be requested for review when someone 4 | # opens a pull request. 5 | * @ptillet 6 | 7 | # -------- 8 | # Analyses 9 | # -------- 10 | # Alias analysis 11 | include/triton/Analysis/Alias.h @Jokeren 12 | lib/Analysis/Alias.cpp @Jokeren 13 | # Allocation analysis 14 | include/triton/Analysis/Allocation.h @Jokeren 15 | lib/Analysis/Allocation.cpp @Jokeren 16 | # Membar analysis 17 | include/triton/Analysis/Membar.h @Jokeren 18 | lib/Analysis/Membar.cpp @Jokeren 19 | # AxisInfo analysis 20 | include/triton/Analysis/AxisInfo.h @ptillet 21 | lib/Analysis/AxisInfo.cpp @ptillet 22 | # Utilities 23 | include/triton/Analysis/Utility.h @Jokeren 24 | lib/Analysis/Utility.cpp @Jokeren 25 | 26 | # ---------- 27 | # Dialects 28 | # ---------- 29 | # Pipeline pass 30 | lib/Dialect/TritonGPU/Transforms/Pipeline.cpp @ptillet 31 | # Prefetch pass 32 | lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @ptillet 33 | # Coalesce pass 34 | lib/Dialect/TritonGPU/Transforms/Coalesce.cpp @ptillet 35 | # Layout simplification pass 36 | lib/Dialect/TritonGPU/Transforms/Combine.cpp @ptillet 37 | 38 | # ----------- 39 | # Conversions 40 | # ----------- 41 | # TritonToTritonGPU 42 | include/triton/Conversion/TritonToTritonGPU/ @ptillet 43 | lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp @ptillet 44 | 45 | # ----------- 46 | # third_party 47 | # ----------- 48 | third_party/amd/ @antiagainst @zhanglx13 49 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor-tests.yml: -------------------------------------------------------------------------------- 1 | name: Torchinductor 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Wheels"] 6 | types: [completed] 7 | 8 | permissions: read-all 9 | 10 | jobs: 11 | Runner-Preparation: 12 | runs-on: ubuntu-latest 13 | outputs: 14 | matrix: ${{ steps.set-matrix.outputs.matrix }} 15 | steps: 16 | - name: Prepare runner matrix 17 | id: set-matrix 18 | run: | 19 | echo 'matrix=[["self-hosted", "A100"]]' >> "$GITHUB_OUTPUT" 20 | 21 | Integration-Tests: 22 | needs: Runner-Preparation 23 | timeout-minutes: 240 # 4 hours 24 | runs-on: ${{ matrix.runner }} 25 | strategy: 26 | matrix: 27 | runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix)}} 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | - name: Packages 32 | run: | 33 | ./.github/workflows/torch-inductor/scripts/install_torchinductor.sh torchbench 34 | - name: Environment 35 | run: | 36 | source /opt/torchinductor_venv/bin/activate 37 |
./.github/workflows/torch-inductor/scripts/install_triton.sh 38 | - name: Performance 39 | run: | 40 | ./.github/workflows/torch-inductor/scripts/run_torchinductor_perf.sh torchbench 41 | # Takes too long to run 42 | #- name: Accuracy 43 | # run: | 44 | # ./.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh 45 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/check_acc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | file_path = sys.argv[1] 5 | with open(file_path) as f: 6 | reader = csv.reader(f) 7 | # each row: device, model name, batch size, accuracy status 8 | for i, row in enumerate(reader): 9 | if i == 0: 10 | continue  # skip the header row 11 | if row[3] != "pass": 12 | print(f"{row[1]} failed on device {row[0]} with batch size {row[2]}") 13 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TEST_REPORTS_DIR=/opt/torchinductor_reports 4 | PYTORCH_DIR=/opt/pytorch 5 | MODELS=(timm_models huggingface torchbench) 6 | 7 | echo "$TEST_REPORTS_DIR" 8 | echo "$PYTORCH_DIR" 9 | echo "${MODELS[@]}" 10 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/install_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remember where we started 4 | ROOT="$(pwd)" 5 | 6 | # shellcheck source=/dev/null 7 | source /opt/torchinductor_venv/bin/activate 8 | # shellcheck source=/dev/null 9 | source ./.github/workflows/torch-inductor/scripts/common.sh 10 | 11 | # build our own triton: the pytorch-triton nightly is installed first to pull in its dependencies, then replaced by an editable install of this checkout (the nightly wheel is removed right after) 12 | cd python || exit 13 | pip3 install --pre pytorch-triton --extra-index-url https://download.pytorch.org/whl/nightly/cu118 14 | rm -rf build 15 | pip3 install -e .
16 | pip3 uninstall pytorch-triton -y 17 | 18 | # clean up cache 19 | rm -rf /tmp/torchinductor_root/ 20 | rm -rf ~/.triton/cache 21 | rm -rf "$TEST_REPORTS_DIR" 22 | 23 | # go back to where we started 24 | cd "$ROOT" || exit 25 | -------------------------------------------------------------------------------- /.github/workflows/torch-inductor/scripts/run_torchinductor_acc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remember where we started 4 | ROOT="$(pwd)" 5 | INDUCTOR="$ROOT"/.github/workflows/torch-inductor 6 | MODEL_SPEC=$1 7 | 8 | # shellcheck source=/dev/null 9 | source /opt/torchinductor_venv/bin/activate 10 | # shellcheck source=/dev/null 11 | source "$INDUCTOR"/scripts/common.sh 12 | 13 | cd "$PYTORCH_DIR" || exit 14 | TEST_REPORTS_DIR=$TEST_REPORTS_DIR/acc 15 | mkdir -p "$TEST_REPORTS_DIR" 16 | 17 | for model in "${MODELS[@]}"; do 18 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then 19 | continue 20 | fi 21 | echo "Running accuracy test for $model" 22 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --device cuda \ 23 | --output "$TEST_REPORTS_DIR"/inference_"$model".csv 24 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --training --amp --device cuda \ 25 | --output "$TEST_REPORTS_DIR"/training_"$model".csv 26 | python3 benchmarks/dynamo/"$model".py --ci --accuracy --timing --explain --inductor --dynamic-shapes --device cuda \ 27 | --output "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv 28 | done 29 | 30 | cd "$ROOT" || exit 31 | for model in "${MODELS[@]}"; do 32 | if [ "$model" != "$MODEL_SPEC" ] && [ "$MODEL_SPEC" != "all" ]; then 33 | continue 34 | fi 35 | echo "Checking accuracy test for $model" 36 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/inference_"$model".csv 37 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/training_"$model".csv 38 | python3 "$INDUCTOR"/scripts/check_acc.py "$TEST_REPORTS_DIR"/dynamic_shapes_"$model".csv 39 | done 40 | 41 | # go back to where we started 42 | cd "$ROOT" || exit 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Triton builds 2 | build/ 3 | build-*/ 4 | 5 | # Triton Python module builds 6 | python/build/ 7 | python/triton.egg-info/ 8 | python/triton/_C/libtriton.pyd 9 | python/triton/_C/libtriton.so 10 | 11 | # Backends copied from submodules 12 | python/triton/backends/ 13 | !python/triton/backends/__init__.py 14 | !python/triton/backends/compiler.py 15 | !python/triton/backends/driver.py 16 | 17 | # Proton 18 | python/triton/profiler 19 | 20 | # Python caches 21 | __pycache__/ 22 | *.py[cod] 23 | .pytest_cache 24 | 25 | # Environments 26 | .venv 27 | venv/ 28 | venv.bak/ 29 | 30 | # VS Code project files 31 | .vscode 32 | .vs 33 | 34 | # JetBrains project files 35 | .idea 36 | cmake-build-* 37 | 38 | # Third-party binaries 39 | cuobjdump 40 | nvdisasm 41 | ptxas 42 | 43 | # Docs 44 | docs/_build/ 45 | docs/python-api/generated/ 46 | docs/dialects/ 47 | docs/getting-started/tutorials 48 | docs/sg_execution_times.rst 49 | !python/tutorials/*.py 50 | !python/tutorials/*.rst 51 | 52 | # clangd index. 
(".clangd" is a config file now, thus trailing slash) 53 | .clangd/ 54 | .cache 55 | /compile_commands.json 56 | .vscode 57 | .vs 58 | 59 | # Vim 60 | *.swp 61 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-symlinks 6 | - id: destroyed-symlinks 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-toml 11 | - id: check-ast 12 | - id: check-added-large-files 13 | - id: check-merge-conflict 14 | - id: check-executables-have-shebangs 15 | - id: check-shebang-scripts-are-executable 16 | - id: detect-private-key 17 | - id: debug-statements 18 | 19 | - repo: https://github.com/astral-sh/ruff-pre-commit 20 | rev: v0.1.3 21 | hooks: 22 | - id: ruff 23 | files: '^python/.*' 24 | args: ["--fix", "--line-length", "120"] 25 | stages: [commit, push, manual] 26 | exclude: | 27 | (?x)( 28 | ^python/triton/runtime/.*| 29 | ^test/| 30 | ^docs/conf.py$ 31 | ) 32 | 33 | - repo: https://github.com/google/yapf 34 | rev: be72557 35 | hooks: 36 | - id: yapf 37 | args: ["-p", "-i"] 38 | stages: [commit, push, manual] 39 | exclude: "python/test/unit/language/test_line_info.py" 40 | 41 | - repo: https://github.com/pre-commit/mirrors-clang-format 42 | rev: v16.0.6 43 | hooks: 44 | - id: clang-format 45 | stages: [commit, push, manual] 46 | 47 | exclude: | 48 | (?x)( 49 | ^include/triton/external/| 50 | ^third_party/amd/backend/include/hip/| 51 | ^third_party/amd/backend/lib/| 52 | ^third_party/nvidia/backend/include/cuda.h 53 | ) 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018-2020 Philippe Tillet 3 | * Copyright 2020-2022 OpenAI 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files 7 | * (the "Software"), to deal in the Software without restriction, 8 | * including without limitation the rights to use, copy, modify, merge, 9 | * publish, distribute, sublicense, and/or sell copies of the Software, 10 | * and to permit persons to whom the Software is furnished to do so, 11 | * subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | -------------------------------------------------------------------------------- /bin/triton-lsp.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | mlir::MLIRContext context(registry); 10 | return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry)); 11 | } 12 | -------------------------------------------------------------------------------- /bin/triton-opt.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-opt/MlirOptMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | return mlir::asMainReturnCode(mlir::MlirOptMain( 10 | argc, argv, "Triton (GPU) optimizer driver\n", registry)); 11 | } 12 | -------------------------------------------------------------------------------- /bin/triton-reduce.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-reduce/MlirReduceMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | mlir::MLIRContext context(registry); 10 | return mlir::failed(mlir::mlirReduceMain(argc, argv, context)); 11 | } 12 | -------------------------------------------------------------------------------- /cmake/llvm-hash.txt: -------------------------------------------------------------------------------- 1 | ed4e505c219fe6c7464ea5a056e90d8cd94c7332 2 | -------------------------------------------------------------------------------- /cmake/nvidia-toolchain-version.txt: -------------------------------------------------------------------------------- 1 | 12.4.99 2 | -------------------------------------------------------------------------------- /cmake/pybind11-version.txt: -------------------------------------------------------------------------------- 1 | 2.11.1 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Triton 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/versions.html: -------------------------------------------------------------------------------- 1 | {%- if current_version %} 2 |
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
3 | <span class="rst-current-version" data-toggle="rst-current-version">
4 | <span class="fa fa-book"> Other Versions</span>
5 | v: {{ current_version.name }}
6 | <span class="fa fa-caret-down"></span>
7 | </span>
8 | <div class="rst-other-versions">
9 | {%- if versions.tags %}
10 | <dl>
11 | <dt>Tags</dt>
12 | {%- for item in versions.tags %}
13 | <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
14 | {%- endfor %}
15 | </dl>
16 | {%- endif %}
17 | {%- if versions.branches %}
18 | <dl>
19 | <dt>Branches</dt>
20 | {%- for item in versions.branches %}
21 | <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
22 | {%- endfor %}
23 | </dl>
24 | {%- endif %}
25 | </div>
26 | </div>
27 | {%- endif %} 28 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/grouped_vs_row_major_ordering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/parallel_reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/parallel_reduction.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/random_bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/getting-started/tutorials/random_bits.png -------------------------------------------------------------------------------- /docs/meetups/01-24-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. 3rd party refactoring backend update. 5 | 2. AMD update about experience with refactored backend and new process. 6 | 3. Plan to restore the Intel XPU backend as third-party module. 7 | 4. Open discussion. 8 | 9 | ##### Minutes: 10 | Recording link [here](https://youtu.be/uRlqolhNbRk) 11 | 12 | 1. 3rd party refactoring backend update. 13 | - Backends are passes, and IRs are shared across backends to avoid divergence and duplication, so that developers do not have to change the Triton source code. 14 | - To discover backend forks in directories, put environment vars in setup.py (see the sketch at the end of these notes). 15 | - Backends can link whatever library they want; they don’t need to copy-paste Nvidia code. 16 | - Nvidia uses the same API as other backends (refactoring of the C++ code still remains). No special casing for Nvidia code. 17 | - If the Triton dependency is on top of the main branch, then it will work for forks/branches. 18 | - Still remaining: LLVM IR conversion – reusable pattern rewriters update; reduce complexity in statefulness in Triton GPU – inherit from base pattern. 19 | 2. AMD update about experience with refactored backend and new process. 20 | - Skipped due to lack of time. Will be covered in the February meetup. 21 | 3. Plan to restore the Intel XPU backend as third-party module. 22 | - Prereqs to upstream – will take into account the system HW and SW, with perf expected to be ~80% of Nvidia, to allow upstreaming. 23 | - Consider how useful it is for AI research to allow upstreaming – as it impacts the maintenance cost of the backends. 24 | - Don’t have plans to upstream mobile backends. 25 | - Intel will hold an offline discussion with OpenAI about being in-tree.
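A minimal sketch of the env-var backend discovery mentioned in item 1 (an editor's illustration, not part of the original notes: the exact variable name and layout depend on the setup.py in your checkout; `TRITON_PLUGIN_DIRS` is the mechanism used by out-of-tree plugins such as triton-shared, and the path below is hypothetical):

```bash
# Hypothetical out-of-tree backend checkout; setup.py scans the listed
# directories and registers each plugin backend it finds there.
export TRITON_PLUGIN_DIRS=/path/to/my-triton-backend
pip install -e python  # rebuild Triton with the extra backend included
```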
26 | -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/Proton.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/02-20-2024/Proton.pdf -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Intel update 5 | 2. AMD update 6 | 3. Profiler update 7 | 4. We are in the process of transitioning to a pro slack plan, so everybody will be able to see history. Expect this to take a few more weeks. 8 | 5. We are still working on finalizing a document about our technical governance structure. Expect this to take a few more weeks too. 9 | 6. Open discussion. 10 | 11 | ##### Minutes: 12 | Recording link [here](https://youtu.be/JDQCdj18Snc) 13 | 14 | 1. Intel GPU integration with Triton and PyTorch: 15 | - No strong requirement from PyTorch for specific backends to be part of the official Triton release. 16 | - Can use a separate branch/fork for CI/CD and testing. 17 | - The Intel team will work with PyTorch offline to close. 18 | 2. AMD GPU backend update: 19 | - The AMD team shared the refactored design for the AMD backend. 20 | - The new design is modularized and reduces clutter and duplication in upstream Triton. 21 | - Further work needed for regression testing and secure runners. 22 | 3. Proton profiler update: 23 | - Keren from the OpenAI team presented a new profiler tool for Triton kernels, which supports multiple vendors, metrics, and formats. 24 | - Outlined the plan for open-sourcing, integrating, and extending the tool. 25 | -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/amd-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/amd-update.pdf -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/intel-xpu-update.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/08-22-2023/intel-xpu-update.pptx -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/intel-xpu-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/intel-xpu-update.pdf -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. H100 updates 5 | 2. Triton-Shared layer updates 6 | 3. Intel update 7 | 4. Open discussion 8 | 9 | ##### Minutes: 10 | Recording link [here](https://youtu.be/KZAzpKx1ebI) 11 | 12 | 1. H100 updates 13 | - Enabled WGMMA by default, now any matmul can reuse it.
14 | - fp8 formats enabled – 1.3 petaflops on dense matmul on H100 (GEMM performance) 15 | - Enabled Flash Attention using wgmma, resulting in 450 teraflops on the fwd pass and 250 on the backward pass – still working on perf for flash attention 16 | - fp8 numbers: running flash attention in fp8 with matmul is tricky, because the fp8 layout is significantly different from what is returned by wgmma; still WIP 17 | 18 | 2. Triton-Shared layer 19 | - Please refer to slides for more details 20 | - Created a repo where you can find the middle layer 21 | - Available as a plugin for Triton 22 | 23 | 3. Intel Update 24 | - Please refer to slides for more details 25 | -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/triton-shared.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/meetups/10-25-2023/triton-shared.pptx -------------------------------------------------------------------------------- /docs/meetups/12-13-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Refactoring plan for 3rd party backends 5 | 2. Front end refactoring (AMD) 6 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton-Shared for GPU development? 7 | 8 | ##### Minutes: 9 | Recording link [here](https://youtu.be/Lo43DQYkOWM) 10 | 11 | 1. Refactoring plan for 3rd party backends 12 | - Refactoring to be completed by end of the year so that all GPU backends can be individual passes on Triton GPU IR instead of being completely out of tree. The goal is for users to get other GPUs besides CUDA when they install Triton. Non-GPU Triton IR is expected to stay as is. 13 | 2. Front end refactoring (AMD) 14 | - Will work with Phil for AMD-related refactoring. Will share more details in the next meetup about where AMD has diverged from Triton GPU IR and in the code flow. 15 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton-Shared for GPU development? 16 | - Can look at it on a case-by-case basis. 17 | -------------------------------------------------------------------------------- /docs/meetups/dev-meetup-2023.md: -------------------------------------------------------------------------------- 1 | The conference slides are available [here](https://drive.google.com/drive/folders/1yDFc4ElNN_GGhWDdMlM4wcm5uFEFFVQk?usp=sharing) 2 | 3 | The conference videos will be available [here](https://youtube.com/playlist?list=PLc_vA1r0qoiRZfUC3o4_yjj0FtWvodKAz&feature=shared) when ready. 4 | 5 | # Triton Developer Conference 6 | The Triton Developer Conference was held in a hybrid mode at the Microsoft Silicon Valley Campus in Mountain View, California. It took place on September 20th from 10am to 4pm, followed by a reception till 5:30 pm.
7 | 8 | Agenda for the conference: 9 | 10 | |Time |Title |Speaker 11 | |--------|-------|-------| 12 | |10:00 AM|Welcome|Kevin Scott (Microsoft)| 13 | |10:20 AM|The Triton Compiler: Past, Present and Future|Phil Tillet (OpenAI)| 14 | |11:00 AM|**Break**|| 15 | |11:20 AM|Hopper support in Triton|Gustav Zhu (Nvidia)| 16 | |11:40 AM|Bringing Triton to AMD GPUs|Jason Furmanek, Lixun Zhang (AMD)| 17 | |12:00 PM|Intel XPU Backend for Triton|Eikan Wang (Intel)| 18 | |12:20 PM|Vectorization of Triton Kernels for Qualcomm Hexagon Backend|Javed Absar (Qualcomm)| 19 | |12:30 PM|**Lunch**|| 20 | |1:40 PM |Triton for MTIA|Roman Levenstein et al, (Meta)| 21 | |2:00 PM |Using Triton IR for high-performance fusions in XLA|George Karpenkov (Google)| 22 | |2:20 PM |Triton for All: Triton as a device-independent language|Ian Bearman (Microsoft)| 23 | |2:40 PM|**Break**|| 24 | |3:00 PM|PyTorch 2.0 and TorchInductor|Jason Ansel, Horace He (Meta)| 25 | |3:20 PM|Pallas: A JAX Kernel Language|Sharad Vikram (Google)| 26 | |3:40 PM|Writing Grouped GEMMs in Triton|Vinod Grover (Nvidia)| 27 | |4:00 PM|**Reception**|| 28 | -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/cuda-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/cuda-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/triton-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-1/triton-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/halide-iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/halide-iteration.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/polyhedral-iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/docs/programming-guide/chapter-2/polyhedral-iteration.png -------------------------------------------------------------------------------- /docs/python-api/triton.rst: -------------------------------------------------------------------------------- 1 | triton 2 | ====== 3 | 4 | .. currentmodule:: triton 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | jit 11 | autotune 12 | heuristics 13 | Config 14 | -------------------------------------------------------------------------------- /docs/python-api/triton.testing.rst: -------------------------------------------------------------------------------- 1 | triton.testing 2 | ============== 3 | 4 | .. currentmodule:: triton.testing 5 | 6 | .. 
autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | Benchmark 11 | do_bench 12 | do_bench_cudagraph 13 | perf_report 14 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(triton) 2 | -------------------------------------------------------------------------------- /include/triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Target) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonCPUToLLVM) 2 | add_subdirectory(TritonGPUToLLVM) 3 | add_subdirectory(TritonToTritonCPU) 4 | add_subdirectory(TritonToTritonGPU) 5 | -------------------------------------------------------------------------------- /include/triton/Conversion/MLIRTypes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_MLIR_TYPES_H 2 | #define TRITON_CONVERSION_MLIR_TYPES_H 3 | 4 | #include "mlir/Transforms/DialectConversion.h" 5 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 6 | 7 | // This file redefines some common MLIR types for easy usage. 8 | namespace mlir { 9 | namespace triton { 10 | namespace type { 11 | 12 | // Integer types 13 | inline Type i32Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 32); } 14 | inline Type i16Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 16); } 15 | inline Type i8Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 8); } 16 | inline Type u32Ty(MLIRContext *ctx) { 17 | return IntegerType::get(ctx, 32, IntegerType::Unsigned); 18 | } 19 | inline Type u1Ty(MLIRContext *ctx) { 20 | return IntegerType::get(ctx, 1, IntegerType::Unsigned); 21 | } 22 | 23 | // Float types 24 | inline Type f16Ty(MLIRContext *ctx) { return FloatType::getF16(ctx); } 25 | inline Type f32Ty(MLIRContext *ctx) { return FloatType::getF32(ctx); } 26 | inline Type f64Ty(MLIRContext *ctx) { return FloatType::getF64(ctx); } 27 | inline Type bf16Ty(MLIRContext *ctx) { return FloatType::getBF16(ctx); } 28 | 29 | inline bool isFloat(Type type) { 30 | return type.isF32() || type.isF64() || type.isF16() || type.isF128(); 31 | } 32 | 33 | inline bool isInt(Type type) { return type.isIntOrFloat() && !isFloat(type); } 34 | 35 | } // namespace type 36 | } // namespace triton 37 | } // namespace mlir 38 | 39 | #endif // TRITON_CONVERSION_MLIR_TYPES_H 40 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUToLLVM) 3 | add_public_tablegen_target(TritonCPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H 2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include 
"mlir/Transforms/DialectConversion.h" 7 | 8 | #include 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc" 19 | 20 | std::unique_ptr> createConvertTritonCPUToLLVMPass(); 21 | 22 | #define GEN_PASS_REGISTRATION 23 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h.inc" 24 | 25 | } // namespace triton 26 | 27 | } // namespace mlir 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_PASSES 2 | #define TRITONCPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonCPUToLLVM : Pass<"convert-triton-cpu-to-llvm", "mlir::ModuleOp"> { 7 | let summary = "Convert TritonCPU to LLVM"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonCPUToLLVMPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::LLVM::LLVMDialect", 15 | "mlir::math::MathDialect", 16 | "mlir::scf::SCFDialect", 17 | "mlir::tensor::TensorDialect", 18 | "mlir::triton::cpu::TritonCPUDialect", 19 | "mlir::triton::TritonDialect"]; 20 | 21 | let options = [ 22 | ]; 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/PatternTritonCPUOpToLLVM.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_PATTERNS_TRITON_CPU_OP_TO_LLVM_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 6 | 7 | using namespace mlir; 8 | using namespace mlir::triton; 9 | 10 | namespace mlir { 11 | namespace triton { 12 | // Some populate* functions have name collisions with the ones for GPUs. 
13 | namespace cpu { 14 | 15 | constexpr int patternBenefitDefault = 1; 16 | constexpr int patternBenefitPrioritizeOverLLVMConversions = 10; 17 | constexpr int patternBenefitClampOptimizedPattern = 20; 18 | constexpr int patternBenefitConvertLayoutOptimizedPattern = 20; 19 | 20 | void populateControlFlowOpToLLVMPattern(LLVMTypeConverter &typeConverter, 21 | RewritePatternSet &patterns, 22 | PatternBenefit benefit); 23 | 24 | void populateFuncOpConversionPattern(LLVMTypeConverter &typeConverter, 25 | RewritePatternSet &patterns, 26 | PatternBenefit benefit); 27 | 28 | void populatePrintOpToLLVMPattern(LLVMTypeConverter &typeConverter, 29 | RewritePatternSet &patterns, 30 | PatternBenefit benefit); 31 | 32 | } // namespace cpu 33 | } // namespace triton 34 | } // namespace mlir 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H 2 | #define TRITONCPU_CONVERSION_TRITONCPUTOLLVM_TYPECONVERTER_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Conversion/MLIRTypes.h" 7 | #include "triton/Dialect/TritonCPU/IR/Types.h" 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton; 11 | 12 | class TritonCPUToLLVMTypeConverter : public LLVMTypeConverter { 13 | public: 14 | using TypeConverter::convertType; 15 | 16 | TritonCPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, 17 | const DataLayoutAnalysis *analysis = nullptr); 18 | 19 | Type convertTritonPointerType(triton::PointerType type); 20 | }; 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonCPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/Pattern.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Analysis/Utility.h" 7 | #include "triton/Conversion/MLIRTypes.h" 8 | #include "triton/Dialect/Triton/IR/Utility.h" 9 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 10 | #include "llvm/Support/ErrorHandling.h" 11 | 12 | using namespace mlir; 13 | using namespace mlir::triton; 14 | 15 | namespace mlir { 16 | namespace LLVM { 17 | 18 | // TODO: Not sure we need this for CPU backends. 
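// Entry-point ("kernel") functions are the ones with public visibility.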
19 | inline bool isKernel(FunctionOpInterface funcOp) { 20 | return funcOp.getVisibility() == SymbolTable::Visibility::Public; 21 | } 22 | 23 | } // namespace LLVM 24 | } // namespace mlir 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 3 | 4 | #include "mlir/IR/Value.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | #include "llvm/ADT/SmallVector.h" 7 | #include "llvm/ADT/StringExtras.h" 8 | #include "llvm/ADT/StringRef.h" 9 | #include <memory> 10 | #include <string> 11 | 12 | namespace mlir { 13 | class ConversionPatternRewriter; 14 | class Location; 15 | 16 | namespace triton { 17 | using llvm::StringRef; 18 | 19 | inline std::string strJoin(llvm::ArrayRef<llvm::StringRef> strs, 20 | llvm::StringRef delimiter) { 21 | return llvm::join(strs.begin(), strs.end(), delimiter); 22 | } 23 | 24 | } // namespace triton 25 | } // namespace mlir 26 | 27 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 28 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonGPUToLLVM) 3 | add_public_tablegen_target(TritonGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | 8 | #include <memory> 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template <typename T> class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 19 | 20 | namespace gpu { 21 | std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass(); 22 | 23 | } // namespace gpu 24 | 25 | #define GEN_PASS_REGISTRATION 26 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 27 | 28 | } // namespace triton 29 | 30 | } // namespace mlir 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCOMMONGPU_CONVERSION_PASSES 2 | #define TRITONCOMMONGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> { 7 | let summary = "Add metadata for shared memory allocation"; 8 | let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()"; 9 | } 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Patterns.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H 2 | #define
TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PATTERNS_H 3 | 4 | #include <functional> 5 | 6 | namespace mlir { 7 | class ModuleOp; 8 | class RankedTensorType; 9 | 10 | namespace triton::gpu { 11 | 12 | /// Replaces `blocked -> dot_op` with `blocked -> shared -> dot_op` in the given 13 | /// |module| op because the codegen doesn't handle `blocked -> dot_op` directly. 14 | void decomposeBlockedToDotLayoutConversion(ModuleOp module); 15 | 16 | /// Replaces `splat -> shared` with `splat -> blocked -> shared` in the given 17 | /// |module| op. 18 | void decomposeSplatOpToSharedLayoutConversion(ModuleOp module); 19 | 20 | /// Replaces `mma/mfma -> dot_op` with `mma/mfma -> blocked -> dot_op` in the 21 | /// given |module| op, but bypasses the decomposition if |shortcutFn| returns 22 | /// true. 23 | using ShortcutFn = std::function<bool(RankedTensorType, RankedTensorType)>; 24 | template <typename TensorCoreEncodingAttr> 25 | void decomposeTensorCoreToDotLayoutConversion(ModuleOp module, 26 | ShortcutFn shortcutFn); 27 | 28 | } // namespace triton::gpu 29 | 30 | } // namespace mlir 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H 2 | #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Conversion/MLIRTypes.h" 7 | #include "triton/Dialect/TritonGPU/IR/Types.h" 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton; 11 | 12 | class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter { 13 | public: 14 | using TypeConverter::convertType; 15 | 16 | TritonGPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, 17 | const DataLayoutAnalysis *analysis = nullptr); 18 | 19 | Type getElementTypeForStruct(TensorOrMemDesc type); 20 | Type convertTritonPointerType(triton::PointerType type); 21 | Type convertTritonTensorType(RankedTensorType type); 22 | Type convertMemDescType(MemDescType type); 23 | Type convertAsyncToken(triton::gpu::AsyncTokenType type); 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonCPU) 3 | add_public_tablegen_target(TritonConversionToCPUPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_CPU_PASSES_H 2 | #define TRITON_CONVERSION_TO_CPU_PASSES_H 3 | 4 | #include "triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | #define GEN_PASS_REGISTRATION 10 | #include "triton/Conversion/TritonToTritonCPU/Passes.h.inc" 11 | 12 | } // namespace triton 13 | } // namespace mlir 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_CPU_PASSES 2 | #define TRITON_CONVERSION_TO_CPU_PASSES
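// The pass definition below is expanded into pass declarations and
// registration hooks (Passes.h.inc) by the -gen-pass-decls tablegen rule in
// this directory's CMakeLists.txt.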
3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonToTritonCPU: Pass<"convert-triton-to-tritoncpu", "mlir::ModuleOp"> { 7 | let summary = "Convert Triton to TritonCPU"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonToTritonCPUPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::math::MathDialect", 15 | "mlir::scf::SCFDialect", 16 | "mlir::triton::cpu::TritonCPUDialect", 17 | "mlir::triton::TritonDialect"]; 18 | 19 | let options = [ 20 | ]; 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H 2 | #define TRITON_CONVERSION_TRITONTOTRITONCPU_TRITONTOTRITONCPUPASS_H 3 | 4 | #include 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template class OperationPass; 10 | 11 | namespace triton { 12 | 13 | std::unique_ptr> createConvertTritonToTritonCPUPass(); 14 | 15 | } // namespace triton 16 | } // namespace mlir 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonGPU) 3 | add_public_tablegen_target(TritonConversionToGPUPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES_H 2 | #define TRITON_CONVERSION_TO_GPU_PASSES_H 3 | 4 | #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | #define GEN_PASS_REGISTRATION 10 | #include "triton/Conversion/TritonToTritonGPU/Passes.h.inc" 11 | 12 | } // namespace triton 13 | } // namespace mlir 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES 2 | #define TRITON_CONVERSION_TO_GPU_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertTritonToTritonGPU: Pass<"convert-triton-to-tritongpu", "mlir::ModuleOp"> { 7 | let summary = "Convert Triton to TritonGPU"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertTritonToTritonGPUPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::math::MathDialect", 15 | // TODO: Does this pass depend on SCF? 
16 | "mlir::scf::SCFDialect", 17 | "mlir::triton::TritonDialect", 18 | "mlir::triton::gpu::TritonGPUDialect"]; 19 | 20 | let options = [ 21 | Option<"numWarps", "num-warps", 22 | "int32_t", /*default*/"4", 23 | "number of warps">, 24 | 25 | Option<"threadsPerWarp", "threads-per-warp", 26 | "int32_t", /*default*/"32", 27 | "number of threads per warp">, 28 | Option<"numCTAs", "num-ctas", 29 | "int32_t", /*default*/"1", 30 | "number of ctas in a cga">, 31 | Option<"computeCapability", "compute-capability", 32 | "int32_t", /*default*/"80", 33 | "compute capability"> 34 | ]; 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 2 | #define TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 3 | 4 | #include 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template class OperationPass; 10 | 11 | namespace triton { 12 | 13 | constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps"; 14 | constexpr static char AttrNumCTAsName[] = "triton_gpu.num-ctas"; 15 | constexpr static char AttrComputeCapabilityName[] = 16 | "triton_gpu.compute-capability"; 17 | 18 | constexpr static char AttrNumThreadsPerWarp[] = "triton_gpu.threads-per-warp"; 19 | 20 | // Create the pass with numWarps passed from cl::opt. 21 | std::unique_ptr> createConvertTritonToTritonGPUPass(); 22 | 23 | // Create the pass with numWarps set explicitly. 24 | std::unique_ptr> 25 | createConvertTritonToTritonGPUPass(int numWarps, int threadsPerWarp = 32, 26 | int numCTAs = 1, int computeCapability = 80); 27 | 28 | } // namespace triton 29 | } // namespace mlir 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /include/triton/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | add_subdirectory(NVGPU) 6 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | #add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS NVGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu) 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) 7 | mlir_tablegen(Ops.h.inc -gen-op-decls) 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 9 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 10 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 11 | add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc) 12 | add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc) 13 | add_public_tablegen_target(NVGPUTableGen) 14 | 15 | set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td) 16 | mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls) 17 | mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs) 18 | 
add_public_tablegen_target(NVGPUAttrDefsIncGen) 19 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/NVGPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef NVGPU_ATTRDEFS 23 | #define NVGPU_ATTRDEFS 24 | 25 | include "triton/Dialect/NVGPU/IR/NVGPUDialect.td" 26 | include "mlir/IR/AttrTypeBase.td" 27 | 28 | class NVGPU_Attr<string name, list<Trait> traits = [], 29 | string baseCppClass = "::mlir::Attribute"> 30 | : AttrDef<NVGPU_Dialect, name, traits, baseCppClass> { 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/triton/Dialect/NVGPU/IR/NVGPUDialect.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef NVGPU_DIALECT 23 | #define NVGPU_DIALECT 24 | 25 | include "mlir/IR/OpBase.td" 26 | 27 | def NVGPU_Dialect : Dialect { 28 | let name = "nvgpu"; 29 | let cppNamespace = "::mlir::triton::nvgpu"; 30 | 31 | let description = [{ 32 | NVGPU Dialect.
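    This dialect holds lightweight NVIDIA-specific operations used late in the lowering pipeline; they map closely onto PTX instructions, which is why the LLVM dialect is the only dependency declared below.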
33 | }]; 34 | 35 | let dependentDialects = [ 36 | "mlir::LLVM::LLVMDialect" 37 | ]; 38 | } 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonOps.td) 4 | mlir_tablegen(Ops.h.inc -gen-op-decls) 5 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 6 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 7 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 8 | add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc) 9 | 10 | set(LLVM_TARGET_DEFINITIONS TritonDialect.td) 11 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls) 12 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) 13 | add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc) 14 | 15 | set(LLVM_TARGET_DEFINITIONS TritonTypes.td) 16 | mlir_tablegen(Types.h.inc -gen-typedef-decls) 17 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs) 18 | 19 | set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) 20 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) 21 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) 22 | 23 | set(LLVM_TARGET_DEFINITIONS TritonTypeInterfaces.td) 24 | mlir_tablegen(TritonTypeInterfaces.h.inc -gen-type-interface-decls) 25 | mlir_tablegen(TritonTypeInterfaces.cpp.inc -gen-type-interface-defs) 26 | 27 | add_public_tablegen_target(TritonTableGen) 28 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Interfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_INTERFACES_H_ 2 | #define TRITON_IR_INTERFACES_H_ 3 | 4 | #include "mlir/IR/OpDefinition.h" 5 | 6 | #define GET_TYPEDEF_CLASSES 7 | #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" 8 | 9 | #endif // TRITON_IR_INTERFACES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT 2 | #define TRITON_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def Triton_Dialect : Dialect { 7 | let name = "tt"; 8 | 9 | let cppNamespace = "::mlir::triton"; 10 | 11 | let summary = "The Triton IR in MLIR"; 12 | 13 | let description = [{ 14 | Triton Dialect. 15 | 16 | Dependent Dialects: 17 | * Arith: 18 | * addf, addi, andi, cmpf, cmpi, divf, fptosi, ... 19 | * Math: 20 | * exp, sin, cos, log, ...
21 | * StructuredControlFlow: 22 | * for, if, while, yield, condition 23 | * ControlFlow: 24 | * br, cond_br 25 | }]; 26 | 27 | let dependentDialects = [ 28 | "arith::ArithDialect", 29 | "math::MathDialect", 30 | "scf::SCFDialect", 31 | "cf::ControlFlowDialect" 32 | ]; 33 | 34 | let extraClassDeclaration = [{ 35 | void registerTypes(); 36 | }]; 37 | 38 | let hasConstantMaterializer = 1; 39 | let useDefaultTypePrinterParser = 1; 40 | let usePropertiesForAttributes = 1; 41 | } 42 | 43 | include "triton/Dialect/Triton/IR/TritonTypes.td" 44 | 45 | 46 | #endif // TRITON_DIALECT 47 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonInterfaces.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_INTERFACES 2 | #define TRITON_INTERFACES 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">; 7 | def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">; 8 | def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">; 9 | def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">; 10 | def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">; 11 | def SameLoadStoreOperandsAndResultShape : NativeOpTrait<"SameLoadStoreOperandsAndResultShape">; 12 | def SameLoadStoreOperandsEncoding : NativeOpTrait<"SameLoadStoreOperandsEncoding">; 13 | def SameLoadStoreOperandsAndResultEncoding : NativeOpTrait<"SameLoadStoreOperandsAndResultEncoding">; 14 | 15 | #endif // TRITON_INTERFACES 16 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonTypeInterfaces.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TYPE_INTERFACES 2 | #define TRITON_TYPE_INTERFACES 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | // Interface dynamically attached to RankedTensorType and MemDescType. 
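// This interface lets lowering code query shape, rank, element type, and
// encoding uniformly, e.g. cast<TensorOrMemDesc>(type).getEncoding(), without
// first dispatching on the concrete type.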
7 | def TT_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { 8 | let cppNamespace = "::mlir"; 9 | let methods = [ 10 | InterfaceMethod<"Returns the encoding of the tensor or memory descriptor", 11 | "mlir::Attribute", "getEncoding", (ins)>, 12 | InterfaceMethod<"Returns element type", 13 | "mlir::Type", "getElementType", (ins)>, 14 | InterfaceMethod<"Returns the type shape", 15 | "llvm::ArrayRef<int64_t>", "getShape", (ins)>, 16 | InterfaceMethod<"Returns the tensor or buffer rank", 17 | "int64_t", "getRank", (ins)>, 18 | InterfaceMethod<"Returns the element type bit width", 19 | "int64_t", "getElementTypeBitWidth", (ins)>, 20 | 21 | ]; 22 | } 23 | 24 | #endif // TRITON_TYPE_INTERFACES 25 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_TYPES_H_ 2 | #define TRITON_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/BuiltinTypes.h" 5 | #include "mlir/IR/TypeSupport.h" 6 | #include "mlir/IR/Types.h" 7 | 8 | #define GET_TYPEDEF_CLASSES 9 | #include "triton/Dialect/Triton/IR/Types.h.inc" 10 | 11 | #include "triton/Dialect/Triton/IR/TritonTypeInterfaces.h.inc" 12 | 13 | namespace mlir { 14 | 15 | namespace triton { 16 | 17 | bool isTensorPointerType(Type type); 18 | 19 | bool isTensorOrTensorPointerType(Type type); 20 | 21 | unsigned getPointeeBitWidth(Type type); 22 | 23 | Type getPointeeType(Type type); 24 | 25 | Type getPointerType(Type type); 26 | 27 | Type getElementTypeOfTensorPointerType(Type type); 28 | 29 | Type getI1SameShape(Type type); 30 | 31 | Type getI32SameShape(Type type); 32 | 33 | Type getPointerTypeSameShape(Type type); 34 | 35 | } // namespace triton 36 | 37 | } // namespace mlir 38 | 39 | #endif // TRITON_IR_TYPES_H_ 40 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton) 3 | add_public_tablegen_target(TritonTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | std::unique_ptr<Pass> createCombineOpsPass(); 10 | 11 | std::unique_ptr<Pass> createReorderBroadcastPass(); 12 | std::unique_ptr<Pass> createRewriteTensorPointerPass(); 13 | 14 | } // namespace triton 15 | 16 | #define GEN_PASS_REGISTRATION 17 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc" 18 | 19 | } // namespace mlir 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 2 | #define
TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h" 5 | 6 | #define GET_ATTRDEF_CLASSES 7 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.h.inc" 8 | 9 | #endif // TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonCPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_cpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_cpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_cpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_cpu) 10 | add_mlir_doc(TritonCPUDialect TritonCPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonCPUOps TritonCPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonCPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonCPUAttrDefs.td) 15 | mlir_tablegen(TritonCPUAttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(TritonCPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(TritonCPUAttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(TritonCPUAttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonCPUAttrDefsIncGen) 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Dialect.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 2 | #define TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 3 | 4 | #include "mlir/Dialect/Tensor/IR/Tensor.h" 5 | #include "mlir/IR/BuiltinOps.h" 6 | #include "mlir/IR/Dialect.h" 7 | 8 | // TritonCPU depends on Triton 9 | #include "triton/Dialect/Triton/IR/Dialect.h" 10 | #include "triton/Dialect/TritonCPU/IR/Attributes.h" 11 | #include "triton/Dialect/TritonCPU/IR/Dialect.h.inc" 12 | #include "triton/Dialect/TritonCPU/IR/Types.h" 13 | 14 | #define GET_OP_CLASSES 15 | #include "triton/Dialect/TritonCPU/IR/Ops.h.inc" 16 | 17 | #endif // TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 18 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_ATTRDEFS 2 | #define TRITONCPU_ATTRDEFS 3 | 4 | include "mlir/IR/AttrTypeBase.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 6 | include "triton/Dialect/Triton/IR/TritonInterfaces.td" 7 | 8 | //===----------------------------------------------------------------------===// 9 | // TritonCPU Attribute Definitions 10 | //===----------------------------------------------------------------------===// 11 | def TritonCPU_AttrTrait : AttrInterface<"TritonCPU_AttrTrait"> { 12 | let cppNamespace = "::mlir::triton::cpu"; 13 | } 14 | 15 | class TritonCPU_Attr<string name, list<Trait> traits = [], 16 | Dialect dialect = TritonCPU_Dialect, 17 | string baseCppClass = "::mlir::Attribute"> 18 | : AttrDef<dialect, name, traits, baseCppClass> { 19 | 20 | let description = [{ 21 | WIP...
22 | }]; 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_DIALECT 2 | #define TRITONCPU_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TritonCPU_Dialect : Dialect { 7 | let name = "triton_cpu"; 8 | 9 | let cppNamespace = "::mlir::triton::cpu"; 10 | 11 | let hasOperationAttrVerify = 1; 12 | 13 | let description = [{ 14 | Triton CPU Dialect. 15 | }]; 16 | 17 | let dependentDialects = [ 18 | "triton::TritonDialect", 19 | "tensor::TensorDialect", 20 | ]; 21 | 22 | let extraClassDeclaration = [{ 23 | void registerTypes(); 24 | }]; 25 | 26 | let useDefaultTypePrinterParser = 1; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CPU_DIALECT_INTERFACES_H 2 | #define TRITON_CPU_DIALECT_INTERFACES_H 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrInterfaces.h.inc" 5 | 6 | #endif // TRITON_CPU_DIALECT_INTERFACES_H 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUOps.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_OPS 2 | #define TRITONCPU_OPS 3 | 4 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUTypes.td" 6 | include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td" 7 | include "mlir/Dialect/Arith/IR/ArithBase.td" 8 | include "triton/Dialect/Triton/IR/TritonTypes.td" 9 | include "triton/Dialect/Triton/IR/TritonAttrDefs.td" 10 | include "mlir/IR/OpBase.td" 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUTypes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_TYPES 2 | #define TRITONCPU_TYPES 3 | 4 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 5 | include "mlir/IR/AttrTypeBase.td" 6 | 7 | class TTC_TypeDef<string name, string _mnemonic, list<Trait> traits = []> 8 | : TypeDef<TritonCPU_Dialect, name, traits> { 9 | let mnemonic = _mnemonic; 10 | } 11 | 12 | def TTC_TokenType : TTC_TypeDef<"Token", "token"> { 13 | let parameters = (ins "int32_t":$type); 14 | 15 | let builders = [ 16 | TypeBuilder<(ins "unsigned":$type), [{ 17 | return $_get($_ctxt, type); 18 | }]> 19 | ]; 20 | 21 | let hasCustomAssemblyFormat = 1; 22 | 23 | let skipDefaultBuilders = 1; 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_IR_TYPES_H_ 2 | #define TRITONCPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/TypeSupport.h" 5 | #include "mlir/IR/Types.h" 6 | 7 | #define GET_TYPEDEF_CLASSES 8 | #include "triton/Dialect/TritonCPU/IR/Types.h.inc" 9 | 10 | #endif // TRITONCPU_IR_TYPES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc
-gen-pass-decls -name TritonCPU) 3 | add_public_tablegen_target(TritonCPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONCPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | namespace cpu {} // namespace cpu 9 | } // namespace triton 10 | 11 | /// Generate the code for registering passes. 12 | #define GEN_PASS_REGISTRATION 13 | #include "triton/Dialect/TritonCPU/Transforms/Passes.h.inc" 14 | 15 | } // namespace mlir 16 | #endif 17 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_PASSES 2 | #define TRITONCPU_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/Transforms/TritonCPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonCPU dialect. 4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonCPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonCPUTypeConverter(MLIRContext *context); 17 | 18 | private: 19 | MLIRContext *context; 20 | }; 21 | 22 | class TritonCPUConversionTarget : public ConversionTarget { 23 | 24 | public: 25 | explicit TritonCPUConversionTarget(MLIRContext &ctx, 26 | TritonCPUTypeConverter &typeConverter); 27 | }; 28 | 29 | } // namespace mlir 30 | 31 | #endif // TRITON_DIALECT_TRITONCPU_TRANSFORMS_TRITONCPUCONVERSION_H_ 32 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" 5 | 6 | #define GET_ATTRDEF_CLASSES 7 | #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc" 8 | 9 | #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu) 5 | 
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu) 10 | add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonGPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) 15 | mlir_tablegen(TritonGPUAttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(TritonGPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonGPUAttrDefsIncGen) 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_GPU_DIALECT_INTERFACES_H 2 | #define TRITON_GPU_DIALECT_INTERFACES_H 3 | 4 | #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrInterfaces.h.inc" 5 | 6 | #endif // TRITON_GPU_DIALECT_INTERFACES_H 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUTypes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_TYPES 2 | #define TRITONGPU_TYPES 3 | 4 | include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" 5 | include "mlir/IR/AttrTypeBase.td" 6 | 7 | class TTG_TypeDef<string name, string _mnemonic, list<Trait> traits = []> 8 | : TypeDef<TritonGPU_Dialect, name, traits> { 9 | let mnemonic = _mnemonic; 10 | } 11 | 12 | def TTG_TokenType : TTG_TypeDef<"Token", "token"> { 13 | let parameters = (ins "int32_t":$type); 14 | 15 | let builders = [ 16 | TypeBuilder<(ins "unsigned":$type), [{ 17 | return $_get($_ctxt, type); 18 | }]> 19 | ]; 20 | 21 | let hasCustomAssemblyFormat = 1; 22 | 23 | let skipDefaultBuilders = 1; 24 | } 25 | 26 | def TTG_AsyncToken : TTG_TypeDef<"AsyncToken", 27 | "async.token", []> { 28 | let summary = "async token type"; 29 | let description = [{ 30 | `ttg.async.token` is a type returned by an asynchronous operation. 31 | It is used to establish an SSA-based link between async operations 32 | and operations that group or synchronize the async operations.
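    For example, an asynchronous copy can return a token, and a later wait-style operation takes that token as an operand, which makes the ordering dependency explicit in SSA form.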
33 | }]; 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_IR_TYPES_H_ 2 | #define TRITONGPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/TypeSupport.h" 5 | #include "mlir/IR/Types.h" 6 | 7 | #define GET_TYPEDEF_CLASSES 8 | #include "triton/Dialect/TritonGPU/IR/Types.h.inc" 9 | 10 | #endif // TRITONGPU_IR_TYPES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU) 3 | add_public_tablegen_target(TritonGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" 6 | 7 | namespace mlir { 8 | namespace triton { 9 | namespace gpu { 10 | 11 | std::unique_ptr<Pass> createPipelinePass(int numStages = 3, int numWarps = 4, 12 | int numCTAs = 1, 13 | int computeCapability = 80); 14 | 15 | std::unique_ptr<Pass> createAccelerateMatmulPass(int computeCapability = 80); 16 | 17 | std::unique_ptr<Pass> createF32DotTCPass(); 18 | 19 | std::unique_ptr<Pass> createPrefetchPass(); 20 | 21 | std::unique_ptr<Pass> createCoalescePass(); 22 | 23 | std::unique_ptr<Pass> createReorderInstructionsPass(); 24 | 25 | std::unique_ptr<Pass> createReduceDataDuplicationPass(); 26 | 27 | std::unique_ptr<Pass> createRemoveLayoutConversionsPass(); 28 | 29 | std::unique_ptr<Pass> createVerifier(); 30 | 31 | std::unique_ptr<Pass> createOptimizeDotOperandsPass(); 32 | 33 | std::unique_ptr<Pass> createOptimizeThreadLocalityPass(); 34 | 35 | } // namespace gpu 36 | } // namespace triton 37 | 38 | /// Generate the code for registering passes. 39 | #define GEN_PASS_REGISTRATION 40 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 41 | 42 | } // namespace mlir 43 | #endif 44 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect.
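//
// TritonGPUTypeConverter attaches a default layout encoding to plain tensor
// types (derived from numWarps, threadsPerWarp, and numCTAs), and
// TritonGPUConversionTarget treats operations as legal once their tensor
// types carry such an encoding.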
4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_nvidia_gpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_nvidia_gpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_nvidia_gpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_nvidia_gpu) 10 | add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonNvidiaGPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td) 15 | mlir_tablegen(TritonNvidiaGPUAttrDefs.h.inc -gen-attrdef-decls) 16 | mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs) 17 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 18 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 19 | add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen) 20 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining 4 | // a copy of this software and associated documentation files 5 | // (the "Software"), to deal in the Software without restriction, 6 | // including without limitation the rights to use, copy, modify, merge, 7 | // publish, distribute, sublicense, and/or sell copies of the Software, 8 | // and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be 12 | // included in all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | #ifndef TRITONNVIDIAGPU_ATTRDEFS 23 | #define TRITONNVIDIAGPU_ATTRDEFS 24 | 25 | include "mlir/IR/AttrTypeBase.td" 26 | include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td" 27 | include "triton/Dialect/Triton/IR/TritonInterfaces.td" 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 NVIDIA Corporation & Affiliates. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files 6 | * (the "Software"), to deal in the Software without restriction, 7 | * including without limitation the rights to use, copy, modify, merge, 8 | * publish, distribute, sublicense, and/or sell copies of the Software, 9 | * and to permit persons to whom the Software is furnished to do so, 10 | * subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | #ifndef TRITONNVIDIAGPU_IR_TYPES_H_ 25 | #define TRITONNVIDIAGPU_IR_TYPES_H_ 26 | 27 | #include "mlir/IR/TypeSupport.h" 28 | #include "mlir/IR/Types.h" 29 | 30 | #define GET_TYPEDEF_CLASSES 31 | #include "triton/Dialect/TritonNvidiaGPU/IR/Types.h.inc" 32 | 33 | #endif // TRITONNVIDIAGPU_IR_TYPES_H_ 34 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonNvidiaGPU) 3 | add_public_tablegen_target(TritonNvidiaGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name LLVMIR) 3 | add_public_tablegen_target(LLVMIRIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVM_IR_PASSES_H 2 | #define TRITON_TARGET_LLVM_IR_PASSES_H 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | 8 | /// Create a pass to add DIScope 9 | std::unique_ptr<Pass> createLLVMDIScopePass(); 10 | 11 | /// Generate the code for registering conversion passes. 12 | #define GEN_PASS_REGISTRATION 13 | #include "triton/Target/LLVMIR/Passes.h.inc" 14 | 15 | } // namespace mlir 16 | 17 | #endif // TRITON_TARGET_LLVM_IR_PASSES_H 18 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVMIR_PASSES 2 | #define TRITON_TARGET_LLVMIR_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def LLVMDIScope: Pass<"enable-line-info", "mlir::ModuleOp"> { 7 | let summary = "Materialize LLVM line info"; 8 | let description = [{ 9 | This pass materializes line mapping information for LLVM IR dialect operations. 10 | }]; 11 | 12 | let constructor = "mlir::createLLVMDIScopePass()"; 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Tools/Sys/GetPlatform.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. 3 | * 4 | * This file is part of ISAAC. 5 | * 6 | * ISAAC is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version. 10 | * 11 | * This library is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * Lesser General Public License for more details.
15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef TDL_TOOLS_SYS_GETPLATFORM_HPP 23 | #define TDL_TOOLS_SYS_GETPLATFORM_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // inline bool _isROCM = false; 33 | // inline void setROCM() { _isROCM = true; } 34 | // inline bool isROCM() { return _isROCM; } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAnalysis 2 | AxisInfo.cpp 3 | Allocation.cpp 4 | Membar.cpp 5 | Alias.cpp 6 | Utility.cpp 7 | 8 | DEPENDS 9 | TritonTableGen 10 | TritonGPUAttrDefsIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRAnalysis 14 | MLIRLLVMDialect 15 | TritonIR 16 | TritonGPUIR 17 | TritonNvidiaGPUIR 18 | ) 19 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # add_subdirectory(codegen) 2 | add_subdirectory(Analysis) 3 | add_subdirectory(Conversion) 4 | add_subdirectory(Dialect) 5 | add_subdirectory(Target) 6 | -------------------------------------------------------------------------------- /lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonToTritonCPU) 2 | add_subdirectory(TritonToTritonGPU) 3 | add_subdirectory(TritonCPUToLLVM) 4 | add_subdirectory(TritonGPUToLLVM) 5 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUToLLVM 2 | ControlFlowOpToLLVM.cpp 3 | FuncOpToLLVM.cpp 4 | TypeConverter.cpp 5 | TritonCPUToLLVM.cpp 6 | 7 | DEPENDS 8 | TritonCPUConversionPassIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRPass 13 | TritonAnalysis 14 | TritonIR 15 | TritonCPUIR 16 | TritonCPUTransforms 17 | ) 18 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/ControlFlowOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonCPUToLLVM/PatternTritonCPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonCPUToLLVM/Utility.h" 3 | #include "llvm/Support/ErrorHandling.h" 4 | 5 | namespace { 6 | 7 | using namespace mlir; 8 | using namespace mlir::triton; 9 | 10 | struct ReturnOpConversion : public ConvertOpToLLVMPattern<triton::ReturnOp> { 11 | using ConvertOpToLLVMPattern<triton::ReturnOp>::ConvertOpToLLVMPattern; 12 | 13 | LogicalResult 14 | matchAndRewrite(triton::ReturnOp op, OpAdaptor adaptor, 15 | ConversionPatternRewriter &rewriter) const override { 16 | auto funcOp = op->getParentOfType<LLVM::LLVMFuncOp>(); 17 | if (funcOp->hasAttr("cpu.kernel")) { 18 | if (op.getNumOperands() > 0) { 19 | return rewriter.notifyMatchFailure( 20 | op, "Kernel functions do not support return with operands"); 21 | } 22 | rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, TypeRange(), ValueRange(), 23 | op->getAttrs()); 24 | } else { 25 | llvm_unreachable("Not implemented"); 26 | } 27 | return success(); 28 | } 29 | }; 30 | 31 | } // namespace 32 | 33 | void
mlir::triton::cpu::populateControlFlowOpToLLVMPattern( 34 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 35 | PatternBenefit benefit) { 36 | patterns.add<ReturnOpConversion>(typeConverter, benefit); 37 | } 38 | -------------------------------------------------------------------------------- /lib/Conversion/TritonCPUToLLVM/TypeConverter.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonCPUToLLVM/TypeConverter.h" 2 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 3 | #include "triton/Conversion/MLIRTypes.h" 4 | #include "llvm/Support/ErrorHandling.h" 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | TritonCPUToLLVMTypeConverter::TritonCPUToLLVMTypeConverter( 10 | MLIRContext *ctx, LowerToLLVMOptions &option, 11 | const DataLayoutAnalysis *analysis) 12 | : LLVMTypeConverter(ctx, option, analysis) { 13 | addConversion([&](triton::PointerType type) -> std::optional<Type> { 14 | return convertTritonPointerType(type); 15 | }); 16 | 17 | // Internally store bfloat16 as int16 18 | addConversion([&](BFloat16Type type) -> std::optional<Type> { 19 | return IntegerType::get(type.getContext(), 16); 20 | }); 21 | } 22 | 23 | Type TritonCPUToLLVMTypeConverter::convertTritonPointerType( 24 | triton::PointerType type) { 25 | auto ctx = type.getContext(); 26 | auto pointeeType = type.getPointeeType(); 27 | if (pointeeType.isa<RankedTensorType>()) { 28 | llvm_unreachable("Not implemented"); 29 | } 30 | return LLVM::LLVMPointerType::get(ctx, type.getAddressSpace()); 31 | } 32 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp 3 | DotOpToLLVM/FMA.cpp 4 | TypeConverter.cpp 5 | Utility.cpp 6 | ElementwiseOpToLLVM.cpp 7 | MemoryOpToLLVM.cpp 8 | AssertOpToLLVM.cpp 9 | ViewOpToLLVM.cpp 10 | MakeRangeOpToLLVM.cpp 11 | HistogramOpToLLVM.cpp 12 | AllocateSharedMemory.cpp 13 | ReduceOpToLLVM.cpp 14 | ScanOpToLLVM.cpp 15 | ConvertLayoutOpToLLVM.cpp 16 | ControlFlowOpToLLVM.cpp 17 | FuncOpToLLVM.cpp 18 | SPMDOpToLLVM.cpp 19 | DecomposeUnsupportedConversions.cpp 20 | PrintOpToLLVM.cpp 21 | 22 | DEPENDS 23 | TritonGPUConversionPassIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | MLIRIR 27 | MLIRPass 28 | MLIRGPUDialect 29 | MLIRGPUToNVVMTransforms 30 | MLIRGPUToROCDLTransforms 31 | MLIRGPUTransforms 32 | TritonAnalysis 33 | TritonIR 34 | TritonGPUIR 35 | TritonGPUTransforms 36 | TritonNvidiaGPUTransforms 37 | NVGPUIR 38 | ) 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetProgramIdOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetProgramIdOp> { 11 | explicit GetProgramIdOpConversion(LLVMTypeConverter &typeConverter, 12 | const TargetInfoBase &targetInfo, 13 | PatternBenefit benefit = 1) 14 | : ConvertOpToLLVMPattern<triton::GetProgramIdOp>(typeConverter, benefit), 15 | targetInfo(targetInfo) {} 16 | 17 | LogicalResult 18 | matchAndRewrite(triton::GetProgramIdOp op, OpAdaptor adaptor, 19 | ConversionPatternRewriter &rewriter) const override { 20 | Value programId =
22 | 23 | Type TritonCPUToLLVMTypeConverter::convertTritonPointerType( 24 | triton::PointerType type) { 25 | auto ctx = type.getContext(); 26 | auto pointeeType = type.getPointeeType(); 27 | if (pointeeType.isa<RankedTensorType>()) { 28 | llvm_unreachable("Not implemented"); 29 | } 30 | return LLVM::LLVMPointerType::get(ctx, type.getAddressSpace()); 31 | } 32 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp 3 | DotOpToLLVM/FMA.cpp 4 | TypeConverter.cpp 5 | Utility.cpp 6 | ElementwiseOpToLLVM.cpp 7 | MemoryOpToLLVM.cpp 8 | AssertOpToLLVM.cpp 9 | ViewOpToLLVM.cpp 10 | MakeRangeOpToLLVM.cpp 11 | HistogramOpToLLVM.cpp 12 | AllocateSharedMemory.cpp 13 | ReduceOpToLLVM.cpp 14 | ScanOpToLLVM.cpp 15 | ConvertLayoutOpToLLVM.cpp 16 | ControlFlowOpToLLVM.cpp 17 | FuncOpToLLVM.cpp 18 | SPMDOpToLLVM.cpp 19 | DecomposeUnsupportedConversions.cpp 20 | PrintOpToLLVM.cpp 21 | 22 | DEPENDS 23 | TritonGPUConversionPassIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | MLIRIR 27 | MLIRPass 28 | MLIRGPUDialect 29 | MLIRGPUToNVVMTransforms 30 | MLIRGPUToROCDLTransforms 31 | MLIRGPUTransforms 32 | TritonAnalysis 33 | TritonIR 34 | TritonGPUIR 35 | TritonGPUTransforms 36 | TritonNvidiaGPUTransforms 37 | NVGPUIR 38 | ) 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetProgramIdOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetProgramIdOp> { 11 | explicit GetProgramIdOpConversion(LLVMTypeConverter &typeConverter, 12 | const TargetInfoBase &targetInfo, 13 | PatternBenefit benefit = 1) 14 | : ConvertOpToLLVMPattern<triton::GetProgramIdOp>(typeConverter, benefit), 15 | targetInfo(targetInfo) {} 16 | 17 | LogicalResult 18 | matchAndRewrite(triton::GetProgramIdOp op, OpAdaptor adaptor, 19 | ConversionPatternRewriter &rewriter) const override { 20 | Value programId = targetInfo.programId(rewriter, op->getLoc(), 21 | op->getParentOfType<ModuleOp>(), 22 | op.getAxisAsInt()); 23 | rewriter.replaceOp(op, programId); 24 | return success(); 25 | } 26 | 27 | private: 28 | const TargetInfoBase &targetInfo; 29 | }; 30 | 31 | } // namespace 32 | 33 | void mlir::triton::populateSPMDOpToLLVMPattern(LLVMTypeConverter &typeConverter, 34 | RewritePatternSet &patterns, 35 | const TargetInfoBase &targetInfo, 36 | PatternBenefit benefit) { 37 | patterns.add<GetProgramIdOpConversion>(typeConverter, targetInfo, benefit); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonCPU 2 | TritonCPUConversion.cpp 3 | TritonToTritonCPUPass.cpp 4 | 5 | DEPENDS 6 | TritonConversionToCPUPassIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRPass 11 | MLIRTransforms 12 | TritonIR 13 | TritonCPUIR 14 | TritonCPUTransforms 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonToTritonCPU/TritonToTritonCPUPass.h" 2 | 3 | #include "mlir/Dialect/Arith/IR/Arith.h" 4 | #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" 5 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" 6 | #include "mlir/Dialect/Index/IR/IndexDialect.h" 7 | #include "mlir/Pass/Pass.h" 8 | #include "mlir/Transforms/DialectConversion.h" 9 | #include "triton/Analysis/Utility.h" 10 | #include "triton/Dialect/Triton/IR/Dialect.h" 11 | #include "triton/Dialect/Triton/IR/Utility.h" 12 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 13 | #include "triton/Dialect/TritonCPU/Transforms/TritonCPUConversion.h" 14 | #include "llvm/ADT/APSInt.h" 15 | #include <memory> 16 | 17 | #define GEN_PASS_CLASSES 18 | #include "triton/Conversion/TritonToTritonCPU/Passes.h.inc" 19 | 20 | namespace { 21 | 22 | using namespace mlir; 23 | using namespace mlir::triton; 24 | using namespace mlir::triton::cpu; 25 | 26 | class ConvertTritonToTritonCPU 27 | : public ConvertTritonToTritonCPUBase<ConvertTritonToTritonCPU> { 28 | public: 29 | ConvertTritonToTritonCPU() = default; 30 | 31 | void runOnOperation() override { 32 | // TODO:
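// A minimal sketch of what this TODO will grow into, following the usual
// MLIR dialect-conversion recipe. The names TritonCPUTypeConverter and
// TritonCPUConversionTarget are assumptions modeled on the GPU counterpart
// (TritonGPUTypeConverter / TritonGPUConversionTarget), not implemented API:
//
//   MLIRContext *context = &getContext();
//   ModuleOp mod = getOperation();
//   TritonCPUTypeConverter typeConverter(context);
//   TritonCPUConversionTarget target(*context, typeConverter);
//   RewritePatternSet patterns(context);
//   // populate patterns for arith/math/control-flow/Triton ops here
//   if (failed(applyPartialConversion(mod, target, std::move(patterns))))
//     return signalPassFailure();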
33 | } 34 | }; 35 | 36 | } // namespace 37 | 38 | std::unique_ptr<OperationPass<ModuleOp>> 39 | mlir::triton::createConvertTritonToTritonCPUPass() { 40 | return std::make_unique<::ConvertTritonToTritonCPU>(); 41 | } 42 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonGPU 2 | TritonGPUConversion.cpp 3 | TritonToTritonGPUPass.cpp 4 | 5 | DEPENDS 6 | TritonConversionToGPUPassIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRPass 11 | MLIRTransforms 12 | TritonIR 13 | TritonGPUIR 14 | TritonGPUTransforms 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | add_subdirectory(NVGPU) 6 | -------------------------------------------------------------------------------- /lib/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /lib/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUIR 2 | Dialect.cpp 3 | 4 | DEPENDS 5 | NVGPUTableGen 6 | NVGPUAttrDefsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRLLVMDialect 10 | ) 11 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonIR 2 | Dialect.cpp 3 | Ops.cpp 4 | Types.cpp 5 | Traits.cpp 6 | 7 | DEPENDS 8 | TritonTableGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRArithDialect 13 | MLIRMathDialect 14 | MLIRSCFDialect 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Combine.td) 2 | mlir_tablegen(TritonCombine.inc -gen-rewriters) 3 | add_public_tablegen_target(TritonCombineIncGen) 4 | 5 | add_triton_library(TritonTransforms 6 | Combine.cpp 7 | ReorderBroadcast.cpp 8 | RewriteTensorPointer.cpp 9 | 10 | DEPENDS 11 | TritonTransformsIncGen 12 | TritonCombineIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRPass 16 | MLIRTransformUtils 17 | TritonIR 18 | ) 19 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUIR 2 | Dialect.cpp 3 | Types.cpp 4 | 5 | DEPENDS 6 | TritonCPUTableGen 7 | TritonCPUAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | TritonIR 11 | ) 12 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Dialect.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/Triton/IR/Dialect.h" 2 | 3 | #include <numeric> 4 | 5 | #include "mlir/IR/DialectImplementation.h" 6 | #include "mlir/IR/OpImplementation.h" 7 | #include "triton/Analysis/Utility.h" 8 | #include "triton/Dialect/Triton/IR/Utility.h" 9 | #include "triton/Dialect/TritonCPU/IR/Dialect.cpp.inc" 10 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 11 | #include "triton/Tools/Sys/GetEnv.hpp" 12 | #include "llvm/ADT/TypeSwitch.h" 13 | 14 | using namespace mlir; 15 | using namespace mlir::triton::cpu; 16 | 17 | //===----------------------------------------------------------------------===// 18 | // Attribute methods 19 | //===----------------------------------------------------------------------===// 20 | #define GET_ATTRDEF_CLASSES 21 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.cpp.inc" 22 | 23 | void TritonCPUDialect::initialize() { 24 | registerTypes();
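// The addAttributes<>/addOperations<> calls below splice in the attribute
// and op class lists that MLIR TableGen emits into the generated
// TritonCPUAttrDefs.cpp.inc / Ops.cpp.inc files; the GET_*_LIST macros
// select the list section of those generated files.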
25 | 26 | addAttributes< 27 | #define GET_ATTRDEF_LIST 28 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.cpp.inc" 29 | >(); 30 | addOperations< 31 | #define GET_OP_LIST 32 | #include "triton/Dialect/TritonCPU/IR/Ops.cpp.inc" 33 | #include "triton/Dialect/TritonCPU/IR/OpsEnums.cpp.inc" 34 | >(); 35 | } 36 | 37 | // verify TritonCPU ops 38 | LogicalResult TritonCPUDialect::verifyOperationAttribute(Operation *op, 39 | NamedAttribute attr) { 40 | // TODO: fill this. 41 | return success(); 42 | } 43 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Types.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonCPU/IR/Types.h" 2 | #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` 3 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 4 | #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton::cpu; 8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 11 | 12 | Type TokenType::parse(AsmParser &parser) { 13 | if (parser.parseLess()) 14 | return Type(); 15 | 16 | int type = 1; 17 | if (parser.parseInteger(type)) 18 | return Type(); 19 | 20 | if (parser.parseGreater()) 21 | return Type(); 22 | 23 | return TokenType::get(parser.getContext(), type); 24 | } 25 | 26 | void TokenType::print(AsmPrinter &printer) const { 27 | printer << "<" << getType() << ">"; 28 | } 29 | 30 | //===----------------------------------------------------------------------===// 31 | // TritonCPU Dialect 32 | //===----------------------------------------------------------------------===// 33 | void ::mlir::triton::cpu::TritonCPUDialect::registerTypes() { 34 | addTypes< 35 | #define GET_TYPEDEF_LIST 36 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 37 | >(); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUTransforms 2 | 3 | DEPENDS 4 | TritonCPUTransformsIncGen 5 | 6 | LINK_LIBS PUBLIC 7 | MLIRTransforms 8 | MLIRTransformUtils 9 | TritonAnalysis 10 | TritonIR 11 | TritonCPUIR 12 | ) 13 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUIR 2 | Dialect.cpp 3 | Types.cpp 4 | 5 | DEPENDS 6 | TritonGPUTableGen 7 | TritonGPUAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRGPUDialect 11 | TritonIR 12 | ) 13 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/IR/Types.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonGPU/IR/Types.h" 2 | #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` 3 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 4 | #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton::gpu;
8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonGPU/IR/Types.cpp.inc" 11 | 12 | Type TokenType::parse(AsmParser &parser) { 13 | if (parser.parseLess()) 14 | return Type(); 15 | 16 | int type = 1; 17 | if (parser.parseInteger(type)) 18 | return Type(); 19 | 20 | if (parser.parseGreater()) 21 | return Type(); 22 | 23 | return TokenType::get(parser.getContext(), type); 24 | } 25 | 26 | void TokenType::print(AsmPrinter &printer) const { 27 | printer << "<" << getType() << ">"; 28 | } 29 | 30 | //===----------------------------------------------------------------------===// 31 | // TritonGPU Dialect 32 | //===----------------------------------------------------------------------===// 33 | void ::mlir::triton::gpu::TritonGPUDialect::registerTypes() { 34 | addTypes< 35 | #define GET_TYPEDEF_LIST 36 | #include "triton/Dialect/TritonGPU/IR/Types.cpp.inc" 37 | >(); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUTransforms 2 | AccelerateMatmul.cpp 3 | Coalesce.cpp 4 | F32DotTC.cpp 5 | ReduceDataDuplication.cpp 6 | OptimizeDotOperands.cpp 7 | OptimizeThreadLocality.cpp 8 | Pipeliner/MatmulLoopPipeline.cpp 9 | Pipeliner/OuterLoopPipeline.cpp 10 | Pipeliner/PipelineExpander.cpp 11 | Pipeliner/SoftwarePipeliner.cpp 12 | Pipeliner/PipeliningUtility.cpp 13 | Prefetch.cpp 14 | RemoveLayoutConversions.cpp 15 | ReorderInstructions.cpp 16 | Utility.cpp 17 | 18 | DEPENDS 19 | TritonGPUTransformsIncGen 20 | 21 | LINK_LIBS PUBLIC 22 | MLIRTransforms 23 | MLIRTransformUtils 24 | TritonAnalysis 25 | TritonIR 26 | TritonGPUIR 27 | TritonNvidiaGPUIR 28 | ) 29 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 2 | #define TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 3 | 4 | #include "mlir/Dialect/SCF/IR/SCF.h" 5 | #include <vector> 6 | 7 | namespace mlir { 8 | namespace triton { 9 | 10 | static const char *kNumStagesAttrName = "tt.num_stages"; 11 | 12 | /// Function to mask operations during scheduling. 13 | Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred); 14 | 15 | /// Collect SSA dependencies of `op` in `deps`. If `includeArg` is true, 16 | /// continue looking through loop block arguments. 17 | void addDep(Operation *op, DenseSet<Operation *> &deps, bool includeArg = true, 18 | DenseSet<Operation *> *filter = nullptr); 19 | 20 | /// Add operations from `forOp` into a pipeline schedule with the given 21 | /// `stage` when the filter returns true. Operations are added in the 22 | /// original loop order. 23 | void addOps(scf::ForOp forOp, int stage, 24 | std::vector<std::pair<Operation *, unsigned>> &schedule, 25 | std::function<bool(Operation *)> filter);
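// For instance, a scheduler might place all loads in an early stage and the
// rest of the loop body in a later one (an illustrative sketch, not a call
// site from this file):
//
//   std::vector<std::pair<Operation *, unsigned>> schedule;
//   addOps(forOp, 0, schedule,
//          [](Operation *op) { return isa<triton::LoadOp>(op); });
//   addOps(forOp, 2, schedule,
//          [](Operation *op) { return !isa<triton::LoadOp>(op); });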
26 | } // namespace triton 27 | } // namespace mlir 28 | 29 | #endif // TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_ 30 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 2 | #define TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 3 | 4 | #include "PipelineExpander.h" 5 | #include "mlir/Dialect/SCF/IR/SCF.h" 6 | #include "mlir/Support/LLVM.h" 7 | #include "llvm/ADT/ArrayRef.h" 8 | #include <vector> 9 | 10 | namespace mlir { 11 | namespace triton { 12 | 13 | /// This fills out the pipelining options, including the schedule and 14 | /// annotations for wait ops. It also does pre-processing by converting some 15 | /// of the loads into async loads so that the IR is ready to be pipelined. 16 | bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, 17 | mlir::triton::PipeliningOption &options); 18 | 19 | /// Fills out pipelining options for an outer loop pipelining case. This 20 | /// schedules async copies to overlap with the epilogue of a loop. 21 | bool getOuterLoopSchedule(scf::ForOp &forOp, int numStages, 22 | mlir::triton::PipeliningOption &options); 23 | 24 | /// This does post-processing on the pipelined loop to try to pipeline wgmma 25 | /// ops. 26 | // TODO: this should be included as part of the pipeline but currently the 27 | // wgmma wait modeling is problematic. 28 | void asyncLaunchDots(scf::ForOp forOp); 29 | 30 | /// Post-process the pipelined loop by updating the wait ops with the right 31 | /// number of groups in flight.
32 | void updateWaits(ModuleOp module); 33 | 34 | } // namespace triton 35 | } // namespace mlir 36 | #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_ 37 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUIR 2 | Dialect.cpp 3 | Ops.cpp 4 | Types.cpp 5 | 6 | DEPENDS 7 | TritonNvidiaGPUTableGen 8 | TritonNvidiaGPUAttrDefsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | TritonIR 12 | TritonGPUIR 13 | ) 14 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUTransforms 2 | FenceInsertion.cpp 3 | PlanCTA.cpp 4 | 5 | DEPENDS 6 | TritonNvidiaGPUTransformsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | TritonIR 10 | TritonGPUIR 11 | TritonGPUTransforms 12 | TritonNvidiaGPUIR 13 | MLIRTransformUtils 14 | ) 15 | -------------------------------------------------------------------------------- /lib/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonLLVMIR 2 | LLVMDIScope.cpp 3 | LLVMIRBreakPhiStruct.cpp 4 | 5 | DEPENDS 6 | LLVMIRIncGen 7 | 8 | LINK_LIBS 9 | ${CMAKE_DL_LIBS} 10 | PUBLIC 11 | MLIRArithToLLVM 12 | MLIRBuiltinToLLVMIRTranslation 13 | MLIRIndexToLLVM 14 | MLIRIR 15 | MLIRLLVMDialect 16 | MLIRLLVMToLLVMIRTranslation 17 | MLIRNVVMToLLVMIRTranslation 18 | MLIRROCDLToLLVMIRTranslation 19 | MLIRSCFToControlFlow 20 | MLIRSupport 21 | MLIRTargetLLVMIRExport 22 | TritonGPUToLLVM 23 | ) 24 | 25 | set_source_files_properties( 26 | LLVMIRTranslation.cpp 27 | PROPERTIES 28 | COMPILE_FLAGS "-D__BUILD_DIR__=\\\"${CMAKE_BINARY_DIR}\\\"") 29 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/LLVMPasses.h: -------------------------------------------------------------------------------- 1 | #include "llvm/IR/PassManager.h" 2 | #include "llvm/Pass.h" 3 | #include "llvm/Support/CodeGen.h" 4 | 5 | namespace llvm { 6 | 7 | // Pass to pre-process LLVM IR before optimization and break up phi of struct. 8 | // Breaking up those phis into elementary types allows better optimizations 9 | // downstream. 
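// For example, the rewrite turns (a sketch; the checked-in version lives in
// test/LLVMIR/break-phi-struct.ll later in this tree):
//
//   %r = phi {i32, i32} [ %s.2, %true ], [ %s.4, %false ]
//
// into one scalar phi per struct field, fed by extractvalue in the
// predecessors and recombined with insertvalue after the phis.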
10 | struct BreakStructPhiNodesPass : PassInfoMixin<BreakStructPhiNodesPass> { 11 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); 12 | 13 | static StringRef name() { return "BreakStructPhiNodesPass"; } 14 | }; 15 | 16 | } // namespace llvm 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"] 3 | 4 | [tool.yapf] 5 | based_on_style = "pep8" 6 | column_limit = 120 7 | disable_split_list_with_comment = true 8 | each_dict_entry_on_separate_line = false 9 | split_before_named_assigns = false 10 | split_complex_comprehension = true 11 | 12 | [tool.yapfignore] 13 | ignore_patterns = [ 14 | # This exclusion is also specified in .pre-commit-config.yaml. 15 | # - We put it here because if you run yapf directly, we want it to skip the 16 | # file. 17 | # - We also put it in .pre-commit-config because yapf raises an error if 18 | # pre-commit runs it but all of the files it might touch are ignored! 19 | "python/test/unit/language/test_line_info.py" 20 | ] 21 | 22 | [tool.ruff] 23 | line-length = 120 24 | 25 | [tool.ruff.lint] 26 | ignore = ["E501", "E701", "E731", "E741"] 27 | -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/MANIFEST.in -------------------------------------------------------------------------------- /python/examples/copy_strided.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import triton.language as tl 3 | import triton.compiler as tc 4 | 5 | 6 | # triton kernel 7 | @triton.jit 8 | def kernel(X, stride_xm, # 9 | Z, stride_zn, # 10 | BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): 11 | off_m = tl.arange(0, BLOCK_M) 12 | off_n = tl.arange(0, BLOCK_N) 13 | Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * 1 14 | Zs = Z + off_m[:, None] * 1 + off_n[None, :] * stride_zn 15 | tl.store(Zs, tl.load(Xs)) 16 | 17 | 18 | src = tc.ASTSource( 19 | fn=kernel, 20 | constants={"BLOCK_M": 64, "BLOCK_N": 64}, 21 | signature="*fp32,i32,*fp32,i32", 22 | ) 23 | 24 | ret = triton.compile(src) 25 | print(ret.asm["ttgir"]) 26 | 
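# `ret.asm` maps stage names to the IR produced at each compilation stage.
# On an NVIDIA target it typically also carries "ttir", "llir" and "ptx"
# next to "ttgir" (a sketch; the exact key set depends on the active
# backend):
#
#   for name, ir in ret.asm.items():
#       print(name, "->", len(str(ir)), "characters")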
-------------------------------------------------------------------------------- /python/examples/empty.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr): 9 | pass 10 | 11 | 12 | X = torch.randn(1, device="cuda") 13 | pgm = kernel[(1, )](X, 1, 1, BLOCK=1024) 14 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [build-system] 3 | requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"] 4 | 5 | # We're incrementally switching from autopep8 to ruff. 6 | [tool.autopep8] 7 | aggressive = 1 8 | ignore = "E501,E701,E731,W690,W503" 9 | max_line_length = 88 10 | 11 | [tool.ruff] 12 | line-length = 120 13 | 14 | [tool.ruff.lint] 15 | ignore = ["E501", "E701", "E731", "E741"] 16 | -------------------------------------------------------------------------------- /python/src/passes.h: -------------------------------------------------------------------------------- 1 | #define ADD_PASS_WRAPPER_0(name, builder) \ 2 | m.def(name, [](mlir::PassManager &pm) { pm.addPass(builder()); }) 3 | 4 | #define ADD_PASS_WRAPPER_1(name, builder, ty0) \ 5 | m.def(name, \ 6 | [](mlir::PassManager &pm, ty0 val0) { pm.addPass(builder(val0)); }) 7 | 8 | #define ADD_PASS_WRAPPER_2(name, builder, ty0, ty1) \ 9 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1) { \ 10 | pm.addPass(builder(val0, val1)); \ 11 | }) 12 | 13 | #define ADD_PASS_WRAPPER_3(name, builder, ty0, ty1, ty2) \ 14 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) { \ 15 | pm.addPass(builder(val0, val1, val2)); \ 16 | }) 17 | 18 | #define ADD_PASS_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3) \ 19 | m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ 20 | ty3 val3) { pm.addPass(builder(val0, val1, val2, val3)); }) 21 | -------------------------------------------------------------------------------- /python/test/backend/extension_backend.c: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <stdbool.h> 3 | #include <stdint.h> 4 | 5 | static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { 6 | // create a struct to hold device properties 7 | return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", 1024, 8 | "multiprocessor_count", 16, "sm_clock_rate", 2100, 9 | "mem_clock_rate", 2300, "mem_bus_width", 2400); 10 | } 11 | 12 | static PyObject *loadBinary(PyObject *self, PyObject *args) { 13 | // get allocated registers and spilled registers from the function 14 | int n_regs = 0; 15 | int n_spills = 0; 16 | int mod = 0; 17 | int fun = 0; 18 | return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, 19 | n_spills); 20 | } 21 | 22 | static PyMethodDef ModuleMethods[] = { 23 | {"load_binary", loadBinary, METH_VARARGS, 24 | "Load dummy binary for the extension device"}, 25 | {"get_device_properties", getDeviceProperties, METH_VARARGS, 26 | "Get the properties for the extension device"}, 27 | {NULL, NULL, 0, NULL} // sentinel 28 | }; 29 | 30 | static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "ext_utils", 31 | NULL, // documentation 32 | -1, // size 33 | ModuleMethods}; 34 | 35 | PyMODINIT_FUNC PyInit_ext_utils(void) { 36 | PyObject *m = PyModule_Create(&ModuleDef); 37 | if (m == NULL) { 38 | return NULL; 39 | } 40 | PyModule_AddFunctions(m, ModuleMethods); 41 | return m; 42 | } 43 | -------------------------------------------------------------------------------- /python/test/backend/third_party_backends/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--backend", action="store", default="", help="Codegen backend") 8 | 9 | 10 | @pytest.fixture 11 | def cmdopt(request): 12 | return request.config.getoption("--backend") 13 | -------------------------------------------------------------------------------- /python/test/backend/third_party_backends/test_xpu_backend.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | def test_xpu_backend(cmdopt): 8 | if cmdopt == "xpu": 9 | has_ipex = False 10 | try: 11 | # Import IPEX to provide Intel GPU runtime 12 | import intel_extension_for_pytorch # type: ignore # noqa: F401 13 | has_ipex = True if hasattr(torch, "xpu") else False 14 | except Exception: 15 | has_ipex = False 16 | 17 | @triton.jit() 18 | def kernel(x_ptr, y_ptr, out_ptr): 19 | pid = tl.program_id(axis=0) 20 | x = tl.load(x_ptr + pid) 21 | y = tl.load(y_ptr + pid) 22 | out = x + y 23 | tl.store(out_ptr + pid, out) 24 | 25 | if has_ipex: 26 | for _ in range(1000): 27 | x = torch.randn((65536, ), device="xpu", dtype=torch.float32) 28 | y = torch.randn((65536, ), device="xpu", dtype=torch.float32) 29 | z = torch.zeros((65536, ), device="xpu", dtype=torch.float32) 30 | kernel[(65536, )](x, y, z, num_warps=32) 31 | assert torch.all(x + y == z) 32 | else: 33 | return 34 | -------------------------------------------------------------------------------- /python/test/kernel_comparison/kernels.yml: -------------------------------------------------------------------------------- 1 | name_and_extension: 2 | - name: _kernel_0d1d2d3de4de5de6c7de8de9c10de11c 3 | extension: ptx 4 | - name: _kernel_0d1d2d3de4de5de6de7c8de9c10de11c 5 | extension: ptx 6 | - name: _kernel_0d1d2d345de6c789c1011c 7 | extension: ptx 8 | - name: _kernel_0d1d2d3456c789c1011c 9 | extension: ptx 10 | - name: _kernel_0d1d2d3de4de5de6c7de8c9de10de11c 11 | extension: ptx 12 | - name: _kernel_0d1d2d34567c8c91011c 13 | extension: ptx 14 | - name: _kernel_0d1d2d3456c78c91011c 15 | extension: ptx 16 | - name: _kernel_0d1d2d3de4de5de6de7c8c9de10de11c 17 | extension: ptx 18 | - name: _kernel_0d1d2d34567c89c1011c 19 | extension: ptx 20 | - name: _kernel_0d1d2d345de6de7c89c1011c 21 | extension: ptx 22 | - name: _kernel_0d1d2d345de6de7c8c9de1011c 23 | extension: ptx 24 | - name: kernel_0d1d2de 25 | extension: ptx 26 | - name: _kernel_0d1d2d345de6c78c9de1011c 27 | extension: ptx 28 | - name: _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11de12de13de14de15c16de17de18de19c20de21de22de23c2425de26de 29 | extension: ptx 30 | - name: _fwd_kernel_0d1d2d34d5d6de7de8de9c10de11de12de13c14de15de16de17c18de19de20de21c2223de24de 31 | extension: ptx 32 | - name: _bwd_preprocess_0d1d2d 33 | extension: ptx 34 | -------------------------------------------------------------------------------- /python/test/unit/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--device", action="store", default='cuda') 8 | 9 | 10 | @pytest.fixture 11 | def device(request): 12 | return request.config.getoption("--device") 13 | -------------------------------------------------------------------------------- /python/test/unit/hopper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/test/unit/hopper/__init__.py -------------------------------------------------------------------------------- /python/test/unit/language/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | 4 | def pytest_configure(config): 5 | config.addinivalue_line("markers", "interpreter: indicate 
whether interpreter supports the test") 6 | -------------------------------------------------------------------------------- /python/test/unit/language/test_annotations.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | import pytest 6 | 7 | 8 | def annotated_function(return_type=None, **arg_types): 9 | """A decorator to add annotations to a function.""" 10 | 11 | def decorator(func): 12 | func.__annotations__ = {**arg_types, 'return': return_type} 13 | return func 14 | 15 | return decorator 16 | 17 | 18 | # Test integer annotations 19 | @pytest.mark.parametrize(("signed", "width"), [ 20 | (signed, width) for signed in [False, True]\ 21 | for width in [8, 16, 32, 64] 22 | ] + [(False, 1)] 23 | ) 24 | def test_int_annotation(signed, width, device): 25 | 26 | @triton.jit 27 | @annotated_function(X=torch.tensor, v=f"tl.{'' if signed else 'u'}int{width}") 28 | def _kernel(X, v): 29 | tl.store(X, v) 30 | 31 | h = _kernel[(1, )](torch.empty(1, device=device), 3) 32 | pfx = 'si' if signed else 'ui' 33 | assert f'%arg1: i{width}' in h.asm["ttir"] 34 | assert f'arith.{pfx}tofp' in h.asm["ttir"] 35 | 36 | 37 | # Test that unknown annotations do not emit an error 38 | def test_unknown_annotation(device): 39 | 40 | @triton.jit 41 | def _kernel(X: torch.Tensor, N: int, BLOCK_SIZE: tl.constexpr): 42 | pass 43 | 44 | x = torch.empty(1, device=device) 45 | _kernel[(1, )](x, x.shape[0], 32) 46 | try: 47 | _kernel[(1, )](x.shape[0], x.shape[0], 32) 48 | except AttributeError: 49 | pass 50 | -------------------------------------------------------------------------------- /python/test/unit/language/test_decorator.py: -------------------------------------------------------------------------------- 1 | import triton 2 | import pytest 3 | 4 | 5 | def test_decorator_with_def(device): 6 | 7 | def triton_heuristics_pointwise(**kwargs): 8 | 9 | def decorator(func): 10 | return func 11 | 12 | return decorator 13 | 14 | # "def" might appear in a decorator call, e.g. a hash string argument. 15 | # This test makes sure the compiler can find the right position of function 16 | # definition. 
17 | @triton_heuristics_pointwise(inductor_meta={'backend_hash': 'def0aeffabe53b3f8'}, ) 18 | @triton.jit 19 | def kernel(): 20 | pass 21 | 22 | try: 23 | triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={})) 24 | except Exception as e: 25 | pytest.fail(f"triton compile failed with error: {e}") 26 | -------------------------------------------------------------------------------- /python/test/unit/language/test_reproducer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | import torch 7 | import triton 8 | import re 9 | 10 | 11 | @triton.jit 12 | def triton_(): 13 | return 14 | 15 | 16 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") 17 | def test_reproducer(): 18 | tmpdir = ".tmp" 19 | reproducer = 'triton-reproducer.mlir' 20 | if os.path.exists(tmpdir): 21 | shutil.rmtree(tmpdir, ignore_errors=True) 22 | if os.path.exists(reproducer): 23 | os.remove(reproducer) 24 | os.environ["TRITON_CACHE_DIR"] = tmpdir 25 | os.environ["TRITON_REPRODUCER_PATH"] = reproducer 26 | triton_[(1, )]() 27 | foundPipeline = "" 28 | with open(reproducer, 'r') as f: 29 | line = f.read() 30 | if 'pipeline:' in line: 31 | foundPipeline = line 32 | if 0 == len(foundPipeline): 33 | raise Exception("Failed to find pipeline info in reproducer file.") 34 | 35 | ttgir_to_llvm_pass = re.compile("convert-triton-.*gpu-to-llvm") 36 | if not ttgir_to_llvm_pass.search(foundPipeline): 37 | raise Exception("Failed to find triton passes in pipeline") 38 | # cleanup 39 | if os.path.exists(tmpdir): 40 | shutil.rmtree(tmpdir, ignore_errors=True) 41 | if os.path.exists(reproducer): 42 | os.remove(reproducer) 43 | -------------------------------------------------------------------------------- /python/test/unit/operators/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | 4 | def pytest_configure(config): 5 | config.addinivalue_line("markers", "interpreter: indicate whether interpreter supports the test") 6 | -------------------------------------------------------------------------------- /python/test/unit/operators/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | import triton 5 | import triton.ops 6 | 7 | 8 | @pytest.mark.parametrize("M, N, dtype, mode", [ # 9 | (M, N, dtype, mode) 10 | for M in [1024, 821] 11 | for N in [512, 857, 1871, 2089, 8573, 31000] 12 | for dtype in ['float16', 'float32'] 13 | for mode in ['forward', 'backward'] 14 | ]) 15 | def test_op(M, N, dtype, mode): 16 | capability = torch.cuda.get_device_capability() 17 | if capability[0] < 8 and dtype == "bfloat16": 18 | pytest.skip("Only test bfloat16 on devices with sm >= 80") 19 | dtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[dtype] 20 | # create inputs 21 | x = torch.randn(M, N, dtype=dtype, device='cuda', requires_grad=True) 22 | idx = 4 + torch.ones(M, dtype=torch.int64, device='cuda') 23 | # forward pass 24 | tt_y = triton.ops.cross_entropy(x, idx) 25 | th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx) 26 | if mode == 'forward': 27 | torch.testing.assert_close(th_y, tt_y) 28 | # backward pass 29 | elif mode == 'backward': 30 | dy = torch.randn_like(tt_y) 31 | # triton backward 32 | tt_y.backward(dy) 33 | tt_dx = x.grad.clone() 34 | # torch backward 35 | x.grad = None 36 | 
th_y.backward(dy) 37 | th_dx = x.grad.clone() 38 | if dtype == torch.float16: 39 | torch.testing.assert_close(th_dx, tt_dx, rtol=0.001, atol=0.001) 40 | else: 41 | torch.testing.assert_close(th_dx, tt_dx) 42 | -------------------------------------------------------------------------------- /python/test/unit/runtime/test_driver.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import triton 4 | 5 | 6 | def test_is_lazy(): 7 | from importlib import reload 8 | reload(sys.modules["triton.runtime.driver"]) 9 | reload(sys.modules["triton.runtime"]) 10 | mod = sys.modules[triton.runtime.driver.__module__] 11 | assert isinstance(triton.runtime.driver.active, getattr(mod, "LazyProxy")) 12 | assert triton.runtime.driver.active._obj is None 13 | utils = triton.runtime.driver.active.utils # noqa: F841 14 | assert issubclass(triton.runtime.driver.active._obj.__class__, getattr(triton.backends.driver, "DriverBase")) 15 | -------------------------------------------------------------------------------- /python/test/unit/runtime/test_jit.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pytest 3 | import torch 4 | 5 | import triton 6 | import triton.language as tl 7 | 8 | 9 | def test_pre_call_hooks(): 10 | 11 | @triton.jit 12 | def add_kernel( 13 | in_ptr0, 14 | in_ptr1, 15 | out_ptr, 16 | n_elements, 17 | BLOCK_SIZE: "tl.constexpr", 18 | ): 19 | pid = tl.program_id(axis=0) 20 | block_start = pid * BLOCK_SIZE 21 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 22 | mask = offsets < n_elements 23 | x = tl.load(in_ptr0 + offsets, mask=mask) 24 | y = tl.load(in_ptr1 + offsets, mask=mask) 25 | output = x + y 26 | tl.store(out_ptr + offsets, output, mask=mask) 27 | 28 | class MyTensor(torch.Tensor): 29 | pass 30 | 31 | def my_hook(*args, **kwargs): 32 | for arg in itertools.chain(args, kwargs.values()): 33 | if isinstance(arg, MyTensor): 34 | raise Exception("MyTensor is not allowed") 35 | 36 | add_kernel.add_pre_run_hook(my_hook) 37 | 38 | x = torch.randn(4).cuda() 39 | y = MyTensor(x) 40 | out = torch.zeros_like(x) 41 | with pytest.raises(Exception): 42 | add_kernel[(4, )](x, y, out, 4, 4) 43 | -------------------------------------------------------------------------------- /python/triton/_C/include: -------------------------------------------------------------------------------- 1 | ../../../include/ -------------------------------------------------------------------------------- /python/triton/__init__.py: -------------------------------------------------------------------------------- 1 | """isort:skip_file""" 2 | __version__ = '3.0.0' 3 | 4 | # --------------------------------------- 5 | # Note: import order is significant here. 6 | 7 | # submodules 8 | from .runtime import ( 9 | autotune, 10 | Config, 11 | heuristics, 12 | JITFunction, 13 | KernelInterface, 14 | reinterpret, 15 | TensorWrapper, 16 | OutOfResources, 17 | InterpreterError, 18 | MockTensor, 19 | ) 20 | from .runtime.jit import jit 21 | from .compiler import compile, CompilationError 22 | from .errors import TritonError 23 | 24 | from . import language 25 | from . import testing 26 | from . 
import tools 27 | 28 | __all__ = [ 29 | "autotune", 30 | "cdiv", 31 | "CompilationError", 32 | "compile", 33 | "Config", 34 | "heuristics", 35 | "impl", 36 | "InterpreterError", 37 | "jit", 38 | "JITFunction", 39 | "KernelInterface", 40 | "language", 41 | "MockTensor", 42 | "next_power_of_2", 43 | "ops", 44 | "OutOfResources", 45 | "reinterpret", 46 | "runtime", 47 | "TensorWrapper", 48 | "TritonError", 49 | "testing", 50 | "tools", 51 | ] 52 | 53 | # ------------------------------------- 54 | # misc. utilities that don't fit well 55 | # into any specific module 56 | # ------------------------------------- 57 | 58 | 59 | def cdiv(x: int, y: int): 60 | return (x + y - 1) // y 61 | 62 | 63 | def next_power_of_2(n: int): 64 | """Return the smallest power of 2 greater than or equal to n""" 65 | n -= 1 66 | n |= n >> 1 67 | n |= n >> 2 68 | n |= n >> 4 69 | n |= n >> 8 70 | n |= n >> 16 71 | n |= n >> 32 72 | n += 1 73 | return n 74 | -------------------------------------------------------------------------------- /python/triton/backends/driver.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractclassmethod 2 | 3 | 4 | class DriverBase(metaclass=ABCMeta): 5 | 6 | @abstractclassmethod 7 | def is_active(self): 8 | pass 9 | 10 | @abstractmethod 11 | def get_current_target(self): 12 | pass 13 | 14 | def __init__(self) -> None: 15 | pass 16 | 17 | 18 | class GPUDriver(DriverBase): 19 | 20 | def __init__(self): 21 | # TODO: support other frameworks than torch 22 | import torch 23 | self.get_device_capability = torch.cuda.get_device_capability 24 | try: 25 | from torch._C import _cuda_getCurrentRawStream 26 | self.get_current_stream = _cuda_getCurrentRawStream 27 | except ImportError: 28 | self.get_current_stream = lambda idx: torch.cuda.current_stream(idx).cuda_stream 29 | self.get_current_device = torch.cuda.current_device 30 | self.set_current_device = torch.cuda.set_device 31 | 32 | # TODO: remove once TMA is cleaned up 33 | def assemble_tensormap_to_arg(self, tensormaps_info, args): 34 | return args 35 | 36 | 37 | class CPUDriverBase(DriverBase): 38 | 39 | def __init__(self): 40 | # Right now, we just provide dummy functions. 41 | # TODO: Consider better engineering the code only intended for GPU in jit.py. 
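        # With these stubs, any concrete subclass (one that also implements
        # the abstract is_active/get_current_target methods) makes the
        # GPU-oriented launch path degenerate to harmless defaults:
        # capability (0, 0), stream 0, device 0, and a no-op
        # set_current_device.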
42 | self.get_device_capability = lambda idx: (0, 0) 43 | self.get_current_stream = lambda idx: 0 44 | self.get_current_device = lambda: 0 45 | self.set_current_device = lambda idx: None 46 | -------------------------------------------------------------------------------- /python/triton/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .compiler import CompiledKernel, ASTSource, compile, AttrsDescriptor, make_backend, LazyDict 2 | from .errors import CompilationError 3 | 4 | __all__ = ["compile", "make_backend", "ASTSource", "AttrsDescriptor", "CompiledKernel", "CompilationError", "LazyDict"] 5 | -------------------------------------------------------------------------------- /python/triton/compiler/make_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/triton/compiler/make_launcher.py -------------------------------------------------------------------------------- /python/triton/errors.py: -------------------------------------------------------------------------------- 1 | """Base class for all errors raised by Triton""" 2 | 3 | 4 | class TritonError(Exception): 5 | ... 6 | -------------------------------------------------------------------------------- /python/triton/language/extra/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cuda 2 | 3 | __all__ = ['cuda'] 4 | -------------------------------------------------------------------------------- /python/triton/language/extra/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | from . import libdevice 2 | 3 | from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80) 4 | 5 | __all__ = [ 6 | "libdevice", "globaltimer", "num_threads", "num_warps", "smid", "convert_custom_float8_sm70", 7 | "convert_custom_float8_sm80" 8 | ] 9 | -------------------------------------------------------------------------------- /python/triton/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # from .conv import _conv, conv 2 | from . 
import blocksparse 3 | from .cross_entropy import _cross_entropy, cross_entropy 4 | from .flash_attention import attention 5 | from .matmul import _matmul, get_higher_dtype, matmul 6 | 7 | __all__ = ["blocksparse", "_cross_entropy", "cross_entropy", "_matmul", "matmul", "attention", "get_higher_dtype"] 8 | -------------------------------------------------------------------------------- /python/triton/ops/blocksparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .matmul import matmul 2 | from .softmax import softmax 3 | 4 | __all__ = [ 5 | "matmul", 6 | "softmax", 7 | ] 8 | -------------------------------------------------------------------------------- /python/triton/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics) 2 | from .cache import RedisRemoteCacheBackend, RemoteCacheBackend 3 | from .driver import driver 4 | from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret 5 | from .errors import OutOfResources, InterpreterError 6 | 7 | __all__ = [ 8 | "autotune", 9 | "Autotuner", 10 | "Config", 11 | "driver", 12 | "Heuristics", 13 | "heuristics", 14 | "InterpreterError", 15 | "JITFunction", 16 | "KernelInterface", 17 | "MockTensor", 18 | "OutOfResources", 19 | "RedisRemoteCacheBackend", 20 | "reinterpret", 21 | "RemoteCacheBackend", 22 | "TensorWrapper", 23 | ] 24 | -------------------------------------------------------------------------------- /python/triton/runtime/errors.py: -------------------------------------------------------------------------------- 1 | from ..errors import TritonError 2 | from typing import Optional 3 | 4 | 5 | class InterpreterError(TritonError): 6 | 7 | def __init__(self, error_message: Optional[str] = None): 8 | self.error_message = error_message 9 | 10 | def __str__(self) -> str: 11 | return self.error_message or "" 12 | 13 | 14 | class OutOfResources(TritonError): 15 | 16 | def __init__(self, required, limit, name): 17 | self.required = required 18 | self.limit = limit 19 | self.name = name 20 | 21 | def __str__(self) -> str: 22 | return f"out of resource: {self.name}, Required: {self.required}, Hardware limit: {self.limit}. Reducing block sizes or `num_stages` may help." 
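    # A typical consumer catches this around a kernel launch and retries with
    # a smaller configuration, e.g. (a sketch; `kernel`, `grid`, `args` and
    # `retry_with_smaller_config` are placeholders, not real API):
    #
    #   try:
    #       kernel[grid](*args, num_warps=8)
    #   except OutOfResources as e:
    #       retry_with_smaller_config(e.required, e.limit, e.name)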
23 | 24 | def __reduce__(self): 25 | # this is necessary to make OutOfResources picklable 26 | return (type(self), (self.required, self.limit, self.name)) 27 | -------------------------------------------------------------------------------- /python/triton/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/python/triton/tools/__init__.py -------------------------------------------------------------------------------- /python/triton/tools/compile.h: -------------------------------------------------------------------------------- 1 | #ifndef TT_KERNEL_INCLUDES 2 | #define TT_KERNEL_INCLUDES 3 | 4 | #include <cuda.h> 5 | #include <inttypes.h> 6 | #include <stdint.h> 7 | #include <stdio.h> 8 | 9 | #endif 10 | 11 | void unload_{kernel_name}(void); 12 | void load_{kernel_name}(void); 13 | // tt-linker: {kernel_name}:{full_signature}:{algo_info} 14 | CUresult{_placeholder} {kernel_name}(CUstream stream, {signature}); 15 | -------------------------------------------------------------------------------- /python/tutorials/README.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. 5 | 6 | To install the dependencies for the tutorials: 7 | 8 | .. code-block:: bash 9 | 10 | cd triton 11 | pip install -e './python[tutorials]' 12 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(lib) 2 | 3 | llvm_canonicalize_cmake_booleans( 4 | MLIR_ENABLE_BINDINGS_PYTHON 5 | ) 6 | 7 | configure_lit_site_cfg( 8 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in 9 | ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py 10 | MAIN_CONFIG 11 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py 12 | ) 13 | 14 | set(TRITON_TEST_DEPENDS 15 | triton-opt 16 | ) 17 | 18 | set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck") 19 | set(LIT_ARGS "-Dfilecheck=${FILECHECK_PATH}") 20 | add_lit_testsuite(check-triton-lit-tests "Running the triton regression tests" 21 | ${CMAKE_CURRENT_BINARY_DIR} 22 | ARGS ${LIT_ARGS} 23 | DEPENDS ${TRITON_TEST_DEPENDS} 24 | ) 25 | 26 | set_target_properties(check-triton-lit-tests PROPERTIES FOLDER "Tests") 27 | 28 | add_lit_testsuites(TRITON-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS}) 29 | -------------------------------------------------------------------------------- /test/Conversion/amd/decompose-unsupported-conversions.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --split-input-file --decompose-unsupported-amd-conversions | FileCheck %s 2 | 3 | // CHECK: #[[BLOCKED:.+]] = #triton_gpu.blocked<{{.*}}> 4 | // CHECK: #[[WMMA:.+]] = #triton_gpu.amd_wmma<{{.*}}> 5 | // CHECK: #[[SHARED:.+]] = #triton_gpu.shared<{{.*}}> 6 | #mma = #triton_gpu.amd_wmma<{warpsPerCTA = [2, 2]}> 7 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 8 | tt.func @wmma_to_wmma_dot_op(%arg0: tensor<16x16xf16, #mma>) { 9 | // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<16x16xf16, #[[WMMA]]> -> 
tensor<16x16xf16, #[[BLOCKED]]> 10 | // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<16x16xf16, #[[SHARED]]> 11 | // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[WMMA]]}>> 12 | %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> 13 | tt.return 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /test/Conversion/amd/fp_to_fp.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm | FileCheck %s 2 | 3 | // CHECK-LABEL: f16_to_f32 4 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> 5 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | tt.func @f16_to_f32(%arg0: tensor<8x8xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>) { 7 | // CHECK-COUNT-8: llvm.inline_asm asm_dialect {{.*}}v_cvt_f32_f16 {{.*}}: (f16) -> f32 8 | %0 = tt.fp_to_fp %arg0 : tensor<8x8xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> -> tensor<8x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> 9 | tt.return 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /test/Conversion/divide-by-0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm --cse | FileCheck %s 2 | 3 | // CHECK-LABEL: dont_divide_0 4 | // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32 5 | // CHECK-NOT: llvm.urem %{{.*}}, %[[C0]] 6 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> 7 | #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 8]}> 8 | module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 9 | tt.func public @dont_divide_0() attributes {noinline = false} { 10 | %zero = arith.constant dense<0.000000e+00> : tensor<16x1xf32, #mma> 11 | %cvt = triton_gpu.convert_layout %zero : tensor<16x1xf32, #mma> -> tensor<16x1xf32, #blocked> 12 | tt.return 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/Conversion/tritongpu_to_llvm_volta.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=compute-capability=70 2>&1 | FileCheck %s 2 | 3 | #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> 4 | // CHECK-LABEL: clamp 5 | module attributes {"triton_gpu.compute-capability" = 70 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | tt.func public @clamp(%x : tensor<1024xf32, #blocked>, %limit 
: tensor<1024xf32, #blocked>) attributes {noinline = false} { 7 | %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> 8 | %neg_limit = arith.subf %cst, %limit : tensor<1024xf32, #blocked> 9 | 10 | // CHECK: llvm.fcmp "une" %[[REG:[a-zA-Z0-9]+]], %[[REG]] 11 | // CHECK-NEXT: llvm.intr.maxnum 12 | // CHECK-NEXT: llvm.intr.minnum 13 | // CHECK-NEXT: llvm.mlir.constant 14 | // CHECK-NEXT: llvm.select 15 | %12 = tt.clampf %x, %neg_limit, %limit, propagateNan = all : tensor<1024xf32, #blocked> 16 | tt.return 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /test/LLVMIR/break-phi-struct.ll: -------------------------------------------------------------------------------- 1 | ; RUN: triton-llvm-opt -break-struct-phi-nodes %s | FileCheck %s 2 | 3 | ; CHECK-LABEL: struct 4 | define {i32, i32} @struct(i1 %c) { 5 | ; CHECK: br i1 %{{.*}}, label [[TRUE:%.*]], label [[FALSE:%.*]] 6 | br i1 %c, label %true, label %false 7 | 8 | true: 9 | %s.1 = insertvalue {i32, i32} undef, i32 20, 0 10 | %s.2 = insertvalue {i32, i32} %s.1, i32 200, 1 11 | 12 | ; CHECK-DAG: [[E0:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 13 | ; CHECK-DAG: [[E1:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 14 | ; CHECK: br 15 | br label %exit 16 | 17 | false: 18 | %s.3 = insertvalue {i32, i32} undef, i32 30, 0 19 | %s.4 = insertvalue {i32, i32} %s.3, i32 300, 1 20 | ; CHECK-DAG: [[E2:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 21 | ; CHECK-DAG: [[E3:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 22 | ; CHECK: br 23 | br label %exit 24 | 25 | exit: 26 | ; CHECK-DAG: [[PHI0:%.*]] = phi i32 [ [[E0]], [[TRUE]] ], [ [[E2]], [[FALSE]] ] 27 | ; CHECK-DAG: [[PHI1:%.*]] = phi i32 [ [[E1]], [[TRUE]] ], [ [[E3]], [[FALSE]] ] 28 | ; CHECK: [[S0:%.*]] = insertvalue { i32, i32 } undef, i32 [[PHI0]], 0 29 | ; CHECK: [[S1:%.*]] = insertvalue { i32, i32 } [[S0]], i32 [[PHI1]], 1 30 | ; CHECK: ret { i32, i32 } [[S1]] 31 | %r = phi {i32, i32} [ %s.2, %true], [ %s.4, %false ] 32 | ret {i32, i32} %r 33 | } 34 | -------------------------------------------------------------------------------- /test/NVGPU/test_cga.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --convert-nv-gpu-to-llvm | FileCheck %s 2 | #SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> 3 | module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 2 : i32} { 4 | tt.func @test_mbarrier() { 5 | %ptr = llvm.mlir.zero : !llvm.ptr<3> 6 | 7 | // CHECK: llvm.inline_asm 8 | %v = nvgpu.cluster_id 9 | llvm.store %v, %ptr : i32, !llvm.ptr<3> 10 | 11 | tt.return 12 | } 13 | } // end module 14 | -------------------------------------------------------------------------------- /test/Triton/reproducer.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --verify-diagnostics --dump-pass-pipeline --run-reproducer %s 2>&1 | FileCheck %s 2 | 3 | module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 4 | tt.func public @triton__() attributes {noinline = false} { 5 | tt.return 6 | } 7 | } 8 | 9 | {-# 10 | external_resources: { 11 | mlir_reproducer: { 12 | pipeline: 
"builtin.module(any(convert-scf-to-cf,convert-index-to-llvm{index-bitwidth=0},convert-triton-gpu-to-llvm{compute-capability=90},convert-nv-gpu-to-llvm,convert-arith-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=true test-convergence=false top-down=true},cse,symbol-dce,enable-line-info))", 13 | disable_threading: false, 14 | verify_each: false 15 | } 16 | } 17 | #-} 18 | 19 | // CHECK: Pass Manager with 20 | // CHECK-NEXT: convert-triton-gpu-to-llvm 21 | -------------------------------------------------------------------------------- /test/Triton/verify-make-range.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s --verify-diagnostics 2 | 3 | tt.func public @i64_tensor() { 4 | // expected-error @+1 {{i32 elements}} 5 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16xi64> 6 | tt.return 7 | } 8 | 9 | // ----- 10 | tt.func public @i32_scalar() { 11 | // expected-error @+1 {{invalid kind of type}} 12 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : i32 13 | tt.return 14 | } 15 | 16 | // ----- 17 | tt.func public @_2d_tensor() { 18 | // expected-error @+1 {{must be a 1D tensor}} 19 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16x1xi32> 20 | tt.return 21 | } 22 | 23 | // ----- 24 | tt.func public @bad_start_end() { 25 | // expected-error @+1 {{start must be less than or equal to end}} 26 | %a = tt.make_range { start = 0 : i32, end = -16 : i32 } : tensor<16xi32> 27 | tt.return 28 | } 29 | 30 | // ----- 31 | tt.func public @bad_num_elems() { 32 | // expected-error @+1 {{number of elements}} 33 | %a = tt.make_range { start = 0 : i32, end = 32 : i32 } : tensor<16xi32> 34 | tt.return 35 | } 36 | -------------------------------------------------------------------------------- /test/TritonGPU/ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s | FileCheck %s 2 | 3 | // CHECK: #[[$WMMA:.*]] = #triton_gpu.amd_wmma 4 | #blocked = #triton_gpu.blocked<{sizePerThread = [2, 2], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> 5 | 6 | module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 7 | // CHECK-LABEL: wmma_layout 8 | tt.func @wmma_layout(%0: tensor<16x16xf16, #blocked>) { 9 | %1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>> 10 | // CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #{{.+}}> -> tensor<16x16xf16, #[[$WMMA]]> 11 | tt.return 12 | } 13 | 14 | // CHECK-LABEL: wmma_dot_op_layout 15 | tt.func @wmma_dot_op_layout(%0: tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>>) { 16 | %1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>}>> 17 | // CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #{{.+}}}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[$WMMA]]}>> 18 | tt.return 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
/test/TritonGPU/reduce-data-duplication.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file -tritongpu-reduce-data-duplication | FileCheck %s 2 | 3 | // CHECK: #[[SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false} 4 | // CHECK: apply_swizzle 5 | // CHECK: %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[SHARED]]> 6 | 7 | #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> 8 | #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}> 9 | module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 10 | tt.func @apply_swizzle(%arg0: tensor<16x256xf16, #blocked>) { 11 | %0 = triton_gpu.convert_layout %arg0 : tensor<16x256xf16, #blocked> -> tensor<16x256xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> 12 | tt.return 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/TritonGPU/tritongpu_ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s | triton-opt | FileCheck %s 2 | 3 | #shared0 = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], hasLeadingOffset=true}> 4 | 5 | module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { 6 | // CHECK-LABEL: memdesc 7 | // CHECK-SAME: !tt.memdesc<1x64x16xf16, #{{.+}}> 8 | tt.func @memdesc(%d : !tt.memdesc<1x64x16xf16, #shared0>) { 9 | tt.return 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TritonTestAnalysis 2 | TestAlias.cpp 3 | TestAxisInfo.cpp 4 | TestAllocation.cpp 5 | TestMembar.cpp 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRPass 9 | ${triton_libs} 10 | ) 11 | -------------------------------------------------------------------------------- /test/lib/Analysis/TestAxisInfo.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/Pass/Pass.h" 2 | #include "triton/Analysis/AxisInfo.h" 3 | #include "triton/Analysis/Utility.h" 4 | 5 | using namespace mlir; 6 | using namespace mlir::triton; 7 | 8 | namespace { 9 | 10 | struct TestAxisInfoPass 11 | : public PassWrapper<TestAxisInfoPass, OperationPass<ModuleOp>> { 12 | 13 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAxisInfoPass); 14 | 15 | StringRef getArgument() const final { return "test-print-alignment"; } 16 | StringRef getDescription() const final { 17 | return "print the result of the alignment analysis pass"; 18 | } 19 | 20 | void runOnOperation() override { 21 | Operation *operation = getOperation(); 22 | ModuleOp moduleOp = cast<ModuleOp>(operation); 23 | ModuleAxisInfoAnalysis moduleAxisInfoAnalysis(moduleOp); 24 | moduleOp.walk([&](FuncOp funcOp) { 25 | auto &os = llvm::errs(); 26 | auto opName = SymbolTable::getSymbolName(funcOp).getValue().str(); 27 | os << "@" << opName << "\n"; 28 | funcOp.walk([&](Operation *op) { 29 | if (op->getNumResults() < 1) 30 | return; 31 |
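// The loop below prints each SSA result followed by its axis-info lattice
// (contiguity, divisibility, and constancy per dimension); this is the
// output the `test-print-alignment` FileCheck tests match against.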
for (Value result : op->getResults()) { 32 | result.print(os); 33 | os << " => "; 34 | auto *axisInfo = moduleAxisInfoAnalysis.getAxisInfo(result); 35 | if (axisInfo) 36 | axisInfo->print(os); 37 | os << "\n"; 38 | } 39 | }); 40 | }); 41 | } 42 | }; 43 | 44 | } // namespace 45 | 46 | namespace mlir { 47 | namespace test { 48 | void registerTestAlignmentPass() { PassRegistration<TestAxisInfoPass>(); } 49 | } // namespace test 50 | } // namespace mlir 51 | -------------------------------------------------------------------------------- /test/lib/Analysis/TestMembar.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" 2 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" 3 | #include "mlir/IR/Dialect.h" 4 | #include "mlir/Pass/Pass.h" 5 | #include "mlir/Transforms/DialectConversion.h" 6 | #include "triton/Analysis/Allocation.h" 7 | #include "triton/Analysis/Membar.h" 8 | 9 | using namespace mlir; 10 | 11 | namespace { 12 | 13 | struct TestMembarPass 14 | : public PassWrapper<TestMembarPass, OperationPass<ModuleOp>> { 15 | 16 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass); 17 | 18 | StringRef getArgument() const final { return "test-print-membar"; } 19 | StringRef getDescription() const final { 20 | return "print the result of the membar analysis pass"; 21 | } 22 | 23 | void runOnOperation() override { 24 | Operation *operation = getOperation(); 25 | ModuleOp moduleOp = cast<ModuleOp>(operation); 26 | // Print all ops after membar pass 27 | ModuleAllocation allocation(moduleOp); 28 | ModuleMembarAnalysis membarPass(&allocation); 29 | membarPass.run(); 30 | } 31 | }; 32 | 33 | } // namespace 34 | 35 | namespace mlir { 36 | namespace test { 37 | void registerTestMembarPass() { PassRegistration<TestMembarPass>(); } 38 | } // namespace test 39 | } // namespace mlir 40 | -------------------------------------------------------------------------------- /test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | -------------------------------------------------------------------------------- /test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | @LIT_SITE_CFG_IN_HEADER@ 2 | 3 | import sys 4 | 5 | config.triton_obj_root = "@TRITON_BINARY_DIR@" 6 | config.llvm_src_root = "@LLVM_SOURCE_DIR@" 7 | config.llvm_obj_root = "@LLVM_BINARY_DIR@" 8 | config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" 9 | config.llvm_lib_dir = "@LLVM_LIBS_DIR@" 10 | config.llvm_shlib_dir = "@SHLIBDIR@" 11 | config.llvm_shlib_ext = "@SHLIBEXT@" 12 | config.llvm_exe_ext = "@EXEEXT@" 13 | config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" 14 | config.mlir_binary_dir = "@MLIR_BINARY_DIR@" 15 | config.python_executable = "@Python3_EXECUTABLE@" 16 | config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ 17 | 18 | 19 | import lit.llvm 20 | lit.llvm.initialize(lit_config, config) 21 | 22 | # Let the main config do the real work 23 | lit_config.load_config(config, "@TRITON_SOURCE_DIR@/test/lit.cfg.py") 24 | -------------------------------------------------------------------------------- /third_party/amd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonAMD ${CMAKE_CURRENT_SOURCE_DIR}/python/triton_amd.cc
LINK_LIBS TritonAMDGPUToLLVM TritonAMDGPUTransforms) 7 | endif() 8 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/amd_hip_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 8 | of the Software, and to permit persons to whom the Software is furnished to do 9 | so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 24 | #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 25 | 26 | #if defined(__clang__) && defined(__HIP__) 27 | #define __HIP_CLANG_ONLY__ 1 28 | #else 29 | #define __HIP_CLANG_ONLY__ 0 30 | #endif 31 | 32 | #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H 33 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/concepts.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | namespace hip_impl // Documentation only. 26 | { 27 | #define requires(...) 
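// Note: this header is documentation only. `requires(...)` expands to
// nothing, so concept-style annotations in the HIP headers compile away on
// toolchains without C++20 concepts.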
28 | 29 | #define FunctionalProcedure typename 30 | } // namespace hip_impl 31 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/grid_launch.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "grid_launch.h" 4 | #include "hc.hpp" 5 | 6 | class grid_launch_parm_cxx : public grid_launch_parm 7 | { 8 | public: 9 | grid_launch_parm_cxx() = default; 10 | 11 | // customized serialization: don't need av and cf in kernel 12 | __attribute__((annotate("serialize"))) 13 | void __cxxamp_serialize(Kalmar::Serialize& s) const { 14 | s.Append(sizeof(int), &grid_dim.x); 15 | s.Append(sizeof(int), &grid_dim.y); 16 | s.Append(sizeof(int), &grid_dim.z); 17 | s.Append(sizeof(int), &group_dim.x); 18 | s.Append(sizeof(int), &group_dim.y); 19 | s.Append(sizeof(int), &group_dim.z); 20 | } 21 | 22 | __attribute__((annotate("user_deserialize"))) 23 | grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z, 24 | int group_dim_x, int group_dim_y, int group_dim_z) { 25 | grid_dim.x = grid_dim_x; 26 | grid_dim.y = grid_dim_y; 27 | grid_dim.z = grid_dim_z; 28 | group_dim.x = group_dim_x; 29 | group_dim.y = group_dim_y; 30 | group_dim.z = group_dim_z; 31 | } 32 | }; 33 | 34 | 35 | extern inline void grid_launch_init(grid_launch_parm *lp) { 36 | lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1; 37 | 38 | lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1; 39 | 40 | lp->dynamic_group_mem_bytes = 0; 41 | 42 | lp->barrier_bit = barrier_bit_queue_default; 43 | lp->launch_fence = -1; 44 | 45 | // TODO - set to NULL? 46 | static hc::accelerator_view av = hc::accelerator().get_default_view(); 47 | lp->av = &av; 48 | lp->cf = NULL; 49 | } 50 | 51 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/grid_launch_GGL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | #pragma once 23 | 24 | #if GENERIC_GRID_LAUNCH == 1 25 | #include "macro_based_grid_launch.hpp" 26 | #endif // GENERIC_GRID_LAUNCH -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_bf16.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_HIP_BF16_H 24 | #define HIP_INCLUDE_HIP_HIP_BF16_H 25 | 26 | #include 27 | 28 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 29 | #include 30 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 31 | #include 32 | #else 33 | #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); 34 | #endif 35 | 36 | #endif // HIP_INCLUDE_HIP_HIP_BF16_H 37 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_fp16.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | #ifndef HIP_INCLUDE_HIP_HIP_FP16_H 24 | #define HIP_INCLUDE_HIP_HIP_FP16_H 25 | 26 | #include 27 | 28 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 29 | #include 30 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 31 | #include "cuda_fp16.h" 32 | #else 33 | #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); 34 | #endif 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_gl_interop.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | #ifndef HIP_GL_INTEROP_H 23 | #define HIP_GL_INTEROP_H 24 | 25 | #include 26 | 27 | #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) 28 | #include "hip/amd_detail/amd_hip_gl_interop.h" 29 | #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) 30 | #include "hip/nvidia_detail/nvidia_hip_gl_interop.h" 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_hcc.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_HCC_H 21 | #define HIP_INCLUDE_HIP_HIP_HCC_H 22 | #warning "hip/hip_hcc.h is deprecated, please use hip/hip_ext.h" 23 | #include "hip/hip_ext.h" 24 | #endif // #ifdef HIP_INCLUDE_HIP_HIP_HCC_H 25 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_profile.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H 21 | #define HIP_INCLUDE_HIP_HIP_PROFILE_H 22 | 23 | #define HIP_SCOPED_MARKER(markerName, group) 24 | #define HIP_BEGIN_MARKER(markerName, group) 25 | #define HIP_END_MARKER() 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_texture_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | 24 | #ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 25 | #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 26 | 27 | #include 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_version.h: -------------------------------------------------------------------------------- 1 | // Auto-generated by cmake 2 | 3 | #ifndef HIP_VERSION_H 4 | #define HIP_VERSION_H 5 | 6 | #define HIP_VERSION_MAJOR 6 7 | #define HIP_VERSION_MINOR 0 8 | #define HIP_VERSION_PATCH 32830 9 | #define HIP_VERSION_GITHASH "d62f6a171" 10 | #define HIP_VERSION_BUILD_ID 0 11 | #define HIP_VERSION_BUILD_NAME "" 12 | #define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH) 13 | 14 | #define __HIP_HAS_GET_PCH 1 15 | 16 | #endif 17 | 18 | -------------------------------------------------------------------------------- /third_party/amd/backend/lib/cuda2gcn.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/cuda2gcn.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ockl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/ockl.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ocml.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/ocml.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/opencl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/amd/backend/lib/opencl.bc -------------------------------------------------------------------------------- /third_party/amd/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonAMDGPUToLLVM) 2 | add_subdirectory(TritonAMDGPUTransforms) 3 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonAMDGPUToLLVM) 3 | add_public_tablegen_target(TritonAMDGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONAMDGPU_CONVERSION_PASSES_H 2 | #define TRITONAMDGPU_CONVERSION_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | 
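// This header follows the standard MLIR tablegen pass layout: GEN_PASS_DECL
// pulls pass declarations out of the generated Passes.h.inc (built from
// Passes.td), and GEN_PASS_REGISTRATION, included further down, emits the
// registration hooks.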
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "mlir/Pass/Pass.h" 7 | #include "mlir/Transforms/DialectConversion.h" 8 | 9 | #include 10 | 11 | namespace mlir { 12 | 13 | class ModuleOp; 14 | template class OperationPass; 15 | 16 | namespace triton { 17 | 18 | #define GEN_PASS_DECL 19 | #include "TritonAMDGPUToLLVM/Passes.h.inc" 20 | 21 | namespace AMD { 22 | std::unique_ptr> 23 | createDecomposeUnsupportedConversionsPass(); 24 | 25 | } // namespace AMD 26 | 27 | std::unique_ptr> createConvertTritonAMDGPUToLLVMPass(); 28 | std::unique_ptr> 29 | createConvertTritonAMDGPUToLLVMPass(int32_t computeCapability); 30 | #define GEN_PASS_REGISTRATION 31 | #include "TritonAMDGPUToLLVM/Passes.h.inc" 32 | 33 | } // namespace triton 34 | 35 | } // namespace mlir 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONAMDGPU_CONVERSION_PASSES 2 | #define TRITONAMDGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def DecomposeUnsupportedAMDConversions : Pass<"decompose-unsupported-amd-conversions", "mlir::ModuleOp"> { 7 | let summary = "Decompose conversions that are not supported by TritonGPU -> LLVM"; 8 | let constructor = "mlir::triton::AMD::createDecomposeUnsupportedConversionsPass()"; 9 | } 10 | 11 | def ConvertTritonAMDGPUToLLVM : Pass<"convert-triton-amdgpu-to-llvm", "mlir::ModuleOp"> { 12 | let summary = "Convert TritonGPU to LLVM"; 13 | let description = [{ 14 | 15 | }]; 16 | let constructor = "mlir::triton::createConvertTritonAMDGPUToLLVMPass()"; 17 | 18 | let dependentDialects = ["mlir::arith::ArithDialect", 19 | "mlir::math::MathDialect", 20 | "mlir::gpu::GPUDialect", 21 | "mlir::scf::SCFDialect", 22 | "mlir::LLVM::LLVMDialect", 23 | "mlir::tensor::TensorDialect", 24 | "mlir::triton::TritonDialect", 25 | "mlir::triton::gpu::TritonGPUDialect", 26 | "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect", 27 | "mlir::ROCDL::ROCDLDialect", 28 | "mlir::NVVM::NVVMDialect"]; 29 | 30 | let options = [ 31 | Option<"computeCapability", "compute-capability", 32 | "int32_t", /*default*/"80", 33 | "device compute capability">, 34 | ]; 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonAMDGPU) 3 | add_public_tablegen_target(TritonAMDGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONAMDGPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONAMDGPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" 6 | 7 | namespace mlir { 8 | 9 | std::unique_ptr 10 | createTritonAMDGPUPipelinePass(int numStages = 3, int numWarps = 4, 11 | int numCTAs = 1, int computeCapability = 80); 12 | 13 | std::unique_ptr createTritonAMDGPUStreamPipelinePass(); 14 | 15 | std::unique_ptr 16 | createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(), 17 | int matrixInstructionSize = 0, 18 | int kpack = 
1); 19 | 20 | std::unique_ptr createTritonAMDGPUPrefetchPass(); 21 | 22 | std::unique_ptr createTritonAMDGPUCanonicalizeLoopsPass(); 23 | 24 | std::unique_ptr createTritonAMDGPUCoalescePass(); 25 | 26 | std::unique_ptr createTritonAMDGPUReorderInstructionsPass(); 27 | 28 | std::unique_ptr createTritonAMDGPURemoveLayoutConversionsPass(); 29 | 30 | std::unique_ptr createTritonAMDGPUVerifier(); 31 | 32 | std::unique_ptr createTritonAMDGPUOptimizeDotOperandsPass(); 33 | 34 | std::unique_ptr createTritonAMDGPUOptimizeEpiloguePass(); 35 | 36 | /// Generate the code for registering passes. 37 | #define GEN_PASS_REGISTRATION 38 | #include "TritonAMDGPUTransforms/Passes.h.inc" 39 | 40 | } // namespace mlir 41 | #endif 42 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect. 4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /third_party/amd/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonAMDGPUToLLVM) 2 | add_subdirectory(TritonAMDGPUTransforms) 3 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp 3 | ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp 4 | ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp 5 | ConvertLayoutOpToLLVM.cpp 6 | DotOpToLLVM/MFMA.cpp 7 | DotOpToLLVM/WMMA.cpp 8 | DotOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | LoadStoreOpToLLVM.cpp 11 | GCNAsmFormat.cpp 12 | TritonGPUToLLVM.cpp 13 | Utility.cpp 14 | TargetInfo.cpp 15 | DecomposeUnsupportedConversions.cpp 16 | SPMDOpToLLVM.cpp 17 | 18 | DEPENDS 19 | TritonAMDGPUConversionPassIncGen 20 | 21 | LINK_LIBS PUBLIC 22 | TritonGPUToLLVM 23 | ) 24 | 25 | target_compile_definitions(TritonAMDGPUToLLVM PUBLIC USE_ROCM) 26 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/SPMDOpToLLVM.cpp: 
-------------------------------------------------------------------------------- 1 | #include "PatternTritonGPUOpToLLVM.h" 2 | #include "Utility.h" 3 | 4 | using namespace mlir; 5 | 6 | namespace { 7 | 8 | struct GetNumProgramsOpConversion 9 | : public ConvertOpToLLVMPattern<triton::GetNumProgramsOp> { 10 | using ConvertOpToLLVMPattern< 11 | triton::GetNumProgramsOp>::ConvertOpToLLVMPattern; 12 | 13 | LogicalResult 14 | matchAndRewrite(triton::GetNumProgramsOp op, OpAdaptor adaptor, 15 | ConversionPatternRewriter &rewriter) const override { 16 | static constexpr mlir::gpu::Dimension dims[] = {mlir::gpu::Dimension::x, 17 | mlir::gpu::Dimension::y, 18 | mlir::gpu::Dimension::z}; 19 | Location loc = op->getLoc(); 20 | assert(op.getAxisAsInt() < 3); 21 | Value blockId = 22 | rewriter.create<::mlir::gpu::GridDimOp>(loc, dims[op.getAxisAsInt()]); 23 | rewriter.replaceOpWithNewOp<arith::TruncIOp>(op, i32_ty, blockId); 24 | return success(); 25 | } 26 | }; 27 | 28 | } // namespace 29 | 30 | void mlir::triton::AMD::populateSPMDOpToLLVMPattern( 31 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 32 | PatternBenefit benefit) { 33 | patterns.add<GetNumProgramsOpConversion>(typeConverter, benefit); 34 | } 35 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONAMDGPU_TO_LLVM_UTILITY_H 2 | #define TRITON_CONVERSION_TRITONAMDGPU_TO_LLVM_UTILITY_H 3 | 4 | #include "TritonAMDGPUToLLVM/GCNAsmFormat.h" 5 | 6 | #include "mlir/Conversion/LLVMCommon/Pattern.h" 7 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 8 | #include "triton/Analysis/Utility.h" 9 | #include "triton/Conversion/MLIRTypes.h" 10 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 11 | namespace mlir::LLVM::AMD { 12 | 13 | Value shuffleXor(Location loc, ConversionPatternRewriter &rewriter, Value val, 14 | int i); 15 | Value shuffleUp(Location loc, ConversionPatternRewriter &rewriter, Value val, 16 | int i); 17 | Value shuffleIdx(Location loc, ConversionPatternRewriter &rewriter, Value val, 18 | int i); 19 | Value shuffleIdx(Location loc, ConversionPatternRewriter &rewriter, Value val, 20 | Value i); 21 | 22 | Value llGetPid(Location loc, ConversionPatternRewriter &rewriter, 23 | ModuleOp moduleOp, int axis); 24 | } // namespace mlir::LLVM::AMD 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUTransforms 2 | AccelerateAMDMatmul.cpp 3 | OptimizeEpilogue.cpp 4 | RemoveLayoutConversions.cpp 5 | ReorderInstructions.cpp 6 | StreamPipeline.cpp 7 | MfmaGroup.cpp 8 | 9 | DEPENDS 10 | TritonAMDGPUTransformsIncGen 11 | ) 12 | 13 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include) 14 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include) 15 | target_compile_definitions(TritonAMDGPUTransforms PUBLIC USE_ROCM) 16 | -------------------------------------------------------------------------------- /third_party/cpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(TRITON_BUILD_PYTHON_MODULE) 2 | add_triton_plugin(TritonCPU ${CMAKE_CURRENT_SOURCE_DIR}/triton_cpu.cc LINK_LIBS TritonCPUToLLVM) 3 | endif() 4 |
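Each backend is built as a Triton plugin: its CMakeLists registers a shared target via add_triton_plugin, and the plugin exposes a pybind11 entry point that hands C++ passes to the Python frontend. The CPU backend's actual entry point follows in the next file; as a general sketch of that shape (the module and pass names here are illustrative, not a real API):

#include <memory>
#include <pybind11/pybind11.h>
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"

namespace py = pybind11;

// Hypothetical pass-creation hook standing in for a backend's real one.
std::unique_ptr<mlir::Pass> createMyBackendLoweringPass();

void init_triton_mybackend(py::module &&m) {
  auto passes = m.def_submodule("passes");
  // Expose the pass so Python-side compilation pipelines can schedule it.
  passes.def("add_lowering", [](mlir::PassManager &pm) {
    pm.addPass(createMyBackendLoweringPass());
  });
}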
-------------------------------------------------------------------------------- /third_party/cpu/triton_cpu.cc: -------------------------------------------------------------------------------- 1 | #include "mlir/Pass/Pass.h" 2 | #include "mlir/Pass/PassManager.h" 3 | #include "triton/Conversion/TritonCPUToLLVM/Passes.h" 4 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 5 | #include "llvm/IR/Constants.h" 6 | #include "llvm/Support/TargetSelect.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #include <pybind11/pybind11.h> 12 | 13 | namespace py = pybind11; 14 | 15 | void init_triton_cpu_passes_ttcpuir(py::module &&m) { 16 | using namespace mlir::triton; 17 | m.def("add_to_llvmir", [](mlir::PassManager &pm) { 18 | pm.addPass(mlir::triton::createConvertTritonCPUToLLVMPass()); 19 | }); 20 | } 21 | 22 | void init_triton_cpu(py::module &&m) { 23 | auto passes = m.def_submodule("passes"); 24 | init_triton_cpu_passes_ttcpuir(passes.def_submodule("ttcpuir")); 25 | 26 | m.def("load_dialects", [](mlir::MLIRContext &context) { 27 | mlir::DialectRegistry registry; 28 | registry.insert<mlir::triton::cpu::TritonCPUDialect>(); 29 | context.appendDialectRegistry(registry); 30 | context.loadAllAvailableDialects(); 31 | }); 32 | } 33 | -------------------------------------------------------------------------------- /third_party/nvidia/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonNVIDIA ${CMAKE_CURRENT_SOURCE_DIR}/triton_nvidia.cc LINK_LIBS TritonNVIDIAGPUToLLVM NVGPUToLLVM) 7 | endif() 8 | -------------------------------------------------------------------------------- /third_party/nvidia/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/nvidia/backend/__init__.py -------------------------------------------------------------------------------- /third_party/nvidia/backend/lib/libdevice.10.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/triton-cpu/db25542331a158f79067540d01dab666d94e8696/third_party/nvidia/backend/lib/libdevice.10.bc -------------------------------------------------------------------------------- /third_party/nvidia/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonNVIDIAGPUToLLVM) 2 | add_subdirectory(NVGPUToLLVM) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name NVGPUToLLVM) 3 | add_public_tablegen_target(NVGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 2 | #define TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 3 | 4 | #include <memory> 5 | 6 | namespace mlir { 7 | 8 | class ModuleOp; 9 | template <typename T> class OperationPass; 10 | 11 | 
namespace triton { 12 | 13 | std::unique_ptr<OperationPass<ModuleOp>> createConvertNVGPUToLLVMPass(); 14 | 15 | } // namespace triton 16 | 17 | } // namespace mlir 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES_H 2 | #define NVGPU_CONVERSION_PASSES_H 3 | 4 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h" 7 | 8 | namespace mlir { 9 | namespace triton { 10 | 11 | #define GEN_PASS_REGISTRATION 12 | #include "nvidia/include/NVGPUToLLVM/Passes.h.inc" 13 | 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES 2 | #define NVGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | 7 | def ConvertNVGPUToLLVM : Pass<"convert-nv-gpu-to-llvm", "mlir::ModuleOp"> { 8 | let summary = "Convert NVGPU to LLVM"; 9 | let description = [{ 10 | 11 | }]; 12 | let constructor = "mlir::triton::createConvertNVGPUToLLVMPass()"; 13 | 14 | let dependentDialects = ["mlir::arith::ArithDialect", 15 | "mlir::LLVM::LLVMDialect", 16 | "mlir::NVVM::NVVMDialect", 17 | "mlir::triton::nvgpu::NVGPUDialect"]; 18 | } 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonNVIDIAGPUToLLVM) 3 | add_public_tablegen_target(TritonNVIDIAGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | 8 | #include <memory> 9 | 10 | namespace mlir { 11 | 12 | class ModuleOp; 13 | template <typename T> class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 19 | 20 | namespace NVIDIA { 21 | std::unique_ptr<OperationPass<ModuleOp>> 22 | createDecomposeUnsupportedConversionsPass(); 23 | 24 | } // namespace NVIDIA 25 | 26 | std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonGPUToLLVMPass(); 27 | std::unique_ptr<OperationPass<ModuleOp>> 28 | createConvertTritonGPUToLLVMPass(int32_t computeCapability); 29 | 30 | #define GEN_PASS_REGISTRATION 31 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 32 | 33 | } // namespace triton 34 | 35 | } // namespace mlir 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_PASSES 2 | #define TRITONGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | 
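// Each def below records the pass's command-line name and anchor op
// ("mlir::ModuleOp"), the C++ constructor to invoke, the dialects it may
// create, and any options (e.g. compute-capability, defaulting to 80).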
def DecomposeUnsupportedNVIDIAConversions : Pass<"decompose-unsupported-nvidia-conversions", "mlir::ModuleOp"> { 7 | let summary = "Decompose conversions that are not supported by TritonGPU -> LLVM"; 8 | let constructor = "mlir::triton::NVIDIA::createDecomposeUnsupportedConversionsPass()"; 9 | } 10 | 11 | def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"> { 12 | let summary = "Convert TritonGPU to LLVM"; 13 | let description = [{ 14 | 15 | }]; 16 | let constructor = "mlir::triton::createConvertTritonGPUToLLVMPass()"; 17 | 18 | let dependentDialects = ["mlir::arith::ArithDialect", 19 | "mlir::math::MathDialect", 20 | "mlir::gpu::GPUDialect", 21 | "mlir::scf::SCFDialect", 22 | "mlir::LLVM::LLVMDialect", 23 | "mlir::tensor::TensorDialect", 24 | "mlir::triton::TritonDialect", 25 | "mlir::triton::gpu::TritonGPUDialect", 26 | "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect", 27 | "mlir::NVVM::NVVMDialect"]; 28 | 29 | let options = [ 30 | Option<"computeCapability", "compute-capability", 31 | "int32_t", /*default*/"80", 32 | "device compute capability">, 33 | ]; 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonNVIDIAGPUToLLVM) 2 | add_subdirectory(NVGPUToLLVM) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUToLLVM 2 | NVGPUToLLVMPass.cpp 3 | 4 | DEPENDS 5 | NVGPUConversionPassIncGen 6 | ) 7 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNVIDIAGPUToLLVM 2 | ConvertLayoutOpToLLVM/SharedToDotOperandMMAv1.cpp 3 | ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp 4 | ConvertLayoutOpToLLVM.cpp 5 | DotOpToLLVM/MMAv1.cpp 6 | DotOpToLLVM/MMAv2.cpp 7 | DotOpToLLVM/WGMMA.cpp 8 | DotOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | LoadStoreOpToLLVM.cpp 11 | BarrierOpToLLVM.cpp 12 | TritonGPUToLLVM.cpp 13 | DecomposeUnsupportedConversions.cpp 14 | SPMDOpToLLVM.cpp 15 | TensorPtrOpsToLLVM.cpp 16 | ClusterOpsToLLVM.cpp 17 | PTXAsmFormat.cpp 18 | Utility.cpp 19 | TargetInfo.cpp 20 | 21 | DEPENDS 22 | TritonNVIDIAGPUConversionPassIncGen 23 | NVGPUAttrDefsIncGen 24 | 25 | LINK_LIBS PUBLIC 26 | TritonGPUToLLVM 27 | ) 28 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp: -------------------------------------------------------------------------------- 1 | #include "TritonNVIDIAGPUToLLVM/Passes.h" 2 | #include "mlir/Pass/Pass.h" 3 | #include "triton/Analysis/Utility.h" 4 | #include "triton/Conversion/TritonGPUToLLVM/Patterns.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 7 | 8 | using namespace mlir; 9 | 10 | namespace mlir { 11 | namespace triton { 12 | #define GEN_PASS_DEF_DECOMPOSEUNSUPPORTEDNVIDIACONVERSIONS 13 | #include "TritonNVIDIAGPUToLLVM/Passes.h.inc" 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | namespace { 18 | struct DecomposeUnsupportedConversions 19 | : public 
mlir::triton::impl::DecomposeUnsupportedNVIDIAConversionsBase< 20 | DecomposeUnsupportedConversions> { 21 | void runOnOperation() override { 22 | ModuleOp mod = getOperation(); 23 | triton::gpu::decomposeSplatOpToSharedLayoutConversion(mod); 24 | triton::gpu::decomposeTensorCoreToDotLayoutConversion< 25 | triton::gpu::NvidiaMmaEncodingAttr>(mod, isMmaToDotShortcut); 26 | triton::gpu::decomposeBlockedToDotLayoutConversion(mod); 27 | } 28 | }; 29 | } // namespace 30 | 31 | namespace mlir::triton::NVIDIA { 32 | 33 | std::unique_ptr<OperationPass<ModuleOp>> 34 | createDecomposeUnsupportedConversionsPass() { 35 | return std::make_unique<DecomposeUnsupportedConversions>(); 36 | } 37 | 38 | } // namespace mlir::triton::NVIDIA 39 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/SPMDOpToLLVM.cpp: -------------------------------------------------------------------------------- 1 | #include "PatternTritonGPUOpToLLVM.h" 2 | #include "Utility.h" 3 | 4 | namespace { 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton; 8 | 9 | struct GetNumProgramsOpConversion 10 | : public ConvertOpToLLVMPattern<triton::GetNumProgramsOp> { 11 | using ConvertOpToLLVMPattern< 12 | triton::GetNumProgramsOp>::ConvertOpToLLVMPattern; 13 | 14 | LogicalResult 15 | matchAndRewrite(triton::GetNumProgramsOp op, OpAdaptor adaptor, 16 | ConversionPatternRewriter &rewriter) const override { 17 | // It is not easy to get the compute capability here, so we use numCTAs to 18 | // decide the semantic of GetNumProgramsOp. If numCTAs = 1, then 19 | // GetNumProgramsOp is converted to "%nctaid", otherwise it is converted to 20 | // "%nclusterid". 21 | auto moduleOp = op->getParentOfType<ModuleOp>(); 22 | assert(moduleOp && "Parent ModuleOp not found for GetNumProgramsOp"); 23 | int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(moduleOp); 24 | 25 | Location loc = op->getLoc(); 26 | assert(op.getAxisAsInt() < 3); 27 | std::string sreg = numCTAs == 1 ? "%nctaid." 
: "%nclusterid."; 28 | sreg.append(1, 'x' + op.getAxisAsInt()); // 0 -> 'x', 1 -> 'y', 2 -> 'z' 29 | 30 | Value numPrograms = LLVM::NVIDIA::getSRegValue(rewriter, loc, sreg); 31 | rewriter.replaceOp(op, numPrograms); 32 | return success(); 33 | } 34 | }; 35 | 36 | } // namespace 37 | 38 | void mlir::triton::NVIDIA::populateSPMDOpToLLVMPattern( 39 | LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, 40 | PatternBenefit benefit) { 41 | patterns.add(typeConverter, benefit); 42 | } 43 | -------------------------------------------------------------------------------- /third_party/proton/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | proton.egg-info 3 | proton/_C/libproton.so 4 | 5 | *.hatchet 6 | -------------------------------------------------------------------------------- /third_party/proton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(Proton CXX) 2 | 3 | set(PROTON_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/csrc) 4 | set(PROTON_EXTERN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/extern) 5 | file(GLOB_RECURSE PROTON_SRC ${PROTON_SRC_DIR}/lib/*.cpp) 6 | add_library(proton SHARED ${PROTON_SRC} ${PROTON_SRC_DIR}/${PROJECT_NAME}.cpp) 7 | 8 | find_package(CUDAToolkit) 9 | 10 | if(${CUDAToolkit_VERSION_MAJOR} VERSION_LESS 11) 11 | message(FATAL_ERROR "CUDA 11 or higher is required") 12 | endif() 13 | 14 | # Try to find CUPTI from the default include dir, if not found, search the 15 | # EXTRAS dir 16 | find_path( 17 | CUPTI_INCLUDE_DIR 18 | NAMES cupti.h 19 | HINTS ${CUDAToolkit_INCLUDE_DIRS} 20 | PATH_SUFFIXES include) 21 | 22 | if(NOT CUPTI_INCLUDE_DIR) 23 | find_path( 24 | CUPTI_INCLUDE_DIR 25 | NAMES cupti.h 26 | HINTS ${CUDAToolkit_ROOT}/extras/CUPTI 27 | PATH_SUFFIXES include) 28 | endif() 29 | 30 | # Check if CUPTI was found 31 | if(NOT CUPTI_INCLUDE_DIR) 32 | message(FATAL_ERROR "CUPTI include directory not found: CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") 33 | else() 34 | message(STATUS "Found CUPTI include directory: ${CUPTI_INCLUDE_DIR}") 35 | endif() 36 | 37 | include_directories(${PYBIND11_INCLUDE_DIR}) 38 | include_directories(${JSON_INCLUDE_DIR}) 39 | include_directories(${PROTON_SRC_DIR}/include) 40 | include_directories(${PROTON_EXTERN_DIR}) 41 | 42 | if(PYTHON_INCLUDE_DIRS) 43 | include_directories(${PYTHON_INCLUDE_DIRS}) 44 | else() 45 | find_package(Python3 REQUIRED Interpreter Development) 46 | include_directories(${Python3_INCLUDE_DIRS}) 47 | endif() 48 | 49 | include_directories(${CUDAToolkit_INCLUDE_DIRS}) 50 | include_directories(${CUPTI_INCLUDE_DIR}) 51 | target_link_libraries(proton ${Python_LIBRARIES}) 52 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Python.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_PYTHON_H_ 2 | #define PROTON_CONTEXT_PYTHON_H_ 3 | 4 | #include "Context.h" 5 | 6 | namespace proton { 7 | 8 | /// Unwind the Python stack and early return a list of contexts. 
9 | class PythonContextSource : public ContextSource { 10 | public: 11 | std::vector<Context> getContexts() override; 12 | }; 13 | 14 | } // namespace proton 15 | 16 | #endif // PROTON_CONTEXT_PYTHON_H_ 17 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Shadow.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_SHADOW_H_ 2 | #define PROTON_CONTEXT_SHADOW_H_ 3 | 4 | #include "Context.h" 5 | #include <vector> 6 | 7 | namespace proton { 8 | 9 | /// Incrementally build a list of contexts by shadowing the stack with 10 | /// user-defined scopes. 11 | class ShadowContextSource : public ContextSource, public ScopeInterface { 12 | public: 13 | ShadowContextSource() = default; 14 | 15 | std::vector<Context> getContexts() override { return contextStack; } 16 | 17 | void enterScope(const Scope &scope) override; 18 | 19 | void exitScope(const Scope &scope) override; 20 | 21 | private: 22 | std::vector<Context> contextStack; 23 | }; 24 | 25 | } // namespace proton 26 | 27 | #endif // PROTON_CONTEXT_SHADOW_H_ 28 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Data/Data.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DATA_DATA_H_ 2 | #define PROTON_DATA_DATA_H_ 3 | 4 | #include "Context/Context.h" 5 | #include "Metric.h" 6 | #include <map> 7 | #include <memory> 8 | #include <shared_mutex> 9 | #include <string> 10 | 11 | namespace proton { 12 | 13 | enum class OutputFormat { Hatchet, Count }; 14 | 15 | class Data : public InternalOpInterface { 16 | public: 17 | Data(const std::string &path, ContextSource *contextSource = nullptr) 18 | : path(path), contextSource(contextSource) {} 19 | virtual ~Data() = default; 20 | 21 | /// Add a single metric to the data. 22 | /// [MT] The implementation must be thread-safe. 23 | virtual void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) = 0; 24 | 25 | /// Add multiple metrics to the data. 26 | /// [MT] The implementation must be thread-safe. 27 | virtual void 28 | addMetrics(size_t scopeId, 29 | const std::map &metrics) = 0; 30 | 31 | /// Dump the data to the given output format. 32 | /// [MT] Thread-safe. 33 | void dump(OutputFormat outputFormat); 34 | 35 | protected: 36 | /// The actual implementation of the dump operation. 37 | /// [MT] Thread-safe. 
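/// Called by dump() with the shared mutex already held (see Data.cpp), so
/// overrides need not re-acquire it.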
38 | virtual void doDump(std::ostream &os, OutputFormat outputFormat) const = 0; 39 | 40 | protected: 41 | mutable std::shared_mutex mutex; 42 | 43 | const std::string path{}; 44 | ContextSource *contextSource{}; 45 | }; 46 | 47 | OutputFormat parseOutputFormat(const std::string &outputFormat); 48 | 49 | const std::string outputFormatToString(OutputFormat outputFormat); 50 | 51 | } // namespace proton 52 | 53 | #endif // PROTON_DATA_DATA_H_ 54 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Data/TraceData.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DATA_TRACE_DATA_H_ 2 | #define PROTON_DATA_TRACE_DATA_H_ 3 | 4 | #include "Data.h" 5 | 6 | namespace proton { 7 | 8 | class TraceData : public Data { 9 | public: 10 | using Data::Data; 11 | 12 | void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) override; 13 | 14 | void 15 | addMetrics(size_t scopeId, 16 | const std::map &metrics) override; 17 | 18 | protected: 19 | void startOp(const Scope &scope) override final; 20 | 21 | void stopOp(const Scope &scope) override final; 22 | 23 | private: 24 | void doDump(std::ostream &os, OutputFormat outputFormat) const override; 25 | }; 26 | 27 | } // namespace proton 28 | 29 | #endif // PROTON_DATA_TRACE_DATA_H_ 30 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/GPU/Cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_GPU_CUDA_H_ 2 | #define PROTON_DRIVER_GPU_CUDA_H_ 3 | 4 | #include <cuda.h> 5 | 6 | namespace proton { 7 | 8 | namespace cuda { 9 | 10 | template <bool CheckSuccess> CUresult init(int flags); 11 | 12 | template <bool CheckSuccess> CUresult ctxSynchronize(); 13 | 14 | template <bool CheckSuccess> CUresult ctxGetCurrent(CUcontext *pctx); 15 | 16 | template <bool CheckSuccess> 17 | CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); 18 | 19 | template <bool CheckSuccess> CUresult deviceGet(CUdevice *device, int ordinal); 20 | 21 | } // namespace cuda 22 | 23 | } // namespace proton 24 | 25 | #endif // PROTON_DRIVER_GPU_CUDA_H_ 26 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Proton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_H_ 2 | #define PROTON_H_ 3 | 4 | #include "Context/Context.h" 5 | #include "Data/Data.h" 6 | #include "Data/Metric.h" 7 | #include "Session/Session.h" 8 | 9 | #endif // PROTON_H_ 10 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Errors.h: -------------------------------------------------------------------------------- 1 | #include <stdexcept> 2 | 3 | namespace proton { 4 | 5 | class NotImplemented : public std::logic_error { 6 | public: 7 | NotImplemented() : std::logic_error("Not yet implemented"){}; 8 | }; 9 | 10 | } // namespace proton 11 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Singleton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_SINGLETON_H_ 2 | #define PROTON_UTILITY_SINGLETON_H_ 3 | 4 | namespace proton { 5 | 6 | template <typename T> class Singleton { 7 | public: 8 | Singleton(const Singleton &) = delete; 9 | Singleton &operator=(const Singleton &) = delete; 10 | 11 | static T &instance() { 12 | static T _; 13 | return _; 14 | } 15 | 16 | protected: 17 | 
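// CRTP-style usage: a class opts in by deriving from Singleton<Itself>, e.g.
//   class SessionManager : public Singleton<SessionManager> { ... };
// and clients call SessionManager::instance(). The function-local static in
// instance() makes initialization thread-safe since C++11.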
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Data/TraceData.h:
--------------------------------------------------------------------------------
#ifndef PROTON_DATA_TRACE_DATA_H_
#define PROTON_DATA_TRACE_DATA_H_

#include "Data.h"

namespace proton {

class TraceData : public Data {
public:
  using Data::Data;

  void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) override;

  void
  addMetrics(size_t scopeId,
             const std::map<std::string, MetricValueType> &metrics) override;

protected:
  void startOp(const Scope &scope) override final;

  void stopOp(const Scope &scope) override final;

private:
  void doDump(std::ostream &os, OutputFormat outputFormat) const override;
};

} // namespace proton

#endif // PROTON_DATA_TRACE_DATA_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Driver/GPU/Cuda.h:
--------------------------------------------------------------------------------
#ifndef PROTON_DRIVER_GPU_CUDA_H_
#define PROTON_DRIVER_GPU_CUDA_H_

#include <cuda.h>

namespace proton {

namespace cuda {

template <bool CheckSuccess> CUresult init(int flags);

template <bool CheckSuccess> CUresult ctxSynchronize();

template <bool CheckSuccess> CUresult ctxGetCurrent(CUcontext *pctx);

template <bool CheckSuccess>
CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);

template <bool CheckSuccess> CUresult deviceGet(CUdevice *device, int ordinal);

} // namespace cuda

} // namespace proton

#endif // PROTON_DRIVER_GPU_CUDA_H_
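These stubs resolve against a lazily loaded libcuda.so through the dispatch
machinery in Driver/Dispatch.h (see the DEFINE_DISPATCH expansions in Cuda.cpp
further down). A minimal call-site sketch; the boolean template parameter
controls whether the wrapper checks the returned CUresult, and its name,
CheckSuccess, is an assumption here:

#include "Driver/GPU/Cuda.h"

void probeDevice() {
  proton::cuda::init<true>(/*flags=*/0); // <true>: fail loudly on error
  CUdevice device;
  proton::cuda::deviceGet<true>(&device, /*ordinal=*/0);
  int ccMajor = 0;
  proton::cuda::deviceGetAttribute<true>(
      &ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
}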
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Proton.h:
--------------------------------------------------------------------------------
#ifndef PROTON_H_
#define PROTON_H_

#include "Context/Context.h"
#include "Data/Data.h"
#include "Data/Metric.h"
#include "Session/Session.h"

#endif // PROTON_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Errors.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_ERRORS_H_
#define PROTON_UTILITY_ERRORS_H_

#include <stdexcept>

namespace proton {

class NotImplemented : public std::logic_error {
public:
  NotImplemented() : std::logic_error("Not yet implemented") {}
};

} // namespace proton

#endif // PROTON_UTILITY_ERRORS_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Singleton.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_SINGLETON_H_
#define PROTON_UTILITY_SINGLETON_H_

namespace proton {

template <typename T> class Singleton {
public:
  Singleton(const Singleton &) = delete;
  Singleton &operator=(const Singleton &) = delete;

  static T &instance() {
    static T _;
    return _;
  }

protected:
  Singleton() = default;
};

} // namespace proton

#endif // PROTON_UTILITY_SINGLETON_H_
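The class above is the usual Meyers singleton behind a templated base: copying
is deleted, construction is left to subclasses, and instance() relies on the
thread-safe initialization of function-local statics (C++11 and later). A
hypothetical subclass to show the intended use:

#include "Utility/Singleton.h"

class KernelRegistry : public proton::Singleton<KernelRegistry> {
  // Grant the base's instance() access to the private constructor.
  friend class proton::Singleton<KernelRegistry>;

public:
  void setCount(int n) { count = n; }
  int getCount() const { return count; }

private:
  KernelRegistry() = default;
  int count = 0;
};

// Every call site shares one lazily created instance:
//   KernelRegistry::instance().setCount(42);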
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/String.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_STRING_H_
#define PROTON_UTILITY_STRING_H_

#include <cctype>
#include <string>

namespace proton {

inline std::string toLower(const std::string &str) {
  std::string lower;
  for (auto c : str) {
    lower += tolower(c);
  }
  return lower;
}

} // namespace proton

#endif // PROTON_UTILITY_STRING_H_
--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Traits.h:
--------------------------------------------------------------------------------
#ifndef PROTON_UTILITY_TRAITS_H_
#define PROTON_UTILITY_TRAITS_H_

#include <tuple>
#include <type_traits>

namespace proton {
template <typename T, typename... Ts>
struct is_one_of : std::disjunction<std::is_same<T, Ts>...> {};
} // namespace proton

#endif
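Both helpers are one-liners to use: is_one_of resolves entirely at compile
time, and toLower is what lets parseOutputFormat in Data.cpp below accept
format names case-insensitively.

#include "Utility/String.h"
#include "Utility/Traits.h"

static_assert(proton::is_one_of<int, float, int, double>::value,
              "int appears in the list");
static_assert(!proton::is_one_of<char, float, int, double>::value,
              "char does not appear in the list");

// proton::toLower("Hatchet") == "hatchet"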
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Context.cpp:
--------------------------------------------------------------------------------
#include "Context/Context.h"

namespace proton {

std::atomic<size_t> Scope::scopeIdCounter{1};

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Python.cpp:
--------------------------------------------------------------------------------
#include "Context/Python.h"
#include "pybind11/pybind11.h"
#include <Python.h>

namespace proton {

namespace {

std::string UnpackPyobject(PyObject *pyObject) {
  if (PyBytes_Check(pyObject)) {
    size_t size = PyBytes_GET_SIZE(pyObject);
    return std::string(PyBytes_AS_STRING(pyObject), size);
  }
  if (PyUnicode_Check(pyObject)) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    Py_ssize_t size;
    const char *data = PyUnicode_AsUTF8AndSize(pyObject, &size);
    if (!data) {
      return "";
    }
    return std::string(data, (size_t)size);
  }
  return "";
}

} // namespace

std::vector<Context> PythonContextSource::getContexts() {
  pybind11::gil_scoped_acquire gil;

  PyFrameObject *frame = PyEval_GetFrame();
  Py_XINCREF(frame);

  std::vector<Context> contexts;
  while (frame != nullptr) {
    PyCodeObject *f_code = PyFrame_GetCode(frame);
    size_t lineno = PyFrame_GetLineNumber(frame);
    std::string file = UnpackPyobject(f_code->co_filename);
    std::string function = UnpackPyobject(f_code->co_name);
    // PyFrame_GetCode returns a strong reference; release it once read.
    Py_DECREF(f_code);
    auto pythonFrame = file + ":" + function + "@" + std::to_string(lineno);
    contexts.push_back(Context(pythonFrame));
    auto newFrame = PyFrame_GetBack(frame);
    Py_DECREF(frame);
    frame = newFrame;
  }
  return contexts;
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Shadow.cpp:
--------------------------------------------------------------------------------
#include "Context/Shadow.h"

#include <stdexcept>

namespace proton {

void ShadowContextSource::enterScope(const Scope &scope) {
  contextStack.push_back(scope);
}

void ShadowContextSource::exitScope(const Scope &scope) {
  if (contextStack.empty()) {
    throw std::runtime_error("Context stack is empty");
  }
  if (contextStack.back() != scope) {
    throw std::runtime_error("Context stack is not balanced");
  }
  contextStack.pop_back();
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/Data.cpp:
--------------------------------------------------------------------------------
#include "Data/Data.h"
#include "Utility/String.h"

#include <fstream>
#include <iostream>
#include <memory>

#include <stdexcept>

namespace proton {

void Data::dump(OutputFormat outputFormat) {
  std::shared_lock lock(mutex);

  std::unique_ptr<std::ostream> out;
  if (path.empty() || path == "-") {
    out.reset(new std::ostream(std::cout.rdbuf())); // Redirecting to cout
  } else {
    out.reset(new std::ofstream(
        path + "." +
        outputFormatToString(outputFormat))); // Opening a file for output
  }
  doDump(*out, outputFormat);
}

OutputFormat parseOutputFormat(const std::string &outputFormat) {
  if (toLower(outputFormat) == "hatchet") {
    return OutputFormat::Hatchet;
  }
  throw std::runtime_error("Unknown output format: " + outputFormat);
}

const std::string outputFormatToString(OutputFormat outputFormat) {
  if (outputFormat == OutputFormat::Hatchet) {
    return "hatchet";
  }
  throw std::runtime_error("Unknown output format: " +
                           std::to_string(static_cast<int>(outputFormat)));
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/TraceData.cpp:
--------------------------------------------------------------------------------
#include "Data/TraceData.h"
#include "Utility/Errors.h"

#include <stdexcept>

namespace proton {

void TraceData::startOp(const Scope &scope) { throw NotImplemented(); }

void TraceData::stopOp(const Scope &scope) { throw NotImplemented(); }

void TraceData::addMetric(size_t scopeId, std::shared_ptr<Metric> metric) {
  throw NotImplemented();
}

void TraceData::addMetrics(
    size_t scopeId, const std::map<std::string, MetricValueType> &metrics) {
  throw NotImplemented();
}

void TraceData::doDump(std::ostream &os, OutputFormat outputFormat) const {
  throw NotImplemented();
}

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Driver/GPU/Cuda.cpp:
--------------------------------------------------------------------------------
#include "Driver/GPU/Cuda.h"
#include "Driver/Dispatch.h"

namespace proton {

namespace cuda {

struct ExternLibCuda : public ExternLibBase {
  using RetType = CUresult;
  static constexpr const char *name = "libcuda.so";
  static constexpr RetType success = CUDA_SUCCESS;
  static void *lib;
};

void *ExternLibCuda::lib = nullptr;

DEFINE_DISPATCH(ExternLibCuda, init, cuInit, int)

DEFINE_DISPATCH(ExternLibCuda, ctxSynchronize, cuCtxSynchronize)

DEFINE_DISPATCH(ExternLibCuda, ctxGetCurrent, cuCtxGetCurrent, CUcontext *)

DEFINE_DISPATCH(ExternLibCuda, deviceGet, cuDeviceGet, CUdevice *, int)

DEFINE_DISPATCH(ExternLibCuda, deviceGetAttribute, cuDeviceGetAttribute, int *,
                CUdevice_attribute, CUdevice)

} // namespace cuda

} // namespace proton
--------------------------------------------------------------------------------
/third_party/proton/proton/_C/include:
--------------------------------------------------------------------------------
../../csrc/include/
--------------------------------------------------------------------------------
/third_party/proton/proton/__init__.py:
--------------------------------------------------------------------------------
# flake8: noqa
from .scope import scope, enter_scope, exit_scope
from .profile import (
    start,
    activate,
    deactivate,
    finalize,
    profile,
    DEFAULT_PROFILE_NAME,
)
--------------------------------------------------------------------------------
/third_party/proton/proton/flags.py:
--------------------------------------------------------------------------------
"""
This file contains the global flags used in the proton package.
"""

# Whether to enable profiling. Default is False.
profiling_on = False


def set_profiling_on():
    global profiling_on
    profiling_on = True


def set_profiling_off():
    global profiling_on
    profiling_on = False


def get_profiling_on():
    return profiling_on
--------------------------------------------------------------------------------
/third_party/proton/proton/hook.py:
--------------------------------------------------------------------------------
from .scope import enter_scope, exit_scope
from triton.compiler import CompiledKernel, LazyDict

COMPUTE_METADATA_SCOPE_NAME = "__proton_launch_metadata"


class TritonHook:
    metrics = ["flops8", "flops16", "flops32", "flops64", "bytes"]

    @staticmethod
    def enter(metadata: LazyDict) -> None:
        enter_scope(COMPUTE_METADATA_SCOPE_NAME)
        metadata = metadata.get()
        exit_scope()
        fn_metrics = {k: metadata[k] for k in TritonHook.metrics if k in metadata}
        enter_scope(metadata["name"], triton_op=True, metrics=fn_metrics)

    @staticmethod
    def exit(metadata: LazyDict) -> None:
        exit_scope(triton_op=True)


def register_triton_hook() -> None:
    if CompiledKernel.launch_enter_hook is None:
        CompiledKernel.launch_enter_hook = TritonHook.enter
        CompiledKernel.launch_exit_hook = TritonHook.exit


def unregister_triton_hook() -> None:
    if CompiledKernel.launch_enter_hook == TritonHook.enter:
        CompiledKernel.launch_enter_hook = None
        CompiledKernel.launch_exit_hook = None
--------------------------------------------------------------------------------
/third_party/proton/test/test_lib.py:
--------------------------------------------------------------------------------
import triton._C.libproton.proton as libproton
import tempfile
import pathlib


def test_record():
    id0 = libproton.record_scope()
    id1 = libproton.record_scope()
    assert id1 == id0 + 1


def test_scope():
    id0 = libproton.record_scope()
    libproton.enter_scope(id0, "zero")
    id1 = libproton.record_scope()
    libproton.enter_scope(id1, "one")
    libproton.exit_scope(id1, "one")
    libproton.exit_scope(id0, "zero")


def test_op():
    id0 = libproton.record_scope()
    libproton.enter_op(id0, "zero")
    libproton.exit_op(id0, "zero")


def test_session():
    with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f:
        session_id = libproton.start(f.name.split(".")[0], "cupti", "shadow", "tree")
        libproton.deactivate(session_id)
        libproton.activate(session_id)
        libproton.finalize(session_id, "hatchet")
        libproton.finalize_all("hatchet")
        assert pathlib.Path(f.name).exists()


def test_add_metrics():
    with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f:
        libproton.start(f.name.split(".")[0], "cupti", "shadow", "tree")
        id1 = libproton.record_scope()
        libproton.enter_scope(id1, "one")
        libproton.add_metrics(id1, {"a": 1.0, "b": 2.0})
        libproton.exit_scope(id1, "one")
        libproton.finalize_all("hatchet")
        assert pathlib.Path(f.name).exists()
--------------------------------------------------------------------------------
/third_party/proton/test/test_viewer.py:
--------------------------------------------------------------------------------
import triton.profiler as proton
import subprocess


def test_help():
    # Only check if the viewer can be invoked
    ret = subprocess.check_call(["proton-viewer", "-h"])
    assert ret == 0
--------------------------------------------------------------------------------
/unittest/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestTritonAnalysis
  SRCS UtilityTest.cpp
  LIBS
  TritonAnalysis
  TritonIR
  TritonGPUIR
)
--------------------------------------------------------------------------------
/unittest/Analysis/UtilityTest.cpp:
--------------------------------------------------------------------------------
//===- UtilityTest.cpp - Tests for Utility --------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "triton/Dialect/Triton/IR/Utility.h"
#include <gtest/gtest.h>

namespace mlir {

TEST(Analysis, reorder) {
  SmallVector<int64_t> shape({10, 20, 30});
  {
    SmallVector<int32_t> order({2, 1, 0});
    auto reordered = triton::applyPermutation(shape, order);
    EXPECT_EQ(reordered[0], 30);
    EXPECT_EQ(reordered[1], 20);
    EXPECT_EQ(reordered[2], 10);
  }
  {
    SmallVector<int32_t> order({1, 0, 2});
    auto reordered = triton::applyPermutation(shape, order);
    EXPECT_EQ(reordered[0], 20);
    EXPECT_EQ(reordered[1], 10);
    EXPECT_EQ(reordered[2], 30);
  }
}

} // namespace mlir
--------------------------------------------------------------------------------
/unittest/CMakeLists.txt:
--------------------------------------------------------------------------------
include (${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)

include(GoogleTest)
enable_testing()

get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)

function(add_triton_ut)
  set(options)
  set(oneValueArgs NAME)
  set(multiValueArgs SRCS LIBS DEFS)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  add_test(NAME ${__NAME}
           COMMAND ${__NAME})
  add_executable(
    ${__NAME}
    ${__SRCS})
  target_link_libraries(
    ${__NAME}
    PRIVATE
    GTest::gtest_main
    ${triton_libs}
    ${dialect_libs}
    ${conversion_libs}
    gmock
    ${__LIBS})

  target_compile_options(${__NAME} PRIVATE -fno-rtti)

  target_compile_definitions(${__NAME} PRIVATE ${__DEFS})

  # Without the TEST_DISCOVERY_TIMEOUT, the tests randomly time out on my mac
  # laptop. I think the issue may be that the very first time you run a program
  # it's a bit slow.
  gtest_discover_tests(${__NAME} PROPERTIES TEST_DISCOVERY_TIMEOUT 60)
endfunction()

add_subdirectory(Analysis)
add_subdirectory(Conversion)
add_subdirectory(Dialect)
--------------------------------------------------------------------------------
/unittest/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(TritonGPUToLLVM)
--------------------------------------------------------------------------------
/unittest/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestPtxAsmFormat
  SRCS PTXAsmFormatTest.cpp
  LIBS TritonGPUToLLVM TritonNVIDIAGPUToLLVM
)

add_triton_ut(
  NAME TestEmitIndicesNvidia
  SRCS EmitIndicesTest.cpp DumpLayout.cpp
  LIBS TritonGPUIR TritonNvidiaGPUIR TritonNVIDIAGPUToLLVM
  DEFS NVIDIA_TARGET=1
)

add_triton_ut(
  NAME TestEmitIndicesAMD
  SRCS EmitIndicesTest.cpp DumpLayout.cpp
  LIBS TritonGPUIR TritonAMDGPUToLLVM
  DEFS AMD_TARGET=1
)
--------------------------------------------------------------------------------
/unittest/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(TritonGPU)
--------------------------------------------------------------------------------
/unittest/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
add_triton_ut(
  NAME TestSwizzling
  SRCS SwizzleTest.cpp
  LIBS TritonGPUIR TritonNvidiaGPUIR
)
add_triton_ut(
  NAME Dialect
  SRCS DialectTest.cpp
  LIBS TritonGPUIR
)
--------------------------------------------------------------------------------
/unittest/googletest.cmake:
--------------------------------------------------------------------------------
include(FetchContent)

set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")

if(GOOGLETEST_DIR)
  set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()

FetchContent_Declare(
  googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG release-1.12.1
)

FetchContent_GetProperties(googletest)

if(NOT googletest_POPULATED)
  FetchContent_Populate(googletest)
  if (MSVC)
    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
  endif()
  add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
--------------------------------------------------------------------------------
/utils/nightly.pypirc:
--------------------------------------------------------------------------------
[distutils]
Index-servers =
    Triton-Nightly

[Triton-Nightly]
Repository = https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/upload/
--------------------------------------------------------------------------------