├── .clang-format ├── .editorconfig ├── .git-blame-ignore-revs ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug.yml │ ├── config.yml │ └── performance.yml ├── PULL_REQUEST_TEMPLATE.md ├── actions │ └── checkout-install-ukernels │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── build-test.yml │ ├── create_release.yml │ ├── documentation.yml │ ├── integration-tests.yml │ ├── integration-tests.yml.in │ ├── llvm-build.yml │ ├── llvm-build │ ├── almalinux.Dockerfile │ └── centos.Dockerfile │ ├── test-backends.yml │ └── wheels.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── bin ├── CMakeLists.txt ├── RegisterTritonDialects.h ├── triton-llvm-opt.cpp ├── triton-lsp.cpp ├── triton-opt.cpp ├── triton-reduce.cpp └── triton-tensor-layout.cpp ├── cmake ├── AddTritonUnitTest.cmake ├── FindLLVM.cmake ├── json-version.txt ├── llvm-hash.txt └── nvidia-toolchain-version.json ├── docs ├── Makefile ├── _templates │ └── versions.html ├── backend │ ├── ldmatrixOperand0.svg │ └── ldmatrixOperand1.svg ├── conf.py ├── getting-started │ ├── installation.rst │ └── tutorials │ │ ├── grouped_vs_row_major_ordering.png │ │ ├── parallel_reduction.png │ │ └── random_bits.png ├── index.rst ├── meetups │ ├── 01-24-2024 │ │ └── notes.md │ ├── 02-20-2024 │ │ ├── Proton.pdf │ │ └── notes.md │ ├── 04-02-2024 │ │ └── notes.md │ ├── 05-07-2024 │ │ └── notes.md │ ├── 07-18-2023 │ │ └── notes.md │ ├── 08-06-2024 │ │ └── notes.md │ ├── 08-22-2023 │ │ ├── amd-update.pdf │ │ ├── intel-xpu-update.pptx │ │ └── notes.md │ ├── 10-25-2023 │ │ ├── intel-xpu-update.pdf │ │ ├── notes.md │ │ └── triton-shared.pptx │ ├── 12-13-2023 │ │ └── notes.md │ ├── dev-meetup-2023.md │ └── dev_conference_2024.md ├── programming-guide │ ├── chapter-1 │ │ ├── cuda-parallel-matmul.png │ │ ├── introduction.rst │ │ └── triton-parallel-matmul.png │ ├── chapter-2 │ │ ├── 
halide-iteration.png │ │ ├── polyhedral-iteration.png │ │ └── related-work.rst │ └── chapter-3 │ │ └── debugging.rst └── python-api │ ├── triton-semantics.rst │ ├── triton.language.rst │ ├── triton.rst │ └── triton.testing.rst ├── include ├── CMakeLists.txt └── triton │ ├── Analysis │ ├── Alias.h │ ├── Allocation.h │ ├── AxisInfo.h │ ├── Membar.h │ └── Utility.h │ ├── CMakeLists.txt │ ├── Conversion │ ├── CMakeLists.txt │ ├── MLIRTypes.h │ ├── TritonGPUToLLVM │ │ ├── AsmFormat.h │ │ ├── CMakeLists.txt │ │ ├── ElementwiseOpToLLVMBase.h │ │ ├── FMADotUtility.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── PatternTritonGPUOpToLLVM.h │ │ ├── TargetInfoBase.h │ │ ├── TypeConverter.h │ │ └── Utility.h │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TritonToTritonGPUPass.h │ ├── Dialect │ ├── CMakeLists.txt │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── Interfaces.h │ │ │ ├── OpInterfaces.h │ │ │ ├── Traits.h │ │ │ ├── TritonAttrDefs.td │ │ │ ├── TritonDialect.td │ │ │ ├── TritonInterfaces.td │ │ │ ├── TritonOpInterfaces.td │ │ │ ├── TritonOps.td │ │ │ ├── TritonTypes.td │ │ │ ├── Types.h │ │ │ └── Utility.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── TritonCPUAttrDefs.td │ │ │ ├── TritonCPUDialect.td │ │ │ ├── TritonCPUInterfaces.h │ │ │ ├── TritonCPUOps.td │ │ │ ├── TritonCPUTypes.td │ │ │ └── Types.h │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── Attributes.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.h │ │ │ ├── LinearLayoutConversions.h │ │ │ ├── TritonGPUAttrDefs.td │ │ │ ├── TritonGPUDialect.td │ │ │ ├── TritonGPUInterfaces.h │ │ │ ├── TritonGPUOps.td │ │ │ ├── TritonGPUTypeInterfaces.td │ │ │ ├── TritonGPUTypes.td │ │ │ └── Types.h │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── 
DecomposeScaledBlocked.h │ │ │ ├── MMAv5PipelineUtility.h │ │ │ ├── Partition.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── PipelineExpander.h │ │ │ ├── PipeliningUtility.h │ │ │ ├── Schedule.h │ │ │ ├── TritonGPUConversion.h │ │ │ ├── Utility.h │ │ │ └── WarpSpecialization.h │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.h │ │ ├── TritonNvidiaGPUAttrDefs.td │ │ ├── TritonNvidiaGPUDialect.td │ │ ├── TritonNvidiaGPUOpInterfaces.td │ │ └── TritonNvidiaGPUOps.td │ │ └── Transforms │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ ├── Passes.td │ │ └── TMAUtilities.h │ ├── Target │ ├── CMakeLists.txt │ └── LLVMIR │ │ ├── CMakeLists.txt │ │ ├── Passes.h │ │ └── Passes.td │ └── Tools │ ├── LayoutUtils.h │ ├── LinearLayout.h │ ├── StrUtil.h │ └── Sys │ └── GetEnv.hpp ├── lib ├── Analysis │ ├── Alias.cpp │ ├── Allocation.cpp │ ├── AxisInfo.cpp │ ├── CMakeLists.txt │ ├── Membar.cpp │ └── Utility.cpp ├── CMakeLists.txt ├── Conversion │ ├── CMakeLists.txt │ ├── TritonGPUToLLVM │ │ ├── AllocateSharedMemory.cpp │ │ ├── AllocateWarpGroups.cpp │ │ ├── AssertOpToLLVM.cpp │ │ ├── CMakeLists.txt │ │ ├── ControlFlowOpToLLVM.cpp │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ ├── DotOpToLLVM │ │ │ ├── FMA.cpp │ │ │ └── FMADotUtility.cpp │ │ ├── ElementwiseOpToLLVM.cpp │ │ ├── FuncOpToLLVM.cpp │ │ ├── GatherOpToLLVM.cpp │ │ ├── GlobalScratchMemoryAllocation.cpp │ │ ├── HistogramOpToLLVM.cpp │ │ ├── MakeRangeOpToLLVM.cpp │ │ ├── MemoryOpToLLVM.cpp │ │ ├── PrintOpToLLVM.cpp │ │ ├── ReduceOpToLLVM.cpp │ │ ├── ReduceScanCommon.h │ │ ├── SPMDOpToLLVM.cpp │ │ ├── ScanOpToLLVM.cpp │ │ ├── TypeConverter.cpp │ │ ├── Utility.cpp │ │ └── ViewOpToLLVM.cpp │ └── TritonToTritonGPU │ │ ├── CMakeLists.txt │ │ ├── TritonGPUConversion.cpp │ │ └── TritonToTritonGPUPass.cpp ├── Dialect │ ├── CMakeLists.txt │ ├── Triton │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Canonicalize.td │ │ │ ├── Dialect.cpp │ │ │ ├── OpInterfaces.cpp │ │ │ ├── Ops.cpp 
│ │ │ ├── Traits.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Combine.cpp │ │ │ ├── Combine.td │ │ │ ├── LoopInvariantCodeMotion.cpp │ │ │ ├── LoopUnroll.cpp │ │ │ ├── ReorderBroadcast.cpp │ │ │ └── RewriteTensorPointer.cpp │ ├── TritonCPU │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ ├── Ops.cpp │ │ │ └── Types.cpp │ ├── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── Dialect.cpp │ │ │ ├── LinearLayoutConversions.cpp │ │ │ ├── Ops.cpp │ │ │ └── Types.cpp │ │ └── Transforms │ │ │ ├── AccelerateMatmul.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── Coalesce.cpp │ │ │ ├── CoalesceAsyncCopy.cpp │ │ │ ├── CombineTensorSelectAndIf.cpp │ │ │ ├── DecomposeScaledBlocked.cpp │ │ │ ├── F32DotTC.cpp │ │ │ ├── FuseNestedLoops.cpp │ │ │ ├── HoistTMEMAlloc.cpp │ │ │ ├── OptimizeAccumulatorInit.cpp │ │ │ ├── OptimizeDotOperands.cpp │ │ │ ├── OptimizeThreadLocality.cpp │ │ │ ├── Pipeliner │ │ │ ├── AssignLatencies.cpp │ │ │ ├── LowerLoops.cpp │ │ │ ├── MMAv5PipelineUtility.cpp │ │ │ ├── ModifiedAccMMAPipeline.cpp │ │ │ ├── Partition.cpp │ │ │ ├── PipelineExpander.cpp │ │ │ ├── PipeliningUtility.cpp │ │ │ ├── Schedule.cpp │ │ │ ├── ScheduleLoops.cpp │ │ │ ├── SoftwarePipeliner.cpp │ │ │ ├── TC05MMAPipeline.cpp │ │ │ ├── TMAStoresPipeline.cpp │ │ │ ├── TestPipelineAssignLatencies.cpp │ │ │ ├── TestPipelineLowerLoop.cpp │ │ │ ├── TestPipelineScheduleLoop.cpp │ │ │ └── WGMMAPipeline.cpp │ │ │ ├── Prefetch.cpp │ │ │ ├── ReduceDataDuplication.cpp │ │ │ ├── RemoveLayoutConversions.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ ├── Utility.cpp │ │ │ └── WarpSpecialization │ │ │ ├── AutomaticWarpSpecialization.cpp │ │ │ ├── LoadMMASpecialization.cpp │ │ │ ├── OptimizePartitionWarps.cpp │ │ │ ├── PartitionLoops.cpp │ │ │ └── RewritePartitionDependencies.cpp │ └── TritonNvidiaGPU │ │ ├── CMakeLists.txt │ │ ├── IR │ │ ├── CMakeLists.txt │ │ ├── Dialect.cpp │ │ └── Ops.cpp │ │ └── Transforms │ │ ├── 
CMakeLists.txt │ │ ├── FenceInsertion.cpp │ │ ├── KeepAccInTMem.cpp │ │ ├── MMALowering.cpp │ │ ├── OptimizeDescriptorEncoding.cpp │ │ ├── PlanCTA.cpp │ │ ├── PromoteLHSToTMem.cpp │ │ ├── TMALowering.cpp │ │ └── TensorMemoryAllocation.cpp ├── Instrumentation │ ├── CMakeLists.txt │ └── PrintLoadStoreMemSpaces.cpp ├── Target │ ├── CMakeLists.txt │ └── LLVMIR │ │ ├── CMakeLists.txt │ │ ├── LLVMDIScope.cpp │ │ ├── LLVMIRBreakPhiStruct.cpp │ │ └── LLVMPasses.h └── Tools │ ├── CMakeLists.txt │ ├── LayoutUtils.cpp │ └── LinearLayout.cpp ├── pyproject.toml ├── python ├── MANIFEST.in ├── build_helpers.py ├── pyproject.toml ├── requirements.txt ├── setup.py ├── src │ ├── interpreter.cc │ ├── ir.cc │ ├── llvm.cc │ ├── main.cc │ ├── passes.cc │ └── passes.h ├── test-requirements.txt ├── test │ ├── backend │ │ ├── extension_backend.c │ │ └── test_device_backend.py │ ├── kernel_comparison │ │ └── kernels.yml │ ├── regression │ │ ├── conftest.py │ │ ├── test_cast_matmul.py │ │ └── test_functional_regressions.py │ └── unit │ │ ├── blackwell │ │ └── test_tmem.py │ │ ├── conftest.py │ │ ├── cpu │ │ ├── test_math.py │ │ └── test_opt.py │ │ ├── cuda │ │ ├── __init__.py │ │ ├── test_experimental_tma.py │ │ ├── test_flashattention.py │ │ ├── test_gemm.py │ │ ├── test_gemm_fusion.py │ │ ├── test_mixed_io.py │ │ ├── test_tensor_descriptor.py │ │ ├── test_tma_descriptor.py │ │ └── test_tma_store_gemm.py │ │ ├── instrumentation │ │ └── test_gpuhello.py │ │ ├── language │ │ ├── print_helper.py │ │ ├── test_annotations.py │ │ ├── test_block_pointer.py │ │ ├── test_compile_errors.py │ │ ├── test_compile_only.py │ │ ├── test_conversions.py │ │ ├── test_core.py │ │ ├── test_decorator.py │ │ ├── test_libdevice.py │ │ ├── test_line_info.py │ │ ├── test_matmul.py │ │ ├── test_mxfp.py │ │ ├── test_pipeliner.py │ │ ├── test_random.py │ │ ├── test_reproducer.py │ │ ├── test_standard.py │ │ ├── test_subprocess.py │ │ ├── test_tuple.py │ │ └── test_warp_specialization.py │ │ ├── runtime │ │ ├── 
test_autotuner.py │ │ ├── test_bindings.py │ │ ├── test_cache.py │ │ ├── test_cublas.py │ │ ├── test_driver.py │ │ ├── test_jit.py │ │ ├── test_launch.py │ │ └── test_subproc.py │ │ ├── test_debug.py │ │ ├── test_debug_dump.py │ │ ├── test_perf_warning.py │ │ └── tools │ │ ├── test_aot.py │ │ ├── test_disasm.py │ │ └── test_irsource.py ├── triton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── _internal_testing.py │ ├── _utils.py │ ├── backends │ │ ├── __init__.py │ │ ├── compiler.py │ │ └── driver.py │ ├── compiler │ │ ├── __init__.py │ │ ├── code_generator.py │ │ ├── compiler.py │ │ ├── config.py │ │ ├── errors.py │ │ └── make_launcher.py │ ├── errors.py │ ├── language │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── core.py │ │ ├── extra │ │ │ ├── __init__.py │ │ │ └── libdevice.py │ │ ├── math.py │ │ ├── random.py │ │ ├── semantic.py │ │ └── standard.py │ ├── runtime │ │ ├── __init__.py │ │ ├── _allocation.py │ │ ├── autotuner.py │ │ ├── build.py │ │ ├── cache.py │ │ ├── driver.py │ │ ├── errors.py │ │ ├── interpreter.py │ │ └── jit.py │ ├── testing.py │ └── tools │ │ ├── __init__.py │ │ ├── build_extern.py │ │ ├── compile.py │ │ ├── disasm.py │ │ ├── experimental_descriptor.py │ │ ├── link.py │ │ └── mxfp.py └── tutorials │ ├── 01-vector-add.py │ ├── 02-fused-softmax-cpu.py │ ├── 02-fused-softmax.py │ ├── 03-matrix-multiplication-cpu.py │ ├── 03-matrix-multiplication.py │ ├── 04-low-memory-dropout.py │ ├── 05-layer-norm.py │ ├── 06-fused-attention.py │ ├── 07-extern-functions.py │ ├── 08-grouped-gemm.py │ ├── 09-persistent-matmul.py │ ├── 10-block-scaled-matmul.py │ ├── README.rst │ ├── cpu-blocked-matmul.py │ ├── matrix-vector-multiplication-bf16.py │ └── matrix-vector-multiplication.py ├── test ├── Analysis │ ├── test-alias.mlir │ ├── test-alignment.mlir │ ├── test-allocation.mlir │ └── test-membar.mlir ├── CMakeLists.txt ├── Conversion │ ├── allocate_shared_memory.mlir │ ├── allocate_warp_groups.mlir │ ├── amd │ │ ├── async_ops_to_llvm.mlir │ │ ├── 
buffer_load_store.mlir │ │ ├── buffer_load_to_local_to_llvm.mlir │ │ ├── builtin_func_to_llvm.mlir │ │ ├── compute-base-ptr.mlir │ │ ├── dedup-by-constancy.mlir │ │ ├── ds_transpose.mlir │ │ ├── fdivide.mlir │ │ ├── fp_to_fp.mlir │ │ ├── in_thread_transpose.mlir │ │ ├── invalid_extractslice_to_llvm.mlir │ │ ├── load_store.mlir │ │ ├── math-denorm-handling.mlir │ │ ├── mfma-shortcut.mlir │ │ ├── tritongpu_to_llvm.mlir │ │ ├── tritongpu_to_llvm_rdna.mlir │ │ └── tritongpu_wmma_dot_to_llvm.mlir │ ├── atomic_ldst.mlir │ ├── cvt_to_llvm.mlir │ ├── dedup-by-constancy.mlir │ ├── divide-by-0.mlir │ ├── gather_to_llvm.mlir │ ├── nvgpu_to_llvm.mlir │ ├── reduce_to_llvm.mlir │ ├── scan_to_llvm.mlir │ ├── tma_to_llvm.mlir │ ├── triton_to_tritongpu.mlir │ ├── tritongpu_to_llvm.mlir │ ├── tritongpu_to_llvm_blackwell.mlir │ ├── tritongpu_to_llvm_block_dot_shortcut.mlir │ ├── tritongpu_to_llvm_hopper.mlir │ ├── tritongpu_to_llvm_hopper_ptx80.mlir │ ├── tritongpu_to_llvm_volta.mlir │ ├── tritongpu_to_ptx.mlir │ ├── tritonnvidiagpu_to_llvm.mlir │ └── warp_specialize_to_llvm.mlir ├── LLVMIR │ └── break-phi-struct.ll ├── Proton │ └── ops.mlir ├── Tools │ └── tensor_layout_print.mlir ├── Triton │ ├── canonicalize.mlir │ ├── combine.mlir │ ├── invalid.mlir │ ├── loop-invariant-code-motion.mlir │ ├── loop-unroll.mlir │ ├── ops.mlir │ ├── reorder-broadcast.mlir │ ├── reproducer.mlir │ ├── rewrite-tensor-pointer.mlir │ ├── vecadd.mlir │ └── verify-make-range.mlir ├── TritonCPU │ ├── canonicalize.mlir │ ├── convert-atomic.mlir │ ├── convert-memory-ops.mlir │ ├── dot-to-amx.mlir │ ├── dot-to-onednn.mlir │ ├── dot-to-xsmm.mlir │ ├── math-to-vec-lib.mlir │ ├── optimize-masks.mlir │ ├── reduction.mlir │ └── scalarize-memory-ops.mlir ├── TritonGPU │ ├── accelerate-matmul.mlir │ ├── accumulator-init.mlir │ ├── amd │ │ ├── accelerate-amd-matmul-chain-dot.mlir │ │ ├── accelerate-amd-matmul-fma.mlir │ │ ├── accelerate-amd-matmul-mfma-gfx950.mlir │ │ ├── accelerate-amd-matmul-mfma.mlir │ │ ├── 
accelerate-amd-matmul-wmma-gen1.mlir │ │ ├── accelerate-amd-matmul-wmma-gen2.mlir │ │ ├── amd-block-pingpong.mlir │ │ ├── amd-canonicalize-pointers-dont-run-mlir-canonicalizer.mlir │ │ ├── amd-canonicalize-pointers.mlir │ │ ├── amd-coalesce-async-copy.mlir │ │ ├── amd-conditional-barrier.mlir │ │ ├── amd-convert-buffer-ops-range-analysis.mlir │ │ ├── amd-convert-buffer-ops.mlir │ │ ├── amd-extractslice-op.mlir │ │ ├── amd-fold-true-cmpi.mlir │ │ ├── amd-hoist-cvtToDotOp.mlir │ │ ├── amd-instruction-sched.mlir │ │ ├── amd-optimize-epilogue.mlir │ │ ├── amd-range-analysis.mlir │ │ ├── amd-reorder-instructions.mlir │ │ ├── amd-sched-2nd-load.mlir │ │ ├── amd-schedule-hint.mlir │ │ ├── amd-stream-loop-assume.mlir │ │ ├── amd-stream-prefetch.mlir │ │ ├── in-thread-transpose.mlir │ │ ├── invalid.mlir │ │ ├── mfma-double-rate.mlir │ │ ├── mfma-xf32.mlir │ │ ├── optimize-lds-usage.mlir │ │ └── sink-setprio-mfma.mlir │ ├── atomic-cas.mlir │ ├── automatic-warp-specialization.mlir │ ├── blackwell_acc_tmem.mlir │ ├── canonicalize.mlir │ ├── coalesce-async-copy.mlir │ ├── coalesce.mlir │ ├── combine-select-if.mlir │ ├── combine.mlir │ ├── dot-operands.mlir │ ├── fence-inserstion.mlir │ ├── fuse-nested-loops.mlir │ ├── global_scratch_alloc.mlir │ ├── global_scratch_to_llvm.mlir │ ├── hoist-tmem-alloc.mlir │ ├── invalid-attributes.mlir │ ├── invalid.mlir │ ├── load-mma-specialization.mlir │ ├── loop-pipeline-async-latencies.mlir │ ├── loop-pipeline-blackwell.mlir │ ├── loop-pipeline-cuda.mlir │ ├── loop-pipeline-expand.mlir │ ├── loop-pipeline-hip.mlir │ ├── loop-pipeline-hopper-remove-wait.mlir │ ├── loop-pipeline-hopper.mlir │ ├── loop-pipeline-indirect-load.mlir │ ├── loop-pipeline.mlir │ ├── loop-schedule.mlir │ ├── matmul-loop-pipeline.mlir │ ├── matmul.mlir │ ├── ops.mlir │ ├── optimize-locality.mlir │ ├── optimize-partition-warps.mlir │ ├── optimize_epilogue.mlir │ ├── partition-loops.mlir │ ├── pipeline-assign-latencies.mlir │ ├── pipeline-loop-nest.mlir │ ├── 
pipeline-lower-loop.mlir │ ├── pipeline-schedule-loop.mlir │ ├── prefetch.mlir │ ├── promote-lhs-to-tmem.mlir │ ├── reduce-data-duplication.mlir │ ├── reorder-instructions.mlir │ ├── rewrite-partition-dependencies.mlir │ ├── samples │ │ ├── descriptor-matmul-pipeline.mlir │ │ ├── descriptor-matmul-pipeline.mlir.in │ │ ├── simulated-grouped-gemm.mlir │ │ └── simulated-grouped-gemm.mlir.in │ ├── tf32x3-matmul.mlir │ └── verify-blocked-layout.mlir ├── TritonNvidiaGPU │ ├── canonicalize.mlir │ ├── invalid.mlir │ ├── membar.mlir │ ├── mma_lowering.mlir │ ├── ops.mlir │ ├── optimize_descriptor_encoding.mlir │ ├── test_promotion_to_tensor_memory.mlir │ ├── test_tensor_memory_allocation.mlir │ └── tma_lowering.mlir ├── lib │ ├── Analysis │ │ ├── CMakeLists.txt │ │ ├── TestAlias.cpp │ │ ├── TestAllocation.cpp │ │ ├── TestAxisInfo.cpp │ │ └── TestMembar.cpp │ ├── CMakeLists.txt │ └── Instrumentation │ │ ├── CMakeLists.txt │ │ └── GPUHello.cpp ├── lit.cfg.py └── lit.site.cfg.py.in ├── third_party ├── amd │ ├── CMakeLists.txt │ ├── backend │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ ├── include │ │ │ ├── hip │ │ │ │ ├── amd_detail │ │ │ │ │ ├── amd_channel_descriptor.h │ │ │ │ │ ├── amd_device_functions.h │ │ │ │ │ ├── amd_hip_atomic.h │ │ │ │ │ ├── amd_hip_bf16.h │ │ │ │ │ ├── amd_hip_bfloat16.h │ │ │ │ │ ├── amd_hip_common.h │ │ │ │ │ ├── amd_hip_complex.h │ │ │ │ │ ├── amd_hip_cooperative_groups.h │ │ │ │ │ ├── amd_hip_fp16.h │ │ │ │ │ ├── amd_hip_fp8.h │ │ │ │ │ ├── amd_hip_gl_interop.h │ │ │ │ │ ├── amd_hip_math_constants.h │ │ │ │ │ ├── amd_hip_runtime.h │ │ │ │ │ ├── amd_hip_runtime_pt_api.h │ │ │ │ │ ├── amd_hip_unsafe_atomics.h │ │ │ │ │ ├── amd_hip_vector_types.h │ │ │ │ │ ├── amd_math_functions.h │ │ │ │ │ ├── amd_surface_functions.h │ │ │ │ │ ├── amd_warp_functions.h │ │ │ │ │ ├── amd_warp_sync_functions.h │ │ │ │ │ ├── concepts.hpp │ │ │ │ │ ├── device_library_decls.h │ │ │ │ │ ├── functional_grid_launch.hpp │ │ │ │ │ ├── grid_launch.h │ │ │ │ │ ├── 
grid_launch.hpp │ │ │ │ │ ├── grid_launch_GGL.hpp │ │ │ │ │ ├── helpers.hpp │ │ │ │ │ ├── hip_api_trace.hpp │ │ │ │ │ ├── hip_assert.h │ │ │ │ │ ├── hip_cooperative_groups_helper.h │ │ │ │ │ ├── hip_fp16_gcc.h │ │ │ │ │ ├── hip_fp16_math_fwd.h │ │ │ │ │ ├── hip_ldg.h │ │ │ │ │ ├── hip_prof_str.h │ │ │ │ │ ├── hip_runtime_prof.h │ │ │ │ │ ├── host_defines.h │ │ │ │ │ ├── hsa_helpers.hpp │ │ │ │ │ ├── macro_based_grid_launch.hpp │ │ │ │ │ ├── math_fwd.h │ │ │ │ │ ├── ockl_image.h │ │ │ │ │ ├── program_state.hpp │ │ │ │ │ ├── texture_fetch_functions.h │ │ │ │ │ └── texture_indirect_functions.h │ │ │ │ ├── channel_descriptor.h │ │ │ │ ├── device_functions.h │ │ │ │ ├── driver_types.h │ │ │ │ ├── hip_bf16.h │ │ │ │ ├── hip_bfloat16.h │ │ │ │ ├── hip_common.h │ │ │ │ ├── hip_complex.h │ │ │ │ ├── hip_cooperative_groups.h │ │ │ │ ├── hip_deprecated.h │ │ │ │ ├── hip_ext.h │ │ │ │ ├── hip_fp16.h │ │ │ │ ├── hip_fp8.h │ │ │ │ ├── hip_gl_interop.h │ │ │ │ ├── hip_hcc.h │ │ │ │ ├── hip_math_constants.h │ │ │ │ ├── hip_profile.h │ │ │ │ ├── hip_runtime.h │ │ │ │ ├── hip_runtime_api.h │ │ │ │ ├── hip_texture_types.h │ │ │ │ ├── hip_vector_types.h │ │ │ │ ├── hip_version.h │ │ │ │ ├── hiprtc.h │ │ │ │ ├── library_types.h │ │ │ │ ├── math_functions.h │ │ │ │ ├── surface_types.h │ │ │ │ └── texture_types.h │ │ │ ├── hsa │ │ │ │ ├── Brig.h │ │ │ │ ├── amd_hsa_common.h │ │ │ │ ├── amd_hsa_elf.h │ │ │ │ ├── amd_hsa_kernel_code.h │ │ │ │ ├── amd_hsa_queue.h │ │ │ │ ├── amd_hsa_signal.h │ │ │ │ ├── hsa.h │ │ │ │ ├── hsa_amd_tool.h │ │ │ │ ├── hsa_api_trace.h │ │ │ │ ├── hsa_api_trace_version.h │ │ │ │ ├── hsa_ext_amd.h │ │ │ │ ├── hsa_ext_finalize.h │ │ │ │ ├── hsa_ext_image.h │ │ │ │ ├── hsa_ven_amd_aqlprofile.h │ │ │ │ ├── hsa_ven_amd_loader.h │ │ │ │ └── hsa_ven_amd_pc_sampling.h │ │ │ └── roctracer │ │ │ │ ├── ext │ │ │ │ └── prof_protocol.h │ │ │ │ ├── hip_ostream_ops.h │ │ │ │ ├── hsa_ostream_ops.h │ │ │ │ ├── hsa_prof_str.h │ │ │ │ ├── roctracer.h │ │ │ │ ├── roctracer_ext.h │ │ 
│ │ ├── roctracer_hcc.h │ │ │ │ ├── roctracer_hip.h │ │ │ │ ├── roctracer_hsa.h │ │ │ │ ├── roctracer_plugin.h │ │ │ │ ├── roctracer_roctx.h │ │ │ │ └── roctx.h │ │ └── lib │ │ │ ├── asanrtl.bc │ │ │ ├── ockl.bc │ │ │ └── ocml.bc │ ├── include │ │ ├── Analysis │ │ │ └── RangeAnalysis.h │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ └── TritonAMDGPU │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Dialect.h │ │ │ │ ├── TritonAMDGPUAttrDefs.td │ │ │ │ ├── TritonAMDGPUDialect.td │ │ │ │ └── TritonAMDGPUOps.td │ │ │ │ └── Utility │ │ │ │ └── CommonUtils.h │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── GCNAsmFormat.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── PatternTritonAMDGPUToLLVM.h │ │ │ └── TargetUtils.h │ │ └── TritonAMDGPUTransforms │ │ │ ├── CMakeLists.txt │ │ │ ├── MfmaGroup.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── TritonGPUConversion.h │ ├── language │ │ └── hip │ │ │ ├── __init__.py │ │ │ └── libdevice.py │ ├── lib │ │ ├── Analysis │ │ │ ├── CMakeLists.txt │ │ │ └── RangeAnalysis.cpp │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ └── TritonAMDGPU │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Dialect.cpp │ │ │ │ └── Utility │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── CommonUtils.cpp │ │ ├── TritonAMDGPUDialectToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── ExtractSliceOpToLLVM.cpp │ │ │ ├── InThreadTransposeOpToTTG.cpp │ │ │ └── TritonAMDGPUToLLVMPatterns.cpp │ │ ├── TritonAMDGPUToLLVM │ │ │ ├── AtomicRMWOpsEmitter.cpp │ │ │ ├── AtomicRMWOpsEmitter.h │ │ │ ├── BufferOpsEmitter.cpp │ │ │ ├── BufferOpsEmitter.h │ │ │ ├── BuiltinFuncToLLVM.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM │ │ │ │ ├── SharedToDotOperandHelper.cpp │ │ │ │ ├── SharedToDotOperandHelper.h │ │ │ │ ├── SharedToDotOperandMFMA.cpp │ │ │ │ └── SharedToDotOperandWMMA.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── 
DotOpToLLVM │ │ │ │ ├── FMA.cpp │ │ │ │ ├── MFMA.cpp │ │ │ │ └── WMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── GCNAsmFormat.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ │ │ ├── MemoryOpToLLVM.cpp │ │ │ ├── OptimizeLDSUsage.cpp │ │ │ ├── OptimizeLDSUtility.cpp │ │ │ ├── OptimizeLDSUtility.h │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── SchedInstructions.cpp │ │ │ ├── SchedInstructions.h │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TargetUtils.cpp │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── UpcastMXFPToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ │ └── TritonAMDGPUTransforms │ │ │ ├── AccelerateAMDMatmul.cpp │ │ │ ├── BlockPingpong.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── CanonicalizePointers.cpp │ │ │ ├── CoalesceAsyncCopy.cpp │ │ │ ├── ConvertToBufferOps.cpp │ │ │ ├── FoldTrueCmpIOp.cpp │ │ │ ├── HoistLayoutConversions.cpp │ │ │ ├── InThreadTranspose.cpp │ │ │ ├── MfmaGroup.cpp │ │ │ ├── OptimizeEpilogue.cpp │ │ │ ├── ReorderInstructions.cpp │ │ │ └── StreamPipeline.cpp │ ├── python │ │ ├── test │ │ │ ├── address_sanitizer_helper.py │ │ │ ├── test_address_sanitizer.py │ │ │ └── test_extract_slice.py │ │ └── triton_amd.cc │ ├── test │ │ ├── CMakeLists.txt │ │ └── lib │ │ │ ├── Analysis │ │ │ ├── CMakeLists.txt │ │ │ └── TestAMDRangeAnalysis.cpp │ │ │ └── CMakeLists.txt │ └── unittest │ │ ├── CMakeLists.txt │ │ └── Conversion │ │ ├── CMakeLists.txt │ │ └── OptimizeLDSTest.cpp ├── cpu │ ├── CMakeLists.txt │ ├── backend │ │ ├── __init__.py │ │ ├── compiler.py │ │ └── driver.py │ ├── include │ │ ├── Analysis │ │ │ └── TensorPtrShapeInfo.h │ │ ├── CMakeLists.txt │ │ ├── ScalarizePass │ │ │ ├── CMakeLists.txt │ │ │ ├── ScalarizeInterface.h │ │ │ ├── ScalarizeInterface.td │ │ │ └── ScalarizeInterfaceImpl.h │ │ ├── TritonCPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ ├── TritonCPUTransforms │ │ │ ├── CMakeLists.txt │ │ │ ├── OptCommon.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ └── TritonToTritonCPU 
│ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ └── Passes.td │ ├── language │ │ └── cpu │ │ │ ├── __init__.py │ │ │ ├── libdevice.py │ │ │ └── utils.py │ ├── lib │ │ ├── Analysis │ │ │ ├── CMakeLists.txt │ │ │ └── TensorPtrShapeInfo.cpp │ │ ├── CMakeLists.txt │ │ ├── TritonCPUToLLVM │ │ │ ├── AtomicOpsToLLVM.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── DebugOpsToLLVM.cpp │ │ │ ├── FuncOpToLLVM.cpp │ │ │ ├── GetProgramIdOpToLLVM.cpp │ │ │ ├── LowerMultiReduction.cpp │ │ │ ├── MathToVecLib.cpp │ │ │ ├── MemoryOpToLLVM.cpp │ │ │ ├── TypeConverter.cpp │ │ │ ├── TypeConverter.h │ │ │ ├── UkernelOpsToOneDNNLLVM.cpp │ │ │ ├── UkernelOpsToXSMMLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ │ ├── TritonCPUTransforms │ │ │ ├── CMakeLists.txt │ │ │ ├── Canonicalize.cpp │ │ │ ├── ConvertDotOp │ │ │ │ ├── ConvertDotCommon.cpp │ │ │ │ ├── ConvertDotCommon.h │ │ │ │ ├── ConvertDotGeneric.cpp │ │ │ │ ├── ConvertDotOpToUkernelOps.cpp │ │ │ │ ├── ConvertDotToAMX.cpp │ │ │ │ └── ConvertDotToFMA.cpp │ │ │ ├── ConvertDotProduct.cpp │ │ │ ├── ConvertUnsupportedOps.cpp │ │ │ ├── DecomposeFpConversions.cpp │ │ │ └── OptimizeMasks.cpp │ │ └── TritonToTritonCPU │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertAtomicOps.cpp │ │ │ ├── ConvertControlFlowOps.cpp │ │ │ ├── ConvertDebugOps.cpp │ │ │ ├── ConvertDotOp.cpp │ │ │ ├── ConvertElemManipOps.cpp │ │ │ ├── ConvertElementwiseOps.cpp │ │ │ ├── ConvertHistogramOp.cpp │ │ │ ├── ConvertMemoryOps.cpp │ │ │ ├── ConvertPtrOps.cpp │ │ │ ├── ConvertReductionOp.cpp │ │ │ ├── ConvertScanOp.cpp │ │ │ ├── OpTypeConversion.h │ │ │ ├── ReduceScanCommon.h │ │ │ ├── ScalarizeInterface.cpp │ │ │ ├── ScalarizeUsingForOps.cpp │ │ │ ├── TypeConverter.cpp │ │ │ └── TypeConverter.h │ ├── runtime │ │ ├── cpu_runtime.cpp │ │ ├── runtime_onednn.cpp │ │ └── runtime_xsmm.cpp │ └── triton_cpu.cc ├── f2reduce │ ├── CMakeLists.txt │ ├── LICENCE.txt │ ├── README.md │ ├── VERSION │ ├── f2reduce.cpp │ └── f2reduce.h ├── nvidia │ ├── CMakeLists.txt │ ├── backend │ │ ├── 
__init__.py │ │ ├── compiler.py │ │ ├── driver.c │ │ ├── driver.py │ │ └── lib │ │ │ └── libdevice.10.bc │ ├── include │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ ├── NVGPU │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── IR │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── Dialect.h │ │ │ │ │ ├── NVGPUAttrDefs.td │ │ │ │ │ ├── NVGPUDialect.td │ │ │ │ │ └── NVGPUOps.td │ │ │ └── NVWS │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Dialect.h │ │ │ │ ├── NVWSAttrDefs.td │ │ │ │ ├── NVWSDialect.td │ │ │ │ ├── NVWSOps.td │ │ │ │ └── NVWSTypes.td │ │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Passes.h │ │ │ │ └── Passes.td │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── NVGPUToLLVMPass.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ ├── TritonNVIDIAGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── PTXAsmFormat.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Utility.h │ │ ├── cublas_instance.h │ │ └── cublas_types.h │ ├── language │ │ └── cuda │ │ │ ├── __init__.py │ │ │ ├── _experimental_tma.py │ │ │ ├── libdevice.py │ │ │ └── utils.py │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ ├── NVGPU │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── IR │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ └── Dialect.cpp │ │ │ └── NVWS │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Dialect.cpp │ │ │ │ └── Ops.cpp │ │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── LowerWarpGroup.cpp │ │ ├── NVGPUToLLVM │ │ │ ├── CMakeLists.txt │ │ │ └── NVGPUToLLVMPass.cpp │ │ └── TritonNVIDIAGPUToLLVM │ │ │ ├── BarrierOpToLLVM.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── ClusterOpsToLLVM.cpp │ │ │ ├── ConvertLayoutOpToLLVM.cpp │ │ │ ├── ConvertWarpSpecializeToLLVM.cpp │ │ │ ├── DotOpToLLVM.cpp │ │ │ ├── DotOpToLLVM │ │ │ ├── MMAHelpers.h │ │ │ ├── MMAv2.cpp │ │ │ ├── MMAv5.cpp │ │ │ └── WGMMA.cpp │ │ │ ├── ElementwiseOpToLLVM.cpp │ │ │ ├── Fp4ToFpOpToLLVM.cpp │ │ │ ├── LoadStoreOpToLLVM.cpp │ 
│ │ ├── MemoryOpToLLVM.cpp │ │ │ ├── PTXAsmFormat.cpp │ │ │ ├── PatternTritonGPUOpToLLVM.h │ │ │ ├── SPMDOpToLLVM.cpp │ │ │ ├── TMAToLLVM.cpp │ │ │ ├── TargetInfo.cpp │ │ │ ├── TargetInfo.h │ │ │ ├── TensorMemoryToLLVM.cpp │ │ │ ├── TensorPtrOpsToLLVM.cpp │ │ │ ├── TritonGPUToLLVM.cpp │ │ │ ├── Utility.cpp │ │ │ └── Utility.h │ ├── test │ │ ├── CMakeLists.txt │ │ ├── NVWS │ │ │ ├── invalid.mlir │ │ │ ├── lower_warp_group.mlir │ │ │ └── ops.mlir │ │ ├── lit.cfg.py │ │ └── lit.site.cfg.py.in │ ├── tools │ │ └── cuda │ │ │ ├── compile.c │ │ │ └── compile.h │ ├── triton_nvidia.cc │ └── unittest │ │ ├── CMakeLists.txt │ │ └── Conversion │ │ ├── CMakeLists.txt │ │ └── TritonGPUToLLVM │ │ ├── CMakeLists.txt │ │ └── PTXAsmFormatTest.cpp └── proton │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── csrc │ ├── CMakeLists.txt │ ├── Proton.cpp │ ├── include │ │ ├── Context │ │ │ ├── Context.h │ │ │ ├── Python.h │ │ │ └── Shadow.h │ │ ├── Data │ │ │ ├── Data.h │ │ │ ├── Metric.h │ │ │ ├── TraceData.h │ │ │ └── TreeData.h │ │ ├── Driver │ │ │ ├── Device.h │ │ │ ├── Dispatch.h │ │ │ └── GPU │ │ │ │ ├── CudaApi.h │ │ │ │ ├── CuptiApi.h │ │ │ │ ├── HipApi.h │ │ │ │ ├── HsaApi.h │ │ │ │ └── RoctracerApi.h │ │ ├── Profiler │ │ │ ├── Cupti │ │ │ │ ├── CuptiPCSampling.h │ │ │ │ └── CuptiProfiler.h │ │ │ ├── GPUProfiler.h │ │ │ ├── Profiler.h │ │ │ └── Roctracer │ │ │ │ └── RoctracerProfiler.h │ │ ├── Proton.h │ │ ├── Session │ │ │ └── Session.h │ │ └── Utility │ │ │ ├── Atomic.h │ │ │ ├── Errors.h │ │ │ ├── Map.h │ │ │ ├── Set.h │ │ │ ├── Singleton.h │ │ │ ├── String.h │ │ │ └── Traits.h │ └── lib │ │ ├── CMakeLists.txt │ │ ├── Context │ │ ├── CMakeLists.txt │ │ ├── Context.cpp │ │ ├── Python.cpp │ │ └── Shadow.cpp │ │ ├── Data │ │ ├── CMakeLists.txt │ │ ├── Data.cpp │ │ ├── TraceData.cpp │ │ └── TreeData.cpp │ │ ├── Driver │ │ ├── CMakeLists.txt │ │ ├── Device.cpp │ │ └── GPU │ │ │ ├── CudaApi.cpp │ │ │ ├── CuptiApi.cpp │ │ │ ├── HipApi.cpp │ │ │ ├── HsaApi.cpp │ │ │ └── 
RoctracerApi.cpp │ │ ├── Profiler │ │ ├── CMakeLists.txt │ │ ├── Cupti │ │ │ ├── CuptiPCSampling.cpp │ │ │ └── CuptiProfiler.cpp │ │ └── RocTracer │ │ │ └── RoctracerProfiler.cpp │ │ └── Session │ │ ├── CMakeLists.txt │ │ └── Session.cpp │ ├── dialect │ ├── CMakeLists.txt │ ├── include │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ └── Proton │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Dialect.h │ │ │ │ ├── ProtonAttrDefs.td │ │ │ │ ├── ProtonDialect.td │ │ │ │ └── ProtonOps.td │ │ └── TritonProtonToLLVM │ │ │ └── PatternTritonProtonOpToLLVM.h │ ├── lib │ │ ├── CMakeLists.txt │ │ ├── Dialect │ │ │ ├── CMakeLists.txt │ │ │ └── Proton │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Dialect.cpp │ │ │ │ └── Ops.cpp │ │ └── TritonProtonToLLVM │ │ │ ├── CMakeLists.txt │ │ │ └── RecordOpToLLVM.cpp │ └── triton_proton.cc │ ├── proton │ ├── _C │ │ └── include │ ├── __init__.py │ ├── context.py │ ├── flags.py │ ├── hook.py │ ├── language.py │ ├── profile.py │ ├── proton.py │ ├── scope.py │ ├── state.py │ └── viewer.py │ ├── test │ ├── examples │ │ ├── cuda.json │ │ ├── frame.json │ │ ├── hip.json │ │ ├── leaf_nodes.json │ │ └── triton.json │ ├── helper.py │ ├── helper_kernels.py │ ├── instrument.py │ ├── test_api.py │ ├── test_cmd.py │ ├── test_lib.py │ ├── test_profile.py │ ├── test_record.py │ └── test_viewer.py │ └── tutorials │ ├── dynamic_net.py │ └── matmul.py ├── unittest ├── Analysis │ ├── CMakeLists.txt │ └── UtilityTest.cpp ├── CMakeLists.txt ├── Dialect │ ├── CMakeLists.txt │ └── TritonGPU │ │ ├── CMakeLists.txt │ │ ├── DialectTest.cpp │ │ ├── DumpLayoutTest.cpp │ │ ├── LinearLayoutConversionsTest.cpp │ │ └── SwizzleTest.cpp ├── Tools │ ├── CMakeLists.txt │ ├── LayoutUtilsTest.cpp │ └── LinearLayoutTest.cpp └── googletest.cmake └── utils ├── generate-test-checks.py └── nightly.pypirc /.clang-format: 
-------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_style = space 9 | indent_size = 4 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | 13 | [*.py] 14 | indent_size = 4 15 | src_paths=python 16 | 17 | [*.{yaml,yml}] 18 | indent_size = 2 19 | 20 | [*.md] 21 | indent_size = 2 22 | x-soft-wrap-text = true 23 | 24 | [*.rst] 25 | indent_size = 4 26 | x-soft-wrap-text = true 27 | 28 | [CMakeLists.txt,*.cmake] 29 | indent_size = 2 30 | 31 | [Makefile] 32 | indent_style = tab 33 | 34 | [*.{c,cc,cpp,h,hpp,cu,cuh}] 35 | indent_size = 2 36 | 37 | [*.mlir] 38 | indent_size = 2 39 | 40 | [*.td] 41 | indent_size = 4 42 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Commits listed here are ignored by `git blame`. Add "big and uninteresting 2 | # changes" here. Don't forget that it has to be a separate commit (and, because 3 | # our automation squashes PRs, a separate PR)! 4 | # 5 | # Run the following command to teach your `git blame` to pick up this file. 
6 | # 7 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs 8 | 9 | 841a77d1b5961b43e1b64e5265bdfe52c133574d 10 | cb68a0d9d501657258ed9f7ad7610d0784c9be9a 11 | 03184de8b535bb24fb1f49cc1f5e008bcbaa73ef 12 | bc4a8e66da036fafc01b87ee9e210df7ee8fb738 13 | 846d6e7e77891706d179b20f27b1278ac3b9a9ac 14 | 0327b9d32db6d1d63d207ccab722bd45e00a6678 15 | df08301e76a56d9ab3f36ff00ab7133672baa8d3 16 | f88b01f558df06f010a869e01473253a5f5cd8db 17 | 312cf97e147e962562877026fd82c928cf6eaa30 18 | 53d868113a706988394134ca1f7f85cb3016cc81 19 | 539fbe5049570f29e73dc6843f984cd4913c5505 20 | 053af4e9f8f005e1bc3f8ac9bf285eaf0ac9bf72 21 | 5b36cb48ad9ce566dd24ff7183f207a1cb9358b5 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Community help 4 | url: https://discord.gg/gpumode 5 | about: GPU-mode discord community has a triton channel which is a great resource for help writing/learning triton 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 10 | 11 | # New contributor declaration 12 | - [ ] I am not making a trivial change, such as fixing a typo in a comment. 13 | 14 | - [ ] I have written a PR description following these 15 | [rules](https://cbea.ms/git-commit/#why-not-how). 16 | 17 | - [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. 18 | 19 | - Select one of the following. 20 | - [ ] I have added tests. 21 | - `/test` for `lit` tests 22 | - `/unittest` for C++ tests 23 | - `/python/test` for end-to-end tests 24 | - [ ] This PR does not need a test because `FILL THIS IN`. 25 | 26 | - Select one of the following. 27 | - [ ] I have not added any `lit` tests. 
28 | - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), 29 | including the "tests should be minimal" section. (Usually running Python code 30 | and using the instructions it generates is not minimal.) 31 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | # Enable version updates for GitHub Actions 9 | - package-ecosystem: "github-actions" 10 | # Look for GitHub Actions workflows in the `root` directory 11 | directory: "/" 12 | # Check for updates once a week 13 | schedule: 14 | interval: "weekly" 15 | -------------------------------------------------------------------------------- /.github/workflows/llvm-build/almalinux.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM almalinux:8 2 | ARG llvm_dir=llvm-project 3 | # Add the cache artifacts and the LLVM source tree to the container 4 | ADD sccache /sccache 5 | ADD "${llvm_dir}" /source/llvm-project 6 | ENV SCCACHE_DIR="/sccache" 7 | ENV SCCACHE_CACHE_SIZE="2G" 8 | 9 | RUN dnf install --assumeyes llvm-toolset 10 | RUN dnf install --assumeyes python38-pip python38-devel git 11 | 12 | RUN python3 -m pip install --upgrade pip 13 | RUN python3 -m pip install --upgrade cmake ninja sccache lit 14 | 15 | # Install MLIR's Python Dependencies 16 | RUN python3 -m pip install -r /source/llvm-project/mlir/python/requirements.txt 17 | 18 | # Configure, Build, Test, and Install LLVM 
19 | RUN cmake -GNinja -Bbuild \ 20 | -DCMAKE_BUILD_TYPE=Release \ 21 | -DCMAKE_C_COMPILER=clang \ 22 | -DCMAKE_CXX_COMPILER=clang++ \ 23 | -DCMAKE_ASM_COMPILER=clang \ 24 | -DCMAKE_C_COMPILER_LAUNCHER=sccache \ 25 | -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ 26 | -DCMAKE_CXX_FLAGS="-Wno-everything" \ 27 | -DCMAKE_LINKER=lld \ 28 | -DCMAKE_INSTALL_PREFIX="/install" \ 29 | -DLLVM_BUILD_UTILS=ON \ 30 | -DLLVM_BUILD_TOOLS=ON \ 31 | -DLLVM_ENABLE_ASSERTIONS=ON \ 32 | -DMLIR_ENABLE_BINDINGS_PYTHON=ON \ 33 | -DLLVM_ENABLE_PROJECTS="mlir;lld" \ 34 | -DLLVM_ENABLE_TERMINFO=OFF \ 35 | -DLLVM_INSTALL_UTILS=ON \ 36 | -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \ 37 | /source/llvm-project/llvm 38 | 39 | RUN ninja -C build install 40 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sleef"] 2 | path = third_party/sleef 3 | url = https://github.com/shibatch/sleef 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018-2020 Philippe Tillet 3 | * Copyright 2020-2022 OpenAI 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files 7 | * (the "Software"), to deal in the Software without restriction, 8 | * including without limitation the rights to use, copy, modify, merge, 9 | * publish, distribute, sublicense, and/or sell copies of the Software, 10 | * and to permit persons to whom the Software is furnished to do so, 11 | * subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | -------------------------------------------------------------------------------- /bin/triton-lsp.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry)); 10 | } 11 | -------------------------------------------------------------------------------- /bin/triton-opt.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-opt/MlirOptMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | return mlir::asMainReturnCode(mlir::MlirOptMain( 10 | argc, argv, "Triton (GPU) optimizer driver\n", registry)); 11 | } 12 | -------------------------------------------------------------------------------- /bin/triton-reduce.cpp: -------------------------------------------------------------------------------- 1 | #include "./RegisterTritonDialects.h" 2 | 3 | #include "mlir/Tools/mlir-reduce/MlirReduceMain.h" 4 | 5 | int main(int argc, char **argv) { 6 | mlir::DialectRegistry registry; 7 | registerTritonDialects(registry); 8 | 9 | mlir::MLIRContext context(registry); 
10 | return mlir::failed(mlir::mlirReduceMain(argc, argv, context)); 11 | } 12 | -------------------------------------------------------------------------------- /cmake/AddTritonUnitTest.cmake: -------------------------------------------------------------------------------- 1 | include(${PROJECT_SOURCE_DIR}/unittest/googletest.cmake) 2 | 3 | include(GoogleTest) 4 | enable_testing() 5 | 6 | function(add_triton_ut) 7 | set(options) 8 | set(oneValueArgs NAME) 9 | set(multiValueArgs SRCS LIBS DEFS) 10 | cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 11 | 12 | get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) 13 | get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) 14 | get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS) 15 | 16 | add_test(NAME ${__NAME} 17 | COMMAND ${__NAME}) 18 | add_executable( 19 | ${__NAME} 20 | ${__SRCS}) 21 | target_link_libraries( 22 | ${__NAME} 23 | PRIVATE 24 | GTest::gtest_main 25 | ${triton_libs} 26 | ${dialect_libs} 27 | ${conversion_libs} 28 | gmock 29 | ${__LIBS}) 30 | 31 | if(NOT MSVC) 32 | target_compile_options(${__NAME} PRIVATE -fno-rtti) 33 | endif() 34 | 35 | target_compile_definitions(${__NAME} PRIVATE ${__DEFS}) 36 | 37 | # Without the TEST_DISCOVERY_TIMEOUT, the tests randomly time out on my mac 38 | # laptop. I think the issue may be that the very first time you run a program 39 | # it's a bit slow. 40 | gtest_discover_tests(${__NAME} DISCOVERY_TIMEOUT 60) 41 | 42 | # Add the unit test to the top-level unit test target. 
43 | add_dependencies(TritonUnitTests ${__NAME}) 44 | endfunction() 45 | -------------------------------------------------------------------------------- /cmake/json-version.txt: -------------------------------------------------------------------------------- 1 | v3.11.3 2 | -------------------------------------------------------------------------------- /cmake/llvm-hash.txt: -------------------------------------------------------------------------------- 1 | adba14acea99cc6a17d837763a3248c9d4a2fadf 2 | -------------------------------------------------------------------------------- /cmake/nvidia-toolchain-version.json: -------------------------------------------------------------------------------- 1 | { 2 | "ptxas": "12.8.93", 3 | "cuobjdump": "12.8.55", 4 | "nvdisasm": "12.8.55", 5 | "cudacrt": "12.8.61", 6 | "cudart": "12.8.57", 7 | "cupti": "12.8.90" 8 | } 9 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Triton 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/versions.html: -------------------------------------------------------------------------------- 1 | {%- if current_version %} 2 |
3 | 4 | Other Versions 5 | v: {{ current_version.name }} 6 | 7 | 8 |
9 | {%- if versions.tags %} 10 |
11 |
Tags
12 | {%- for item in versions.tags %} 13 |
{{ item.name }}
14 | {%- endfor %} 15 |
16 | {%- endif %} 17 | {%- if versions.branches %} 18 |
19 |
Branches
20 | {%- for item in versions.branches %} 21 |
{{ item.name }}
22 | {%- endfor %} 23 |
24 | {%- endif %} 25 |
26 |
27 | {%- endif %} 28 | -------------------------------------------------------------------------------- /docs/getting-started/tutorials/grouped_vs_row_major_ordering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/parallel_reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/getting-started/tutorials/parallel_reduction.png -------------------------------------------------------------------------------- /docs/getting-started/tutorials/random_bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/getting-started/tutorials/random_bits.png -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/Proton.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/meetups/02-20-2024/Proton.pdf -------------------------------------------------------------------------------- /docs/meetups/02-20-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Intel update 5 | 2. AMD update 6 | 3. Profiler update 7 | 4. We are in the process of transitioning to a pro slack plan, so everybody will be able to see history. Expect this to take a few more weeks. 8 | 5. 
We are still working on finalizing a document about our technical governance structure. Expect this to take a few more weeks too. 6. Open discussion. 9 | 10 | ##### Minutes: 11 | Recording link [here](https://youtu.be/JDQCdj18Snc) 12 | 13 | 1. Intel GPU integration with Triton and Pytorch: 14 | - No strong requirement from PyTorch for specific backends to be part of Triton official release. 15 | - Can use a separate branch/fork for CI/CD and testing. 16 | - Intel team will work with Pytorch offline to close. 17 | 2. AMD GPU backend update: 18 | - AMD team shared the refactored design for AMD backend. 19 | - The new design is modularized and reduces clutter and duplication in upstream Triton. 20 | - Further work needed for regression testing and secure runners. 21 | 3. Proton profiler update: 22 | - Keren from the OpenAI team presented a new profiler tool for Triton kernels, which supports multiple vendors, metrics, and formats. 23 | - Outlined the plan for open-sourcing, integrating, and extending the tool. 24 | -------------------------------------------------------------------------------- /docs/meetups/08-06-2024/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 1. Triton-CPU Update 3 | 2. Intel GPU backend update 4 | 5 | ##### Items: 6 | Meeting notes: 7 | 1. Triton-CPU Update: Intel and Meta jointly presented the work on Triton-CPU, highlighting good progress on coverage and performance improvements. They also covered some of the optimizations they leveraged to get performance comparable to torch-native and torch-inductor. More details are in their slides. 8 | 2. Intel GPU Backend: Intel GPU backend shows good performance close to expert-tuned kernels and the use of block pointers for performance gains. There were questions around the future of block pointers and their importance for performance gains. 
With block-pointer deprecation there is a need for a more generic interface to support various backends including Intel GPU. 9 | 3. The 2024 Triton conference is on September 17th 2024 in Fremont California! Please register [here](README.md). 10 | ##### Minutes: 11 | Recording link [here](https://youtu.be/dfL3L4_3ujg) 12 | 13 | Presentations repo [here](https://drive.google.com/drive/folders/1fQ3zVrM7DT8W8FGJWKx1wNr2X53tYbeT?usp=sharing) 14 | -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/amd-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/meetups/08-22-2023/amd-update.pdf -------------------------------------------------------------------------------- /docs/meetups/08-22-2023/intel-xpu-update.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/meetups/08-22-2023/intel-xpu-update.pptx -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/intel-xpu-update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/meetups/10-25-2023/intel-xpu-update.pdf -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. H100 updates 5 | 2. Triton-Shared layer updates 6 | 3. Intel update 7 | 4. Open discussion 8 | 9 | ##### Minutes: 10 | Recording link [here](https://youtu.be/KZAzpKx1ebI) 11 | 12 | 1. H100 updates 13 | - Enabled WGMMA by default, now any matmul can reuse it. 
14 | - fp8 formats enabled – 1.3 Petaflops on dense matmul on H100 (gemm performance) 15 | - Enabled Flash Attention using wgmma, resulting in 450 teraflop on fwd pass and 250 on backward pass – still working on perf for flash attention 16 | - fp8 numbers with flash attention running in fp8 with matmul is tricky, because the fp8 layout is significantly different than what is returned by wgmma, still wip 17 | 18 | 2. Triton-Shared layer 19 | - Please refer to slides for more details 20 | - Created a repo where you can find the middle layer 21 | - Available as a plugin into triton 22 | 23 | 3. Intel Update 24 | - Please refer to slides for more details 25 | -------------------------------------------------------------------------------- /docs/meetups/10-25-2023/triton-shared.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/meetups/10-25-2023/triton-shared.pptx -------------------------------------------------------------------------------- /docs/meetups/12-13-2023/notes.md: -------------------------------------------------------------------------------- 1 | #### Agenda: 2 | 3 | ##### Items: 4 | 1. Refactoring plan for 3rd party backends 5 | 2. Front end refactoring (AMD) 6 | 3. Things like block pointers, ptr_analysis, mask_analysis can be used for GPUs, is there a plan to incrementally include components from Triton shared for GPU development. 7 | 8 | ##### Minutes: 9 | Recording link [here](https://youtu.be/Lo43DQYkOWM) 10 | 11 | 1. Refactoring plan for 3rd party backends 12 | - Refactoring to be completed by end of the year so that all GPU backends can be individual passes on Triton GPU IR instead of being completely out of tree. The goal is for users to get other GPUs besides Cuda when they install Triton. Non-GPU Triton IR expected to stay as is. 13 | 3. 
Front end refactoring (AMD) 14 | - Will work with Phil for AMD related refactoring. Will share more details in next meetup about where AMD has diverged from Triton GPU IR and in the codeflow. 15 | 4. Things like block pointers, ptr_analysis, mask_analysis can be used for GPUs, is there a plan to incrementally include components from Triton shared for GPU development. 16 | - Can look at it on a case by case basis. 17 | -------------------------------------------------------------------------------- /docs/meetups/dev_conference_2024.md: -------------------------------------------------------------------------------- 1 | The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link) 2 | 3 | The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz). 4 | -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/cuda-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/programming-guide/chapter-1/cuda-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-1/triton-parallel-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/programming-guide/chapter-1/triton-parallel-matmul.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/halide-iteration.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/programming-guide/chapter-2/halide-iteration.png -------------------------------------------------------------------------------- /docs/programming-guide/chapter-2/polyhedral-iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/docs/programming-guide/chapter-2/polyhedral-iteration.png -------------------------------------------------------------------------------- /docs/python-api/triton.rst: -------------------------------------------------------------------------------- 1 | triton 2 | ====== 3 | 4 | .. currentmodule:: triton 5 | 6 | .. autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | jit 11 | autotune 12 | heuristics 13 | Config 14 | -------------------------------------------------------------------------------- /docs/python-api/triton.testing.rst: -------------------------------------------------------------------------------- 1 | triton.testing 2 | ============== 3 | 4 | .. currentmodule:: triton.testing 5 | 6 | .. 
autosummary:: 7 | :toctree: generated 8 | :nosignatures: 9 | 10 | Benchmark 11 | do_bench 12 | do_bench_cudagraph 13 | perf_report 14 | assert_close 15 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(triton) 2 | -------------------------------------------------------------------------------- /include/triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Target) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonGPUToLLVM) 2 | add_subdirectory(TritonToTritonGPU) 3 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 3 | 4 | #include "mlir/IR/Value.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | #include "llvm/ADT/SmallVector.h" 7 | #include "llvm/ADT/StringExtras.h" 8 | #include "llvm/ADT/StringRef.h" 9 | #include 10 | #include 11 | 12 | namespace mlir { 13 | class ConversionPatternRewriter; 14 | class Location; 15 | 16 | namespace triton { 17 | using llvm::StringRef; 18 | 19 | inline std::string strJoin(llvm::ArrayRef strs, 20 | llvm::StringRef delimiter) { 21 | return llvm::join(strs.begin(), strs.end(), delimiter); 22 | } 23 | 24 | } // namespace triton 25 | } // namespace mlir 26 | 27 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_ 28 | 
-------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonGPUToLLVM) 3 | add_public_tablegen_target(TritonGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | #include 7 | 8 | namespace mlir { 9 | 10 | class ModuleOp; 11 | template class OperationPass; 12 | 13 | namespace triton::gpu { 14 | 15 | #define GEN_PASS_DECL 16 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 17 | 18 | #define GEN_PASS_REGISTRATION 19 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc" 20 | 21 | } // namespace triton::gpu 22 | 23 | } // namespace mlir 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonGPU) 3 | add_public_tablegen_target(TritonConversionToGPUPassIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TO_GPU_PASSES_H 2 | #define TRITON_CONVERSION_TO_GPU_PASSES_H 3 | 4 | #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" 5 | 6 | 
namespace mlir { 7 | namespace triton { 8 | 9 | #define GEN_PASS_REGISTRATION 10 | #include "triton/Conversion/TritonToTritonGPU/Passes.h.inc" 11 | 12 | } // namespace triton 13 | } // namespace mlir 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 2 | #define TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace mlir { 9 | 10 | class ModuleOp; 11 | template class OperationPass; 12 | 13 | namespace triton { 14 | 15 | // Create the pass with numWarps passed from cl::opt. 16 | std::unique_ptr> createConvertTritonToTritonGPUPass(); 17 | 18 | // Create the pass with numWarps set explicitly. 19 | std::unique_ptr> 20 | createConvertTritonToTritonGPUPass(const std::string &target, int numWarps, 21 | int threadsPerWarp = 32, int numCTAs = 1, 22 | bool enableSourceRemat = false); 23 | 24 | } // namespace triton 25 | } // namespace mlir 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /include/triton/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonOps.td) 4 | mlir_tablegen(Ops.h.inc -gen-op-decls) 5 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 6 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 7 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 8 | add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc) 9 | 10 | set(LLVM_TARGET_DEFINITIONS TritonDialect.td) 11 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls) 12 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) 13 | add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc) 14 | 15 | set(LLVM_TARGET_DEFINITIONS TritonTypes.td) 16 | mlir_tablegen(Types.h.inc -gen-typedef-decls) 17 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs) 18 | 19 | set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) 20 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) 21 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) 22 | 23 | set(LLVM_TARGET_DEFINITIONS TritonOpInterfaces.td) 24 | mlir_tablegen(OpInterfaces.h.inc -gen-op-interface-decls) 25 | mlir_tablegen(OpInterfaces.cpp.inc -gen-op-interface-defs) 26 | 27 | add_public_tablegen_target(TritonTableGen) 28 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Interfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_INTERFACES_H_ 2 | #define TRITON_IR_INTERFACES_H_ 3 | 4 | #include "mlir/IR/OpDefinition.h" 5 | 6 | #define GET_TYPEDEF_CLASSES 7 | #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" 8 | 9 | #endif // TRITON_IR_INTERFACES_H_ 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/OpInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_OP_INTERFACES_H_ 2 | #define 
TRITON_IR_OP_INTERFACES_H_ 3 | 4 | #include "mlir/IR/OpDefinition.h" 5 | 6 | namespace mlir { 7 | 8 | namespace triton { 9 | 10 | namespace impl { 11 | 12 | LogicalResult verifyTransposeOpInterface(Operation *op); 13 | 14 | LogicalResult verifyDotOpInterface(Operation *op); 15 | 16 | } // namespace impl 17 | 18 | } // namespace triton 19 | } // namespace mlir 20 | 21 | #include "triton/Dialect/Triton/IR/OpInterfaces.h.inc" 22 | 23 | #endif // TRITON_IR_OP_INTERFACES_H_ 24 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/TritonDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT 2 | #define TRITON_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def Triton_Dialect : Dialect { 7 | let name = "tt"; 8 | 9 | let cppNamespace = "::mlir::triton"; 10 | 11 | let summary = "The Triton IR in MLIR"; 12 | 13 | let description = [{ 14 | Triton Dialect. 15 | 16 | Dependent Dialects: 17 | * Arith: 18 | * addf, addi, andi, cmpf, cmpi, divf, fptosi, ... 19 | * Math: 20 | * exp, sin, cos, log, ... 
21 | * StructuredControlFlow: 22 | * for, if, while, yield, condition 23 | * ControlFlow: 24 | * br, cond_br 25 | }]; 26 | 27 | let dependentDialects = [ 28 | "arith::ArithDialect", 29 | "math::MathDialect", 30 | "scf::SCFDialect", 31 | "cf::ControlFlowDialect", 32 | "ub::UBDialect" 33 | ]; 34 | 35 | let extraClassDeclaration = [{ 36 | void registerTypes(); 37 | }]; 38 | 39 | let hasConstantMaterializer = 1; 40 | let useDefaultTypePrinterParser = 1; 41 | let usePropertiesForAttributes = 1; 42 | } 43 | 44 | include "triton/Dialect/Triton/IR/TritonTypes.td" 45 | 46 | 47 | #endif // TRITON_DIALECT 48 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_IR_TYPES_H_ 2 | #define TRITON_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/BuiltinTypes.h" 5 | #include "mlir/IR/TypeSupport.h" 6 | #include "mlir/IR/Types.h" 7 | 8 | #define GET_TYPEDEF_CLASSES 9 | #include "triton/Dialect/Triton/IR/Types.h.inc" 10 | 11 | namespace mlir { 12 | 13 | namespace triton { 14 | 15 | bool isTensorPointerType(Type type); 16 | 17 | bool isTensorOrTensorPointerType(Type type); 18 | 19 | unsigned getPointeeBitWidth(Type type); 20 | 21 | Type getPointeeType(Type type); 22 | 23 | Type getPointerType(Type type, int addressSpace = 1); 24 | 25 | int getAddressSpace(Type type); 26 | 27 | Type getElementTypeOfTensorPointerType(Type type); 28 | 29 | Type getI1SameShape(Type type); 30 | 31 | Type getI32SameShape(Type type); 32 | 33 | Type getPointerTypeSameShape(Type type); 34 | 35 | Type getPointerTypeToElement(Type type); 36 | 37 | } // namespace triton 38 | 39 | } // namespace mlir 40 | 41 | #endif // TRITON_IR_TYPES_H_ 42 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton) 3 | add_public_tablegen_target(TritonTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/Triton/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | 9 | std::unique_ptr createCombineOpsPass(); 10 | 11 | std::unique_ptr createLoopInvariantCodeMotionPass(); 12 | std::unique_ptr createReorderBroadcastPass(); 13 | std::unique_ptr createRewriteTensorPointerPass(); 14 | std::unique_ptr createLoopUnrollPass(); 15 | 16 | } // namespace triton 17 | 18 | #define GEN_PASS_REGISTRATION 19 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc" 20 | 21 | } // namespace mlir 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 2 | #define TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h" 5 | 6 | #define GET_ATTRDEF_CLASSES 7 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.h.inc" 8 | 9 | #endif // TRITON_DIALECT_TRITONCPU_IR_ATTRIBUTES_H_ 10 | -------------------------------------------------------------------------------- 
/include/triton/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonCPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_cpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_cpu) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_cpu) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_cpu) 10 | add_mlir_doc(TritonCPUDialect TritonCPUDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(TritonCPUOps TritonCPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonCPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonCPUAttrDefs.td) 15 | mlir_tablegen(TritonCPUAttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(TritonCPUAttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(TritonCPUAttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(TritonCPUAttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonCPUAttrDefsIncGen) 22 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Dialect.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 2 | #define TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 3 | 4 | #include "mlir/Dialect/Tensor/IR/Tensor.h" 5 | #include "mlir/IR/BuiltinOps.h" 6 | #include "mlir/IR/Dialect.h" 7 | 8 | // TritonCPU depends on Triton 9 | #include "triton/Dialect/Triton/IR/Dialect.h" 10 | #include "triton/Dialect/TritonCPU/IR/Attributes.h" 11 | #include "triton/Dialect/TritonCPU/IR/Dialect.h.inc" 12 | #include 
"triton/Dialect/TritonCPU/IR/Types.h" 13 | 14 | #define GET_OP_CLASSES 15 | #include "triton/Dialect/TritonCPU/IR/Ops.h.inc" 16 | 17 | #endif // TRITON_DIALECT_TRITONCPU_IR_DIALECT_H_ 18 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUAttrDefs.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_ATTRDEFS 2 | #define TRITONCPU_ATTRDEFS 3 | 4 | include "mlir/IR/AttrTypeBase.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 6 | include "triton/Dialect/Triton/IR/TritonInterfaces.td" 7 | 8 | //===----------------------------------------------------------------------===// 9 | // TritonCPU Attribute Definitions 10 | //===----------------------------------------------------------------------===// 11 | def TritonCPU_AttrTrait : AttrInterface<"TritonCPU_AttrTrait"> { 12 | let cppNamespace = "::mlir::triton::cpu"; 13 | } 14 | 15 | class TritonCPU_Attr traits = [], 16 | Dialect dialect = TritonCPU_Dialect, 17 | string baseCppClass = "::mlir::Attribute"> 18 | : AttrDef { 19 | 20 | let description = [{TritonCPU attr.}]; 21 | let attrName = "triton.cpu." # attrMnemonic; 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_DIALECT 2 | #define TRITONCPU_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TritonCPU_Dialect : Dialect { 7 | let name = "triton_cpu"; 8 | 9 | let cppNamespace = "::mlir::triton::cpu"; 10 | 11 | let hasOperationAttrVerify = 1; 12 | 13 | let description = [{ 14 | Triton CPU Dialect. 
15 | }]; 16 | 17 | let dependentDialects = [ 18 | "triton::TritonDialect", 19 | "tensor::TensorDialect", 20 | "mlir::memref::MemRefDialect", 21 | ]; 22 | 23 | let extraClassDeclaration = [{ 24 | void registerTypes(); 25 | }]; 26 | 27 | let useDefaultTypePrinterParser = 1; 28 | let useDefaultAttributePrinterParser = 1; 29 | let usePropertiesForAttributes = 1; 30 | } 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CPU_DIALECT_INTERFACES_H 2 | #define TRITON_CPU_DIALECT_INTERFACES_H 3 | 4 | #include "triton/Dialect/TritonCPU/IR/TritonCPUAttrInterfaces.h.inc" 5 | 6 | #endif // TRITON_CPU_DIALECT_INTERFACES_H 7 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/TritonCPUTypes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_TYPES 2 | #define TRITONCPU_TYPES 3 | 4 | include "triton/Dialect/Triton/IR/TritonTypes.td" 5 | include "triton/Dialect/TritonCPU/IR/TritonCPUDialect.td" 6 | include "mlir/IR/AttrTypeBase.td" 7 | 8 | class TTC_TypeDef traits = []> 9 | : TypeDef { 10 | let mnemonic = _mnemonic; 11 | } 12 | 13 | def TTC_TokenType : TTC_TypeDef<"Token", "token"> { 14 | let parameters = (ins "int32_t":$type); 15 | 16 | let builders = [ 17 | TypeBuilder<(ins "unsigned":$type), [{ 18 | return $_get($_ctxt, type); 19 | }]> 20 | ]; 21 | 22 | let hasCustomAssemblyFormat = 1; 23 | 24 | let skipDefaultBuilders = 1; 25 | } 26 | 27 | def TTC_Vector : VectorOfAnyRankOf<[TT_Float, TT_Int]>; 28 | 29 | def TTC_Type : AnyTypeOf<[TT_Float, TT_Int, TTC_Vector]>; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonCPU/IR/Types.h: 
-------------------------------------------------------------------------------- 1 | #ifndef TRITONCPU_IR_TYPES_H_ 2 | #define TRITONCPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/TypeSupport.h" 5 | #include "mlir/IR/Types.h" 6 | 7 | #define GET_TYPEDEF_CLASSES 8 | #include "triton/Dialect/TritonCPU/IR/Types.h.inc" 9 | 10 | #endif // TRITONCPU_IR_TYPES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Attributes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 3 | 4 | #include "mlir/IR/Attributes.h" 5 | #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" 6 | 7 | #define GET_ATTRDEF_CLASSES 8 | #include "triton/Dialect/TritonGPU/IR/AttrDefs.h.inc" 9 | 10 | #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_ 11 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttg) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttg) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=ttg) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=ttg) 10 | add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc) 11 |
add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(TritonGPUTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) 15 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) 16 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) 17 | mlir_tablegen(AttrDefs.h.inc -gen-attrdef-decls) 18 | mlir_tablegen(AttrDefs.cpp.inc -gen-attrdef-defs) 19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 21 | add_public_tablegen_target(TritonGPUAttrDefsIncGen) 22 | 23 | set(LLVM_TARGET_DEFINITIONS TritonGPUTypeInterfaces.td) 24 | mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls) 25 | mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs) 26 | add_public_tablegen_target(TritonGPUTypeInterfacesIncGen) 27 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_DIALECT 2 | #define TRITONGPU_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def TritonGPU_Dialect : Dialect { 7 | let name = "ttg"; 8 | 9 | let cppNamespace = "::mlir::triton::gpu"; 10 | 11 | let hasOperationAttrVerify = 1; 12 | 13 | let description = [{ 14 | Triton GPU Dialect. 
15 | }]; 16 | 17 | let dependentDialects = [ 18 | "triton::TritonDialect", 19 | "mlir::gpu::GPUDialect", 20 | ]; 21 | 22 | let extraClassDeclaration = [{ 23 | void registerTypes(); 24 | 25 | LinearLayout toLinearLayout(ArrayRef shape, Attribute layout); 26 | LinearEncodingAttr toLinearEncoding(ArrayRef shape, Attribute layout); 27 | 28 | static int getNumCTAs(ModuleOp mod); 29 | static int getThreadsPerWarp(ModuleOp mod); 30 | 31 | private: 32 | LinearLayoutCache llCache; 33 | LinearEncodingCache leCache; 34 | }]; 35 | 36 | let useDefaultTypePrinterParser = 1; 37 | let useDefaultAttributePrinterParser = 1; 38 | let usePropertiesForAttributes = 1; 39 | } 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_GPU_DIALECT_INTERFACES_H 2 | #define TRITON_GPU_DIALECT_INTERFACES_H 3 | 4 | // clang-format off 5 | #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" 6 | #include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc" 7 | // clang-format on 8 | 9 | #endif // TRITON_GPU_DIALECT_INTERFACES_H 10 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_GPU_TYPE_INTERFACES 2 | #define TRITON_GPU_TYPE_INTERFACES 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | // Interface dynamically attached to RankedTensorType and MemDescType. 
7 | def TTG_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> { 8 | let cppNamespace = "::mlir::triton::gpu"; 9 | let methods = [ 10 | InterfaceMethod<"Returns the encoding of the tensor or memory descriptor", 11 | "mlir::Attribute", "getEncoding", (ins)>, 12 | InterfaceMethod<"Returns element type", 13 | "mlir::Type", "getElementType", (ins)>, 14 | InterfaceMethod<"Returns the type shape", 15 | "llvm::ArrayRef", "getShape", (ins)>, 16 | InterfaceMethod<"Returns the tensor or buffer rank", 17 | "int64_t", "getRank", (ins)>, 18 | InterfaceMethod<"Returns the element type bit width", 19 | "int64_t", "getElementTypeBitWidth", (ins)>, 20 | ]; 21 | } 22 | 23 | #endif // TRITON_GPU_TYPE_INTERFACES 24 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/IR/Types.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_IR_TYPES_H_ 2 | #define TRITONGPU_IR_TYPES_H_ 3 | 4 | #include "mlir/IR/BuiltinTypes.h" 5 | #include "mlir/IR/TypeSupport.h" 6 | #include "mlir/IR/Types.h" 7 | #include "triton/Dialect/TritonGPU/IR/Attributes.h" 8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonGPU/IR/Types.h.inc" 11 | 12 | #include "triton/Dialect/TritonGPU/IR/TypeInterfaces.h.inc" 13 | 14 | #endif // TRITONGPU_IR_TYPES_H_ 15 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU) 3 | add_public_tablegen_target(TritonGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.h: -------------------------------------------------------------------------------- 1 | #include
"mlir/IR/PatternMatch.h" 2 | 3 | namespace mlir::triton::gpu { 4 | 5 | void populateDecomposeScaledBlockedPatterns(mlir::RewritePatternSet &patterns, 6 | int benefit); 7 | 8 | } // namespace mlir::triton::gpu 9 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 2 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ 3 | 4 | #include "mlir/Pass/Pass.h" 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" 6 | 7 | namespace mlir { 8 | namespace triton { 9 | namespace gpu { 10 | 11 | // Generate the pass class declarations. 12 | #define GEN_PASS_DECL 13 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 14 | 15 | /// Generate the code for registering passes. 16 | #define GEN_PASS_REGISTRATION 17 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 18 | 19 | } // namespace gpu 20 | } // namespace triton 21 | } // namespace mlir 22 | #endif 23 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect. 
4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs, bool enableSourceRemat); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_ 2 | #define TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_ 3 | 4 | #include "mlir/Support/LogicalResult.h" 5 | 6 | namespace mlir { 7 | namespace scf { 8 | class ForOp; 9 | } // namespace scf 10 | namespace triton::gpu { 11 | // Identify load-mma dependencies and specialize them to different partitions. 12 | LogicalResult specializeLoadMMADependencies(scf::ForOp &loop, 13 | int defaultNumStages); 14 | // This is the final step to prepare a loop for warp specialization. 
This takes 15 | // a loop with a partition schedule and rewrites the loop such that all SSA 16 | // dependencies between partitions are passed through shared memory and 17 | // multibuffers them according to partition stages. 18 | LogicalResult rewritePartitionDependencies(scf::ForOp &loop); 19 | // Given a loop where the partitions' inputs and outputs have been fully 20 | // rewritten to be reference semantic, partition the loop into a 21 | // `ttg.warp_specialize` by duplicating the loop for each partition and 22 | // rematerializing, as necessary, operations in the root partition. 23 | LogicalResult partitionLoop(scf::ForOp loop); 24 | } // namespace triton::gpu 25 | } // namespace mlir 26 | 27 | #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_ 28 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttng) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttng) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc) 9 | add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc) 10 | add_public_tablegen_target(TritonNvidiaGPUTableGen) 11 | 12 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td) 13 | mlir_tablegen(TritonNvidiaGPUAttrDefs.h.inc -gen-attrdef-decls) 14 |
mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs) 15 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 16 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 17 | add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen) 18 | 19 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOpInterfaces.td) 20 | mlir_tablegen(TritonNvidiaGPUOpInterfaces.h.inc -gen-op-interface-decls) 21 | mlir_tablegen(TritonNvidiaGPUOpInterfaces.cpp.inc -gen-op-interface-defs) 22 | add_public_tablegen_target(TritonNvidiaGPUOpInterfacesIncGen) 23 | -------------------------------------------------------------------------------- /include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonNvidiaGPU) 3 | add_public_tablegen_target(TritonNvidiaGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name LLVMIR) 3 | add_public_tablegen_target(LLVMIRIncGen) 4 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVM_IR_PASSES_H 2 | #define TRITON_TARGET_LLVM_IR_PASSES_H 3 | 4 | #include "mlir/Pass/Pass.h" 5 | 6 | namespace mlir { 7 | 8 | /// Create a pass to add DIScope 9 | std::unique_ptr createLLVMDIScopePass(); 10 | 11 | /// Generate the 
code for registering conversion passes. 12 | #define GEN_PASS_REGISTRATION 13 | #include "triton/Target/LLVMIR/Passes.h.inc" 14 | 15 | } // namespace mlir 16 | 17 | #endif // TRITON_TARGET_LLVM_IR_PASSES_H 18 | -------------------------------------------------------------------------------- /include/triton/Target/LLVMIR/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_TARGET_LLVMIR_PASSES 2 | #define TRITON_TARGET_LLVMIR_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def LLVMDIScope: Pass<"enable-line-info", "mlir::ModuleOp"> { 7 | let summary = "Materialize LLVM line info"; 8 | let description = [{ 9 | This pass materializes line mapping information for LLVM IR dialect operations. 10 | }]; 11 | 12 | let constructor = "mlir::createLLVMDIScopePass()"; 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAnalysis 2 | AxisInfo.cpp 3 | Allocation.cpp 4 | Membar.cpp 5 | Alias.cpp 6 | Utility.cpp 7 | 8 | DEPENDS 9 | TritonTableGen 10 | TritonGPUTableGen 11 | TritonGPUAttrDefsIncGen 12 | TritonGPUTypeInterfacesIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRAnalysis 16 | MLIRLLVMDialect 17 | TritonIR 18 | TritonGPUIR 19 | TritonNvidiaGPUIR 20 | ) 21 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Dialect) 4 | add_subdirectory(Target) 5 | add_subdirectory(Tools) 6 | add_subdirectory(Instrumentation) 7 | -------------------------------------------------------------------------------- /lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
add_subdirectory(TritonToTritonGPU) 2 | add_subdirectory(TritonGPUToLLVM) 3 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUToLLVM 2 | DotOpToLLVM/FMA.cpp 3 | DotOpToLLVM/FMADotUtility.cpp 4 | AllocateSharedMemory.cpp 5 | AllocateWarpGroups.cpp 6 | AssertOpToLLVM.cpp 7 | ControlFlowOpToLLVM.cpp 8 | ConvertLayoutOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | FuncOpToLLVM.cpp 11 | GatherOpToLLVM.cpp 12 | GlobalScratchMemoryAllocation.cpp 13 | HistogramOpToLLVM.cpp 14 | MakeRangeOpToLLVM.cpp 15 | MemoryOpToLLVM.cpp 16 | PrintOpToLLVM.cpp 17 | ReduceOpToLLVM.cpp 18 | ScanOpToLLVM.cpp 19 | SPMDOpToLLVM.cpp 20 | TypeConverter.cpp 21 | Utility.cpp 22 | ViewOpToLLVM.cpp 23 | 24 | DEPENDS 25 | TritonGPUConversionPassIncGen 26 | 27 | LINK_LIBS PUBLIC 28 | MLIRIR 29 | MLIRPass 30 | MLIRGPUDialect 31 | MLIRGPUToNVVMTransforms 32 | MLIRGPUToROCDLTransforms 33 | MLIRGPUTransforms 34 | TritonAnalysis 35 | TritonIR 36 | TritonGPUIR 37 | TritonGPUTransforms 38 | TritonNvidiaGPUTransforms 39 | ) 40 | -------------------------------------------------------------------------------- /lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMA.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" 3 | 4 | using namespace mlir; 5 | using namespace mlir::triton; 6 | using namespace ::mlir::triton::gpu; 7 | 8 | namespace { 9 | class GenericFMAVectorMultiplier : public FMAVectorMultiplier { 10 | OpBuilder &builder; 11 | Location loc; 12 | 13 | public: 14 | GenericFMAVectorMultiplier(OpBuilder &builder, Location loc) 15 | : builder(builder), loc(loc) {} 16 | 17 | Value multiplyVectors(ArrayRef a, ArrayRef b, 18 | Value c) override { 19 | auto K = a.size(); 20 | 
assert(b.size() == K); 21 | Value accum = c; 22 | for (auto [aElem, bElem] : llvm::zip(a, b)) 23 | accum = builder.create(loc, aElem, bElem, accum); 24 | return accum; 25 | } 26 | }; 27 | 28 | } // namespace 29 | 30 | // Lowers `op` by delegating to parametricConvertFMADot with a GenericFMAVectorMultiplier built on `rewriter` at the op's location. 31 | LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor, 32 | const LLVMTypeConverter *typeConverter, 33 | ConversionPatternRewriter &rewriter) { 34 | auto loc = op.getLoc(); 35 | GenericFMAVectorMultiplier multiplier(rewriter, loc); 36 | return parametricConvertFMADot(op, adaptor, typeConverter, rewriter, 37 | multiplier); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Conversion/TritonToTritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonGPU 2 | TritonGPUConversion.cpp 3 | TritonToTritonGPUPass.cpp 4 | 5 | DEPENDS 6 | TritonConversionToGPUPassIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRPass 11 | MLIRTransforms 12 | TritonIR 13 | ProtonIR 14 | TritonGPUIR 15 | ) 16 | -------------------------------------------------------------------------------- /lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Triton) 2 | add_subdirectory(TritonCPU) 3 | add_subdirectory(TritonGPU) 4 | add_subdirectory(TritonNvidiaGPU) 5 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Canonicalize.td) 2 | mlir_tablegen(TritonCanonicalize.inc -gen-rewriters) 3 |
add_public_tablegen_target(TritonCanonicalizeIncGen) 4 | 5 | add_triton_library(TritonIR 6 | Dialect.cpp 7 | Ops.cpp 8 | Traits.cpp 9 | Types.cpp 10 | OpInterfaces.cpp 11 | 12 | DEPENDS 13 | TritonTableGen 14 | TritonCanonicalizeIncGen 15 | 16 | LINK_LIBS PUBLIC 17 | MLIRIR 18 | MLIRArithDialect 19 | MLIRMathDialect 20 | MLIRSCFDialect 21 | ) 22 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/IR/Canonicalize.td: -------------------------------------------------------------------------------- 1 | #ifndef TT_PATTERNS 2 | #define TT_PATTERNS 3 | 4 | include "mlir/IR/PatternBase.td" 5 | include "triton/Dialect/Triton/IR/TritonOps.td" 6 | 7 | // broadcast(splat(x)) -> splat(x) 8 | def BroadcastSplatPattern : 9 | Pat<(TT_BroadcastOp (TT_SplatOp $x)), 10 | (TT_SplatOp $x)>; 11 | 12 | // broadcast(broadcast(x)) -> broadcast(x) 13 | def BroadcastBroadcastPattern : 14 | Pat<(TT_BroadcastOp (TT_BroadcastOp $x)), 15 | (TT_BroadcastOp $x)>; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /lib/Dialect/Triton/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Combine.td) 2 | mlir_tablegen(TritonCombine.inc -gen-rewriters) 3 | add_public_tablegen_target(TritonCombineIncGen) 4 | 5 | add_triton_library(TritonTransforms 6 | Combine.cpp 7 | LoopInvariantCodeMotion.cpp 8 | LoopUnroll.cpp 9 | ReorderBroadcast.cpp 10 | RewriteTensorPointer.cpp 11 | 12 | DEPENDS 13 | TritonTransformsIncGen 14 | TritonCombineIncGen 15 | 16 | LINK_LIBS PUBLIC 17 | MLIRPass 18 | MLIRTransformUtils 19 | TritonIR 20 | ) 21 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | 
-------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUIR 2 | Dialect.cpp 3 | Ops.cpp 4 | Types.cpp 5 | 6 | DEPENDS 7 | TritonCPUTableGen 8 | TritonCPUAttrDefsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | TritonIR 12 | ) 13 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Ops.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/IR/Builders.h" 2 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 3 | 4 | #define GET_OP_CLASSES 5 | #include "triton/Dialect/TritonCPU/IR/Ops.cpp.inc" 6 | 7 | // enum attribute definitions 8 | #include "triton/Dialect/TritonCPU/IR/OpsEnums.cpp.inc" 9 | 10 | namespace mlir::triton::cpu { 11 | 12 | LogicalResult PrintOp::verify() { 13 | if (getOperands().size() > 1) 14 | return emitOpError("expects at most one operand"); 15 | return success(); 16 | } 17 | 18 | void ExternElementwiseOp::getEffects( 19 | SmallVectorImpl> 20 | &effects) { 21 | if (getPure()) 22 | return; 23 | effects.emplace_back(MemoryEffects::Write::get(), 24 | SideEffects::DefaultResource::get()); 25 | effects.emplace_back(MemoryEffects::Read::get(), 26 | SideEffects::DefaultResource::get()); 27 | } 28 | 29 | LogicalResult 30 | DotOp::inferReturnTypes(MLIRContext *context, std::optional location, 31 | ValueRange operands, DictionaryAttr attributes, 32 | OpaqueProperties properties, RegionRange regions, 33 | SmallVectorImpl &inferredReturnTypes) { 34 | // type is the same as the accumulator 35 | auto accTy = cast(operands[2].getType()); 36 | inferredReturnTypes.push_back(accTy); 37 | return success(); 38 | } 39 | 40 | } // namespace mlir::triton::cpu 41 | -------------------------------------------------------------------------------- /lib/Dialect/TritonCPU/IR/Types.cpp: 
-------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonCPU/IR/Types.h" 2 | #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` 3 | #include "triton/Dialect/TritonCPU/IR/Dialect.h" 4 | #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` 5 | 6 | using namespace mlir; 7 | using namespace mlir::triton::cpu; 8 | 9 | #define GET_TYPEDEF_CLASSES 10 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 11 | 12 | Type TokenType::parse(AsmParser &parser) { 13 | if (parser.parseLess()) 14 | return Type(); 15 | 16 | int type = 1; 17 | if (parser.parseInteger(type)) 18 | return Type(); 19 | 20 | if (parser.parseGreater()) 21 | return Type(); 22 | 23 | return TokenType::get(parser.getContext(), type); 24 | } 25 | 26 | void TokenType::print(AsmPrinter &printer) const { 27 | printer << "<" << getType() << ">"; 28 | } 29 | 30 | //===----------------------------------------------------------------------===// 31 | // TritonCPU Dialect 32 | //===----------------------------------------------------------------------===// 33 | void ::mlir::triton::cpu::TritonCPUDialect::registerTypes() { 34 | addTypes< 35 | #define GET_TYPEDEF_LIST 36 | #include "triton/Dialect/TritonCPU/IR/Types.cpp.inc" 37 | >(); 38 | } 39 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonGPUIR 2 | Dialect.cpp 3 | LinearLayoutConversions.cpp 4 | Ops.cpp 5 | Types.cpp 6 | 7 | DEPENDS 8 | TritonGPUTableGen 9 | TritonGPUAttrDefsIncGen 10 | TritonGPUTypeInterfacesIncGen 11 | 12 | LINK_LIBS
PUBLIC 13 | MLIRGPUDialect 14 | TritonIR 15 | TritonTools 16 | ) 17 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/TestPipelineAssignLatencies.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 2 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h" 3 | #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" 4 | #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" 5 | #include "triton/Dialect/TritonGPU/Transforms/Utility.h" 6 | 7 | using namespace mlir; 8 | namespace tt = mlir::triton; 9 | namespace ttg = mlir::triton::gpu; 10 | 11 | namespace mlir { 12 | namespace triton { 13 | namespace gpu { 14 | 15 | #define GEN_PASS_DEF_TRITONGPUTESTPIPELINEASSIGNLATENCIES 16 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 17 | 18 | struct TestPipelineAssignLatencies 19 | : public impl::TritonGPUTestPipelineAssignLatenciesBase< 20 | TestPipelineAssignLatencies> { 21 | using impl::TritonGPUTestPipelineAssignLatenciesBase< 22 | TestPipelineAssignLatencies>::TritonGPUTestPipelineAssignLatenciesBase; 23 | 24 | void runOnOperation() override { assignLatencies(getOperation(), numStages); } 25 | }; 26 | 27 | } // namespace gpu 28 | } // namespace triton 29 | } // namespace mlir 30 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/TestPipelineLowerLoop.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 2 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h" 3 | #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" 4 | #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" 5 | #include "triton/Dialect/TritonGPU/Transforms/Utility.h" 6 | 7 | using namespace mlir; 8 | namespace tt = mlir::triton; 9 | 
namespace ttg = mlir::triton::gpu; 10 | 11 | namespace mlir { 12 | namespace triton { 13 | namespace gpu { 14 | 15 | #define GEN_PASS_DEF_TRITONGPUTESTPIPELINELOWERLOOP 16 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 17 | 18 | struct TestPipelineLowerLoop 19 | : public impl::TritonGPUTestPipelineLowerLoopBase { 20 | using impl::TritonGPUTestPipelineLowerLoopBase< 21 | TestPipelineLowerLoop>::TritonGPUTestPipelineLowerLoopBase; 22 | 23 | void runOnOperation() override { 24 | ModuleOp m = getOperation(); 25 | 26 | lowerLoops(m); 27 | } 28 | }; 29 | 30 | } // namespace gpu 31 | } // namespace triton 32 | } // namespace mlir 33 | -------------------------------------------------------------------------------- /lib/Dialect/TritonGPU/Transforms/Pipeliner/TestPipelineScheduleLoop.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 2 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h" 3 | #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" 4 | #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" 5 | #include "triton/Dialect/TritonGPU/Transforms/Utility.h" 6 | 7 | using namespace mlir; 8 | namespace tt = mlir::triton; 9 | namespace ttg = mlir::triton::gpu; 10 | 11 | namespace mlir { 12 | namespace triton { 13 | namespace gpu { 14 | 15 | #define GEN_PASS_DEF_TRITONGPUTESTPIPELINESCHEDULELOOP 16 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" 17 | 18 | static const char *kLatencyAttrName = "tt.latency"; 19 | 20 | struct TestPipelineScheduleLoop 21 | : public impl::TritonGPUTestPipelineScheduleLoopBase< 22 | TestPipelineScheduleLoop> { 23 | using impl::TritonGPUTestPipelineScheduleLoopBase< 24 | TestPipelineScheduleLoop>::TritonGPUTestPipelineScheduleLoopBase; 25 | 26 | void runOnOperation() override { scheduleLoops(getOperation()); } 27 | }; 28 | 29 | } // namespace gpu 30 | } // namespace triton 31 | } // namespace mlir 32 | 
-------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUIR 2 | Dialect.cpp 3 | Ops.cpp 4 | 5 | DEPENDS 6 | TritonNvidiaGPUTableGen 7 | TritonNvidiaGPUAttrDefsIncGen 8 | TritonNvidiaGPUOpInterfacesIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | TritonIR 12 | TritonGPUIR 13 | ) 14 | -------------------------------------------------------------------------------- /lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNvidiaGPUTransforms 2 | FenceInsertion.cpp 3 | KeepAccInTMem.cpp 4 | MMALowering.cpp 5 | OptimizeDescriptorEncoding.cpp 6 | PlanCTA.cpp 7 | PromoteLHSToTMem.cpp 8 | TensorMemoryAllocation.cpp 9 | TMALowering.cpp 10 | 11 | DEPENDS 12 | TritonNvidiaGPUTransformsIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | TritonIR 16 | TritonGPUIR 17 | TritonGPUTransforms 18 | TritonNvidiaGPUIR 19 | MLIRTransformUtils 20 | ) 21 | -------------------------------------------------------------------------------- /lib/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(LLVMIR) 2 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonLLVMIR 2 | LLVMDIScope.cpp 3 | LLVMIRBreakPhiStruct.cpp 4 | 5 | DEPENDS 6 | LLVMIRIncGen 7 | 8 | LINK_LIBS 9 | ${CMAKE_DL_LIBS} 10 | PUBLIC 11 | MLIRArithToLLVM 
12 | MLIRBuiltinToLLVMIRTranslation 13 | MLIRIndexToLLVM 14 | MLIRIR 15 | MLIRLLVMDialect 16 | MLIRLLVMToLLVMIRTranslation 17 | MLIRNVVMToLLVMIRTranslation 18 | MLIRROCDLToLLVMIRTranslation 19 | MLIRSCFToControlFlow 20 | MLIRSupport 21 | MLIRTargetLLVMIRExport 22 | TritonGPUToLLVM 23 | ) 24 | 25 | set_source_files_properties( 26 | LLVMIRTranslation.cpp 27 | PROPERTIES 28 | COMPILE_FLAGS "-D__BUILD_DIR__=\\\"${CMAKE_BINARY_DIR}\\\"") 29 | -------------------------------------------------------------------------------- /lib/Target/LLVMIR/LLVMPasses.h: -------------------------------------------------------------------------------- 1 | #include "llvm/IR/PassManager.h" 2 | #include "llvm/Pass.h" 3 | #include "llvm/Support/CodeGen.h" 4 | 5 | namespace llvm { 6 | 7 | // Pass to pre-process LLVM IR before optimization and break up phi of struct. 8 | // Breaking up those phis into elementary types allows better optimizations 9 | // downstream. 10 | struct BreakStructPhiNodesPass : PassInfoMixin { 11 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); 12 | 13 | static StringRef name() { return "BreakStructPhiNodesPass"; } 14 | }; 15 | 16 | } // namespace llvm 17 | -------------------------------------------------------------------------------- /lib/Tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonTools 2 | LayoutUtils.cpp 3 | LinearLayout.cpp 4 | 5 | DEPENDS 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRLLVMDialect 10 | f2reduce 11 | ) 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"] 3 | 4 | [tool.yapf] 5 | based_on_style = "pep8" 6 | column_limit = 120 7 | disable_split_list_with_comment = true 8 | each_dict_entry_on_separate_line=false 9 | 
import os
import sysconfig
import sys
from pathlib import Path


def get_base_dir():
    """Return the absolute path of the parent of the directory holding this file."""
    this_dir = os.path.dirname(__file__)
    parent_dir = os.path.join(this_dir, os.pardir)
    return os.path.abspath(parent_dir)


def get_cmake_dir():
    """Return the CMake build directory as a ``Path``, creating it if missing.

    The directory is ``<base>/python/build/cmake.<platform>-<implementation>-<pyversion>``,
    where ``<base>`` comes from :func:`get_base_dir`.
    """
    tag = "cmake.{}-{}-{}".format(
        sysconfig.get_platform(),
        sys.implementation.name,
        sysconfig.get_python_version(),
    )
    build_dir = Path(get_base_dir(), "python", "build", tag)
    # Missing parents are created; an already existing directory is not an error.
    build_dir.mkdir(parents=True, exist_ok=True)
    return build_dir
6 | [tool.autopep8] 7 | aggressive = 1 8 | ignore = "E501,E701,E731,W690,W503" 9 | max_line_length = 88 10 | 11 | [tool.ruff] 12 | line-length = 120 13 | 14 | [tool.ruff.lint] 15 | ignore = ["E501", "E701", "E731", "E741"] 16 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | ninja 2 | cmake 3 | setuptools>=40.8.0 4 | wheel 5 | cmake>=3.18,<4.0 6 | ninja>=1.11.1 7 | pybind11>=2.13.1 8 | lit 9 | -------------------------------------------------------------------------------- /python/test-requirements.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | isort 3 | numpy 4 | pytest 5 | pytest-forked 6 | pytest-xdist 7 | scipy>=1.7.1 8 | llnl-hatchet 9 | -------------------------------------------------------------------------------- /python/test/kernel_comparison/kernels.yml: -------------------------------------------------------------------------------- 1 | name_and_extension: 2 | - name: _kernel_0d1d2d3de4de5de6c7de8de9c10de11c 3 | extension: ptx 4 | - name: _kernel_0d1d2d3de4de5de6de7c8de9c10de11c 5 | extension: ptx 6 | - name: _kernel_0d1d2d345de6c789c1011c 7 | extension: ptx 8 | - name: _kernel_0d1d2d3456c789c1011c 9 | extension: ptx 10 | - name: _kernel_0d1d2d3de4de5de6c7de8c9de10de11c 11 | extension: ptx 12 | - name: _kernel_0d1d2d34567c8c91011c 13 | extension: ptx 14 | - name: _kernel_0d1d2d3456c78c91011c 15 | extension: ptx 16 | - name: _kernel_0d1d2d3de4de5de6de7c8c9de10de11c 17 | extension: ptx 18 | - name: _kernel_0d1d2d34567c89c1011c 19 | extension: ptx 20 | - name: _kernel_0d1d2d345de6de7c89c1011c 21 | extension: ptx 22 | - name: _kernel_0d1d2d345de6de7c8c9de1011c 23 | extension: ptx 24 | - name: kernel_0d1d2de 25 | extension: ptx 26 | - name: _kernel_0d1d2d345de6c78c9de1011c 27 | extension: ptx 28 | - name: 
@pytest.fixture
def fresh_triton_cache():
    """Point ``TRITON_CACHE_DIR`` at a fresh temporary directory for one test.

    Yields the temporary directory path. On teardown the environment variable
    is put back to whatever it was before the fixture ran (removed if it was
    not set), and the temporary directory is deleted.
    """
    # Remember any pre-existing setting so that teardown does not clobber it.
    previous = os.environ.get("TRITON_CACHE_DIR")
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            os.environ["TRITON_CACHE_DIR"] = tmpdir
            yield tmpdir
        finally:
            if previous is None:
                os.environ.pop("TRITON_CACHE_DIR", None)
            else:
                os.environ["TRITON_CACHE_DIR"] = previous
def test_op(capfd, device: str):
    # Launches the three kernels through func() on two random tensors and then
    # reads the output captured by the `capfd` fixture.
    size = 98432
    x = torch.rand(size, device=device)
    y = torch.rand(size, device=device)
    func(x, y)
    stdout, stderr = capfd.readouterr()
    # The comparison only happens when LLVM_PASS_PLUGIN_PATH is set in the
    # environment. Note that, despite its name, `test_stdout` is compared
    # against the captured *stderr*; captured stdout is not checked.
    if 'LLVM_PASS_PLUGIN_PATH' in os.environ:
        assert repr(stderr) == repr(test_stdout)
def test_triton_reproducer_path(monkeypatch, tmp_path):
    # If we get a cache hit, no reproducer will be generated, so force a
    # compile.
    monkeypatch.setenv("TRITON_ALWAYS_COMPILE", "1")

    @triton.jit
    def triton_():
        return

    # We need an empty temporary file for MLIR to write the reproducer to;
    # setting the TRITON_REPRODUCER_PATH env var then enables crash
    # reproducer generation in MLIR.
    repro_path = tmp_path / "repro.mlir"
    repro_path.touch()
    monkeypatch.setenv("TRITON_REPRODUCER_PATH", str(repro_path))

    # Run the kernel so MLIR will generate a crash reproducer. It doesn't really
    # matter what the kernel does, just that the PassManager runs its passes.
    triton_[(1, )]()

    # The file must contain the reproducer marker and a non-empty quoted
    # pass pipeline string.
    repro = repro_path.read_text()
    assert "mlir_reproducer" in repro, f"Expected MLIR reproducer in {repro_path}. Got:\n{repro}"
    m = re.search(r"pipeline: \"(.*)\"", repro)
    assert m, "Expected to match pass pipeline after \"pipeline:\" in MLIR reproducer"
    pipeline_str = m.group(1)
    assert pipeline_str, "Expected non-empty pass pipeline in MLIR reproducer"
def get_iterable_path(iterable, path):
    """Index into ``iterable`` with each element of ``path`` in turn and return the result.

    An empty ``path`` returns ``iterable`` itself.
    """
    node = iterable
    for key in path:
        node = node[key]
    return node


def set_iterable_path(iterable, path, val):
    """Store ``val`` at the location in ``iterable`` addressed by ``path``.

    The container holding the final slot is found by following all but the
    last element of ``path``; the last element is then used as the index.
    """
    parent = get_iterable_path(iterable, path[:-1])
    parent[path[-1]] = val


def find_paths_if(iterable, pred):
    """Return the paths of all leaves of ``iterable`` for which ``pred(path, leaf)`` is truthy.

    Lists, tuples, ``core.tuple`` and ``core.tuple_type`` values are descended
    into; everything else is a leaf. Paths are tuples of indices, listed in
    traversal order. If ``iterable`` itself is a leaf, ``pred`` is called with
    an empty list as the path and the result is ``[()]`` or ``[]``.
    """
    from .language import core

    def _is_nested(obj):
        return isinstance(obj, (list, tuple, core.tuple, core.tuple_type))

    if not _is_nested(iterable):
        return [tuple()] if pred(list(), iterable) else []

    # A dict is used as an insertion-ordered set of paths.
    matches = {}

    def _walk(node, prefix):
        if _is_nested(node):
            for idx, child in enumerate(node):
                _walk(child, prefix + (idx, ))
        elif pred(prefix, node):
            matches[prefix] = None

    _walk(iterable, ())
    return list(matches)
import os


def front_end_debugging():
    """Return True only when the env var TRITON_FRONT_END_DEBUGGING is exactly "1".

    An unset variable is treated as "0".
    """
    setting = os.environ.get("TRITON_FRONT_END_DEBUGGING", "0")
    return setting == "1"
from typing import List

# Upper bound on the number of elements accepted by validate_block_shape.
TRITON_MAX_TENSOR_NUMEL = 1048576


def is_power_of_two(x):
    """Return True iff ``x`` is a positive integer power of two (1, 2, 4, ...).

    The bit trick ``x & (x - 1) == 0`` alone also holds for 0, which is not a
    power of two, so non-positive values are rejected explicitly.
    """
    return x > 0 and (x & (x - 1)) == 0


def validate_block_shape(shape: List[int]):
    """Check every dimension of ``shape`` and return the total element count.

    Raises:
        TypeError: if a dimension is not an ``int``.
        ValueError: if a dimension is not a power of two, or if the product of
            all dimensions exceeds ``TRITON_MAX_TENSOR_NUMEL``.

    An empty ``shape`` yields 1.
    """
    numel = 1
    for i, d in enumerate(shape):
        if not isinstance(d, int):
            raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d)}]`")
        if not is_power_of_two(d):
            raise ValueError(f"Shape element {i} must be a power of 2")
        numel *= d

    if numel > TRITON_MAX_TENSOR_NUMEL:
        raise ValueError(f"numel ({numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})")
    return numel
from typing import Optional, Protocol


class Buffer(Protocol):
    """Structural type for objects that expose a ``data_ptr()`` returning an int."""

    def data_ptr(self) -> int:
        ...


class Allocator(Protocol):
    """Structural type for callables taking (size, alignment, stream) and returning a Buffer."""

    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
        ...


class NullAllocator:
    """Placeholder allocator installed by default; every call raises RuntimeError."""

    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
        message = ("Kernel requires a runtime memory allocation, but no allocator was set. "
                   "Use triton.set_allocator to specify an allocator.")
        raise RuntimeError(message)


# Module-wide current allocator; replaced through set_allocator().
_allocator: Allocator = NullAllocator()


def set_allocator(allocator: Allocator):
    """
    Install ``allocator`` as the module-wide allocator.

    The allocator function is called during kernel launch for kernels that
    require additional global memory workspace.
    """
    global _allocator
    _allocator = allocator
def create_1d_tma_descriptor(ptr, dim, block_dim, element_size):
    """Build a TmaDescKernelParam for a one-dimensional layout.

    ``dim`` and ``block_dim`` are each wrapped in a single-element list.
    """
    dims = [dim]
    block_dims = [block_dim]
    return TmaDescKernelParam(ptr, dims, block_dims, element_size)


def create_2d_tma_descriptor(ptr, dim1, dim0, block_dim1, block_dim0, element_size):
    """Build a TmaDescKernelParam for a two-dimensional layout.

    Dimensions are passed on in the order ``[dim1, dim0]`` and
    ``[block_dim1, block_dim0]``.
    """
    dims = [dim1, dim0]
    block_dims = [block_dim1, block_dim0]
    return TmaDescKernelParam(ptr, dims, block_dims, element_size)
add_subdirectory(lib)

llvm_canonicalize_cmake_booleans(
  MLIR_ENABLE_BINDINGS_PYTHON
)

# Generate lit.site.cfg.py in the build tree from its template. MAIN_CONFIG
# must name the lit.cfg.py in this source directory; CMake variable names are
# case-sensitive, so the variable has to be spelled CMAKE_CURRENT_SOURCE_DIR.
configure_lit_site_cfg(
  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
  MAIN_CONFIG
  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
)

# Tools that must be built before the lit tests run.
set(TRITON_TEST_DEPENDS
  triton-opt
  triton-tensor-layout
  triton-llvm-opt
)

set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck")
set(LIT_ARGS "-Dfilecheck=${FILECHECK_PATH}")

add_lit_testsuite(check-triton-lit-tests "Running the triton regression tests"
  ${CMAKE_CURRENT_BINARY_DIR}
  ARGS ${LIT_ARGS}
  DEPENDS ${TRITON_TEST_DEPENDS}
  )

set_target_properties(check-triton-lit-tests PROPERTIES FOLDER "Tests")

add_lit_testsuites(TRITON-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS})
11 | tt.func @gather_op(%arg0: tensor<1024x256xi32, #blocked>, %arg1: tensor<128x256xf32, #blocked>) { 12 | // CHECK-NEXT: allocation.offset = 0 : i32 13 | %0 = tt.gather %arg1[%arg0] {axis = 0 : i32} : (tensor<128x256xf32, #blocked>, tensor<1024x256xi32, #blocked>) -> tensor<1024x256xf32, #blocked> 14 | tt.return 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /test/Conversion/amd/builtin_func_to_llvm.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=True" --convert-builtin-func-to-llvm="ftz=True" | FileCheck %s --check-prefix=LLVM_FTZ 2 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=False" --convert-builtin-func-to-llvm="ftz=False" | FileCheck %s --check-prefix=LLVM_NO_FTZ 3 | 4 | #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}> 5 | module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} { 6 | tt.func public @test_fast_expf(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} { 7 | // LLVM_FTZ: llvm.amdgcn.exp2.f32 8 | // LLVM_NO_FTZ: llvm.exp2.f32 9 | %0 = tt.extern_elementwise %arg0 {libname = "libdevice", libpath = "", pure = true, symbol = "__triton_hip_fast_expf"} : (tensor<64xf32, #blocked>) -> tensor<64xf32, #blocked> 10 | tt.return 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/Conversion/amd/tritongpu_to_llvm_rdna.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx1100 --convert-builtin-func-to-llvm | FileCheck %s 2 | 3 | #blocked3 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> 4 
| module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} { 5 | // CHECK-LABEL: reduce_dpp_max 6 | tt.func @reduce_dpp_max(%arg0: tensor<32xf32, #blocked3>) { 7 | // CHECK: rocdl.update.dpp 8 | // CHECK-SAME: with 280, 15, 15, true : f32 9 | // CHECK-NEXT: llvm.intr.maxnum 10 | 11 | // CHECK-NEXT: rocdl.update.dpp 12 | // CHECK-SAME: with 276, 15, 15, true : f32 13 | // CHECK-NEXT: llvm.intr.maxnum 14 | 15 | // CHECK-NEXT: rocdl.update.dpp 16 | // CHECK-SAME: with 274, 15, 15, true : f32 17 | // CHECK-NEXT: llvm.intr.maxnum 18 | 19 | // CHECK-NEXT: rocdl.update.dpp 20 | // CHECK-SAME: with 273, 15, 15, true : f32 21 | // CHECK-NEXT: llvm.intr.maxnum 22 | 23 | // CHECK: llvm.amdgcn.permlanex16 24 | // CHECK: llvm.intr.maxnum 25 | // CHECK: rocdl.readlane 26 | %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({ 27 | ^bb0(%arg1: f32, %arg2: f32): 28 | %1 = arith.maxnumf %arg1, %arg2 : f32 29 | tt.reduce.return %1 : f32 30 | }) : (tensor<32xf32, #blocked3>) -> f32 31 | tt.return 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /test/Conversion/divide-by-0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --allocate-shared-memory --convert-triton-gpu-to-llvm --cse | FileCheck %s 2 | 3 | // CHECK-LABEL: dont_divide_0 4 | // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32 5 | // CHECK-NOT: llvm.urem %{{.*}}, %[[C0]] 6 | #blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> 7 | #mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 8]}> 8 | module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} { 9 | tt.func 
public @dont_divide_0() attributes {noinline = false} { 10 | %zero = arith.constant dense<0.000000e+00> : tensor<16x1xf32, #mma> 11 | %cvt = ttg.convert_layout %zero : tensor<16x1xf32, #mma> -> tensor<16x1xf32, #blocked> 12 | tt.return 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/Conversion/tritongpu_to_llvm_volta.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --convert-triton-gpu-to-llvm=compute-capability=70 2>&1 | FileCheck %s 2 | 3 | #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> 4 | // CHECK-LABEL: clamp 5 | module attributes {"ttg.target" = "cuda:70", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} { 6 | tt.func public @clamp(%x : tensor<1024xf32, #blocked>, %limit : tensor<1024xf32, #blocked>) attributes {noinline = false} { 7 | %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> 8 | %neg_limit = arith.subf %cst, %limit : tensor<1024xf32, #blocked> 9 | 10 | // CHECK: llvm.fcmp "une" %[[REG:[a-zA-Z0-9]+]], %[[REG]] 11 | // CHECK-NEXT: llvm.intr.maxnum 12 | // CHECK-NEXT: llvm.intr.minnum 13 | // CHECK-NEXT: llvm.mlir.constant 14 | // CHECK-NEXT: llvm.select 15 | %12 = tt.clampf %x, %neg_limit, %limit, propagateNan = all : tensor<1024xf32, #blocked> 16 | tt.return 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /test/LLVMIR/break-phi-struct.ll: -------------------------------------------------------------------------------- 1 | ; RUN: triton-llvm-opt -break-struct-phi-nodes %s | FileCheck %s 2 | 3 | ; CHECK-LABEL: struct 4 | define {i32, i32} @struct(i1 %c) { 5 | ; CHECK: br i1 %{{.*}}, label [[TRUE:%.*]], label [[FALSE:%.*]] 6 | br i1 %c, label %true, label %false 7 | 8 | true: 9 | %s.1 = insertvalue {i32, i32} undef, 
i32 20, 0 10 | %s.2 = insertvalue {i32, i32} %s.1, i32 200, 1 11 | 12 | ; CHECK-DAG: [[E0:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 13 | ; CHECK-DAG: [[E1:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 14 | ; CHECK: br 15 | br label %exit 16 | 17 | false: 18 | %s.3 = insertvalue {i32, i32} undef, i32 30, 0 19 | %s.4 = insertvalue {i32, i32} %s.3, i32 300, 1 20 | ; CHECK-DAG: [[E2:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0 21 | ; CHECK-DAG: [[E3:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1 22 | ; CHECK: br 23 | br label %exit 24 | 25 | exit: 26 | ; CHECK-DAG: [[PHI0:%.*]] = phi i32 [ [[E0]], [[TRUE]] ], [ [[E2]], [[FALSE]] ] 27 | ; CHECK-DAG: [[PHI1:%.*]] = phi i32 [ [[E1]], [[TRUE]] ], [ [[E3]], [[FALSE]] ] 28 | ; CHECK: [[S0:%.*]] = insertvalue { i32, i32 } undef, i32 [[PHI0]], 0 29 | ; CHECK: [[S1:%.*]] = insertvalue { i32, i32 } [[S0]], i32 [[PHI1]], 1 30 | ; CHECK: ret { i32, i32 } [[S1]] 31 | %r = phi {i32, i32} [ %s.2, %true], [ %s.4, %false ] 32 | ret {i32, i32} %r 33 | } 34 | -------------------------------------------------------------------------------- /test/Proton/ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s -cse -canonicalize | FileCheck %s 2 | 3 | module { 4 | // CHECK-LABEL: proton_record 5 | tt.func @proton_record() { 6 | // CHECK: proton.record() {isStart = true, regionId = 1 : i32} 7 | // CHECK-NEXT: proton.record() {isStart = false, regionId = 1 : i32} 8 | // CHECK-NEXT: tt.return 9 | proton.record() {isStart = true, regionId = 1 : i32} 10 | proton.record() {isStart = false, regionId = 1 : i32} 11 | tt.return 12 | } 13 | } // end module 14 | 15 | // ----- 16 | -------------------------------------------------------------------------------- /test/Triton/reproducer.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --verify-diagnostics --dump-pass-pipeline --run-reproducer %s 2>&1 | FileCheck %s 2 
| 3 | module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} { 4 | tt.func public @triton__() attributes {noinline = false} { 5 | tt.return 6 | } 7 | } 8 | 9 | {-# 10 | external_resources: { 11 | mlir_reproducer: { 12 | pipeline: "builtin.module(any(convert-scf-to-cf,convert-index-to-llvm{index-bitwidth=0},convert-triton-gpu-to-llvm{compute-capability=90},convert-nv-gpu-to-llvm,convert-arith-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,symbol-dce,enable-line-info))", 13 | disable_threading: false, 14 | verify_each: false 15 | } 16 | } 17 | #-} 18 | 19 | // CHECK: Pass Manager with 20 | // CHECK-NEXT: convert-triton-gpu-to-llvm 21 | -------------------------------------------------------------------------------- /test/Triton/verify-make-range.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt --split-input-file %s --verify-diagnostics 2 | 3 | tt.func public @i64_tensor() { 4 | // expected-error @+1 {{i32 elements}} 5 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16xi64> 6 | tt.return 7 | } 8 | 9 | // ----- 10 | tt.func public @i32_scalar() { 11 | // expected-error @+1 {{invalid kind of type}} 12 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : i32 13 | tt.return 14 | } 15 | 16 | // ----- 17 | tt.func public @_2d_tensor() { 18 | // expected-error @+1 {{must be a 1D tensor}} 19 | %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16x1xi32> 20 | tt.return 21 | } 22 | 23 | // ----- 24 | tt.func public @bad_start_end() { 25 | // expected-error @+1 {{start must be less than or equal to end}} 26 | %a = tt.make_range { start = 0 : i32, end = -16 : i32 } : tensor<16xi32> 27 | tt.return 28 | } 29 | 30 | // ----- 31 | tt.func public @bad_num_elems() { 32 | // expected-error @+1 {{number of elements}} 33 
| %a = tt.make_range { start = 0 : i32, end = 32 : i32 } : tensor<16xi32> 34 | tt.return 35 | } 36 | -------------------------------------------------------------------------------- /test/TritonCPU/reduction.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -split-input-file -triton-cpu-convert-reduction -canonicalize 2 | 3 | // Regression test: Check that we handle consecutive calls to tt.reduce with 4 | // different types & number of arguments. 5 | 6 | module { 7 | tt.func public @triton_(%arg0: tensor<1x4xf32>, %arg1: tensor<1x4xi32>) { 8 | %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({ 9 | ^bb0(%arg3: f32, %arg4: f32): 10 | tt.reduce.return %arg3 : f32 11 | }) : (tensor<1x4xf32>) -> tensor<1xf32> 12 | %1:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ 13 | ^bb0(%arg3: f32, %arg4: i32, %arg5: f32, %arg6: i32): 14 | tt.reduce.return %arg3, %arg4 : f32, i32 15 | }) : (tensor<1x4xf32>, tensor<1x4xi32>) -> (tensor<1xf32>, tensor<1xi32>) 16 | tt.return 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /test/TritonGPU/amd/amd-conditional-barrier.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s 2 | 3 | module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} { 4 | tt.func @conditional_barrier() { 5 | // CHECK-LABEL: llvm.func @conditional_barrier 6 | 7 | // CHECK: %[[CMP0:.+]] = llvm.icmp "ne" %3, %1 : i32 8 | // CHECK: %[[CMP1:.+]] = llvm.icmp "eq" %3, %1 : i32 9 | // CHECK: llvm.cond_br %[[CMP0]], ^bb1, ^bb2 10 | // CHECK: ^bb1: 11 | // CHECK: rocdl.s.barrier 12 | // CHECK: llvm.br ^bb2 13 | // CHECK: ^bb2: 14 | // CHECK: llvm.add 15 | // CHECK: llvm.cond_br %[[CMP1]], ^bb3, ^bb4 16 | // CHECK: ^bb3: 17 | // CHECK: rocdl.s.barrier 18 | // CHECK: 
llvm.br ^bb4 19 | // CHECK: ^bb4: 20 | // CHECK: llvm.return 21 | 22 | %c256_i32 = arith.constant 256 : i32 23 | %c0_i32 = arith.constant 0 : i32 24 | %0 = rocdl.workitem.id.x : i32 25 | %1 = arith.divsi %0, %c256_i32 : i32 26 | %2 = arith.cmpi ne, %1, %c0_i32 : i32 27 | %3 = arith.cmpi eq, %1, %c0_i32 : i32 28 | amdgpu.cond_barrier %2 29 | %4 = arith.addi %0, %c256_i32 : i32 30 | amdgpu.cond_barrier %3 31 | tt.return 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /test/TritonGPU/amd/sink-setprio-mfma.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s 2 | 3 | // CHECK-LABEL: llvm.func @sink_setprio 4 | // CHECK: rocdl.mfma 5 | // CHECK-NOT: rocdl.mfma 6 | // CHECK: rocdl.s.setprio 1 7 | // CHECK-COUNT-15: rocdl.mfma 8 | // CHECK-NOT: rocdl.mfma 9 | // CHECK: rocdl.s.setprio 0 10 | 11 | #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}> 12 | #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}> 13 | #mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}> 14 | module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} { 15 | tt.func public @sink_setprio( 16 | %arg0: tensor<64x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, 17 | %arg1: tensor<128x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>) { 18 | %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma> 19 | rocdl.s.setprio 1 20 | %dot = tt.dot %arg0, %arg1, %cst_0 : 21 | tensor<64x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<128x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> 
tensor<64x64xf32, #mma> 22 | rocdl.s.setprio 0 23 | tt.return 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/TritonGPU/global_scratch_to_llvm.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -allow-unregistered-dialect --tritongpu-global-scratch-memory-allocation --convert-triton-gpu-to-llvm | FileCheck %s 2 | 3 | module attributes {"ttg.num-warps" = 4 : i32} { 4 | // CHECK-LABEL: @global_scratch_alloc_warpgroup(%arg0: !llvm.ptr<1>) 5 | tt.func @global_scratch_alloc_warpgroup() { 6 | // CHECK-NEXT: ttg.warp_specialize(%arg0) 7 | ttg.warp_specialize() 8 | default { 9 | ttg.warp_yield 10 | } 11 | // CHECK: partition0(%arg1: !llvm.ptr<1>) 12 | partition0() num_warps(1) { 13 | // CHECK-COUNT-2: llvm.getelementptr %arg1 14 | %0 = ttg.global_scratch_alloc {alignment = 8 : i32, nbytes = 100 : i32, ttg.global_scratch_memory_offset = 0 : i32} : !tt.ptr 15 | %1 = ttg.global_scratch_alloc {alignment = 8 : i32, nbytes = 100 : i32, ttg.global_scratch_memory_offset = 0 : i32} : !tt.ptr 16 | "use"(%0, %1) : (!tt.ptr, !tt.ptr) -> () 17 | ttg.warp_return 18 | } : () -> () 19 | tt.return 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test/TritonGPU/tf32x3-matmul.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -tritongpu-F32DotTC -canonicalize | FileCheck %s --check-prefixes=CHECK 2 | 3 | // CHECK: %[[DOT1:.*]] = tt.dot %[[LHS_LOW:.*]], %[[RHS_HIGH:.*]], %cst, inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32> 4 | // CHECK: %[[DOT2:.*]] = tt.dot %[[LHS_HIGH:.*]], %[[RHS_LOW:.*]], %[[DOT1]], inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32> 5 | // CHECK: %[[CMP:.*]] = arith.cmpf uno, %[[DOT2]], %[[DOT2]] : tensor<16x16xf32> 6 | // CHECK: %[[MASKED:.*]] = arith.select %[[CMP]], %cst, 
%[[DOT2]] : tensor<16x16xi1>, tensor<16x16xf32> 7 | // CHECK: %[[RESULT:.*]] = tt.dot %[[LHS_HIGH]], %[[RHS_HIGH]], %[[MASKED]], inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32> 8 | 9 | module { 10 | tt.func @dot_test(%arg0: tensor<16x16xf32>, %arg1: tensor<16x16xf32>, %arg2: tensor<16x16xf32>) -> tensor<16x16xf32> { 11 | %4 = tt.dot %arg0, %arg1, %arg2, inputPrecision = tf32x3 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32> 12 | tt.return %4 : tensor<16x16xf32> 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/TritonNvidiaGPU/canonicalize.mlir: -------------------------------------------------------------------------------- 1 | // RUN: triton-opt %s -canonicalize | FileCheck %s 2 | 3 | // CHECK-LABEL: @test_dce_tmem_alloc 4 | // CHECK-NOT: ttng.tmem_alloc 5 | // CHECK: tt.return 6 | #linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}> 7 | #tmem_scales = #ttng.tensor_memory_scales_encoding<> 8 | module attributes {"ttg.num-warps" = 8 : i32, "ttg.num-ctas" = 1 : i32, "ttg.target" = "cuda:80"} { 9 | tt.func @test_dce_tmem_alloc(%arg: tensor<128x4xi8, #linear>) { 10 | %a = ttng.tmem_alloc %arg : (tensor<128x4xi8, #linear>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory> 11 | tt.return 12 | } 13 | } // end module 14 | -------------------------------------------------------------------------------- /test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TritonTestAnalysis 2 | TestAlias.cpp 3 | TestAxisInfo.cpp 4 | TestAllocation.cpp 5 | TestMembar.cpp 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRPass 9 | ${triton_libs} 10 | ) 11 | -------------------------------------------------------------------------------- /test/lib/Analysis/TestMembar.cpp: 
-------------------------------------------------------------------------------- 1 | #include "../third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Utility.h" 2 | #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" 3 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" 4 | #include "mlir/IR/Dialect.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | #include "triton/Analysis/Allocation.h" 8 | #include "triton/Analysis/Membar.h" 9 | 10 | using namespace mlir; 11 | 12 | namespace { 13 | 14 | struct TestMembarPass 15 | : public PassWrapper<TestMembarPass, OperationPass<ModuleOp>> { 16 | 17 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass); 18 | 19 | StringRef getArgument() const final { return "test-print-membar"; } 20 | StringRef getDescription() const final { 21 | return "print the result of the allocation pass"; 22 | } 23 | 24 | void runOnOperation() override { 25 | Operation *operation = getOperation(); 26 | ModuleOp moduleOp = cast<ModuleOp>(operation); 27 | // Print all ops after membar pass 28 | ModuleAllocation allocation(moduleOp); 29 | ModuleMembarAnalysis membarPass(&allocation, 30 | mlir::triton::NVIDIA::canSkipBarSync); 31 | membarPass.run(); 32 | } 33 | }; 34 | 35 | } // namespace 36 | 37 | namespace mlir { 38 | namespace test { 39 | void registerTestMembarPass() { PassRegistration<TestMembarPass>(); } 40 | } // namespace test 41 | } // namespace mlir 42 | -------------------------------------------------------------------------------- /test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Instrumentation) 3 | -------------------------------------------------------------------------------- /test/lib/Instrumentation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(GPU_INSTRUMENTATION_PASSES 2 | GPUInstrumentationTestLib 3 | ) 4 | 5 | set(GPUInstrumentationTestLib_SOURCES 6 | GPUHello.cpp 7 | ) 8 | 9 | 10 |
foreach( plugin ${GPU_INSTRUMENTATION_PASSES} ) 11 | add_library( 12 | ${plugin} 13 | SHARED 14 | ${${plugin}_SOURCES} 15 | ) 16 | 17 | target_link_libraries( 18 | ${plugin} 19 | PRIVATE 20 | LLVMCore 21 | "$<$:-undefined dynamic_lookup>" 22 | ) 23 | # CMAKE_LIBRARY_OUTPUT_DIRECTORY is only set during the Python 24 | # build. It is empty if building directly from the root 25 | # CMakeLists.txt file. Therefore if not building from Python just 26 | # use the default CMake shared lib path otherwise this causes a hard 27 | # build error 28 | if(DEFINED CMAKE_LIBRARY_OUTPUT_DIRECTORY) 29 | set_target_properties(${plugin} PROPERTIES 30 | LIBRARY_OUTPUT_DIRECTORY 31 | "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../instrumentation") 32 | endif(DEFINED CMAKE_LIBRARY_OUTPUT_DIRECTORY) 33 | 34 | # This is set to -fvisibility=hidden in the top level CMake file 35 | # which causes the llvmGetPassPluginInfo symbol to be hidden and 36 | # an "entry point not found" error. Reset it just for this target 37 | if(NOT MSVC) 38 | target_compile_options(${plugin} PRIVATE -fvisibility=default) 39 | endif() 40 | endforeach() 41 | -------------------------------------------------------------------------------- /test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | @LIT_SITE_CFG_IN_HEADER@ 2 | 3 | import sys 4 | 5 | config.triton_obj_root = "@triton_BINARY_DIR@" 6 | config.llvm_src_root = "@LLVM_SOURCE_DIR@" 7 | config.llvm_obj_root = "@LLVM_BINARY_DIR@" 8 | config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" 9 | config.llvm_lib_dir = "@LLVM_LIBS_DIR@" 10 | config.llvm_shlib_dir = "@CMAKE_LIBRARY_OUTPUT_DIRECTORY@" 11 | config.llvm_shlib_ext = "@CMAKE_SHARED_LIBRARY_SUFFIX@" 12 | config.llvm_exe_ext = "@EXEEXT@" 13 | config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" 14 | config.mlir_binary_dir = "@MLIR_BINARY_DIR@" 15 | config.python_executable = "@Python3_EXECUTABLE@" 16 | config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ 17 | 18 | 19 | import 
lit.llvm 20 | lit.llvm.initialize(lit_config, config) 21 | 22 | # Let the main config do the real work 23 | lit_config.load_config(config, "@triton_SOURCE_DIR@/test/lit.cfg.py") 24 | -------------------------------------------------------------------------------- /third_party/amd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonAMD ${CMAKE_CURRENT_SOURCE_DIR}/python/triton_amd.cc LINK_LIBS TritonAMDGPUToLLVM TritonAMDGPUTransforms TritonAMDGPUDialectToLLVM) 7 | target_link_libraries(TritonAMD PRIVATE Python3::Module pybind11::headers) 8 | endif() 9 | if(TRITON_BUILD_UT) 10 | add_subdirectory(unittest) 11 | endif() 12 | add_subdirectory(test) 13 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/concepts.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | namespace hip_impl // Documentation only. 26 | { 27 | #define requires(...) 28 | 29 | #define FunctionalProcedure typename 30 | } // namespace hip_impl 31 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/amd_detail/grid_launch_GGL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | */ 22 | #pragma once 23 | 24 | #if GENERIC_GRID_LAUNCH == 1 25 | #include "macro_based_grid_launch.hpp" 26 | #endif // GENERIC_GRID_LAUNCH -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_hcc.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 
18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_HCC_H 21 | #define HIP_INCLUDE_HIP_HIP_HCC_H 22 | #warning "hip/hip_hcc.h is deprecated, please use hip/hip_ext.h" 23 | #include "hip/hip_ext.h" 24 | #endif // #ifdef HIP_INCLUDE_HIP_HIP_HCC_H 25 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_profile.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in 10 | all copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. 
18 | */ 19 | 20 | #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H 21 | #define HIP_INCLUDE_HIP_HIP_PROFILE_H 22 | 23 | #define HIP_SCOPED_MARKER(markerName, group) 24 | #define HIP_BEGIN_MARKER(markerName, group) 25 | #define HIP_END_MARKER() 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_texture_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | */ 22 | 23 | 24 | #ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 25 | #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H 26 | 27 | #include 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/hip/hip_version.h: -------------------------------------------------------------------------------- 1 | // Auto-generated by cmake 2 | 3 | #ifndef HIP_VERSION_H 4 | #define HIP_VERSION_H 5 | 6 | #define HIP_VERSION_MAJOR 6 7 | #define HIP_VERSION_MINOR 2 8 | #define HIP_VERSION_PATCH 41134 9 | #define HIP_VERSION_GITHASH "65d174c3e" 10 | #define HIP_VERSION_BUILD_ID 0 11 | #define HIP_VERSION_BUILD_NAME "" 12 | #define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH) 13 | 14 | #define __HIP_HAS_GET_PCH 1 15 | 16 | #endif 17 | 18 | -------------------------------------------------------------------------------- /third_party/amd/backend/include/roctracer/roctracer_hcc.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. */ 20 | 21 | #pragma message( \ 22 | "This file has been deprecated and marked for removal. Please use roctracer_hip.h instead.") 23 | 24 | #include "roctracer_hip.h" -------------------------------------------------------------------------------- /third_party/amd/backend/lib/asanrtl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/amd/backend/lib/asanrtl.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ockl.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/amd/backend/lib/ockl.bc -------------------------------------------------------------------------------- /third_party/amd/backend/lib/ocml.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/amd/backend/lib/ocml.bc -------------------------------------------------------------------------------- /third_party/amd/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) 2 | add_subdirectory(TritonAMDGPUToLLVM) 3 | add_subdirectory(TritonAMDGPUTransforms) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
add_subdirectory(TritonAMDGPU) 2 | -------------------------------------------------------------------------------- /third_party/amd/include/Dialect/TritonAMDGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /third_party/amd/include/Dialect/TritonAMDGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS TritonAMDGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=amdgpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=amdgpu) 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) 7 | mlir_tablegen(Ops.h.inc -gen-op-decls) 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 9 | add_mlir_doc(TritonAMDGPUDialect TritonAMDGPUDialect dialects/ -gen-dialect-doc) 10 | add_mlir_doc(TritonAMDGPUOps TritonAMDGPUOps dialects/ -gen-op-doc) 11 | add_public_tablegen_target(TritonAMDGPUTableGen) 12 | 13 | set(LLVM_TARGET_DEFINITIONS TritonAMDGPUAttrDefs.td) 14 | mlir_tablegen(TritonAMDGPUEnums.h.inc -gen-enum-decls) 15 | mlir_tablegen(TritonAMDGPUEnums.cpp.inc -gen-enum-defs) 16 | mlir_tablegen(TritonAMDGPUAttrDefs.h.inc -gen-attrdef-decls) 17 | mlir_tablegen(TritonAMDGPUAttrDefs.cpp.inc -gen-attrdef-defs) 18 | add_public_tablegen_target(TritonAMDGPUAttrDefsIncGen) 19 | -------------------------------------------------------------------------------- /third_party/amd/include/Dialect/TritonAMDGPU/Utility/CommonUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_ 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_ 3 | 4 | #include "mlir/Dialect/SCF/IR/SCF.h" 5 | #include "triton/Dialect/Triton/IR/Dialect.h" 6 | 
7 | namespace mlir::triton::AMD { 8 | SmallVector<scf::ForOp> getLeafForOps(triton::FuncOp funcOp); 9 | } // namespace mlir::triton::AMD 10 | 11 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_ 12 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonAMDGPUToLLVM) 3 | add_public_tablegen_target(TritonAMDGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/PatternTritonAMDGPUToLLVM.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_ 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_ 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | 6 | namespace mlir::triton::AMD { 7 | 8 | void populateExtractSliceOpToLLVMPatterns( 9 | mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns, 10 | mlir::PatternBenefit benefit); 11 | 12 | void populateInThreadTransposeOpToTTGPatterns(mlir::RewritePatternSet &patterns, 13 | mlir::PatternBenefit benefit); 14 | 15 | } // namespace mlir::triton::AMD 16 | 17 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_ 18 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUToLLVM/TargetUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_ 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_ 3 | 4 | #include 
"llvm/ADT/StringRef.h" 5 | 6 | namespace mlir::triton::AMD { 7 | 8 | // A list of ISA families we care about. 9 | enum class ISAFamily { 10 | Unknown, 11 | CDNA1, 12 | CDNA2, 13 | CDNA3, 14 | CDNA4, 15 | RDNA1, 16 | RDNA2, 17 | RDNA3, 18 | }; 19 | 20 | // Deduces the corresponding ISA family for the given target gfx |arch|. 21 | ISAFamily deduceISAFamily(llvm::StringRef arch); 22 | 23 | // Returns true if the given architecture supports the V_DOT instruction. 24 | bool supportsVDot(llvm::StringRef arch); 25 | 26 | // Here is a partial definition of DppCtrl enums. For the complete definition, 27 | // please check: 28 | // https://github.com/llvm/llvm-project/blob/8c75290/llvm/lib/Target/AMDGPU/SIDefines.h#L939 29 | enum class DppCtrl : uint32_t { 30 | QUAD_PERM_FIRST = 0, 31 | ROW_SHL0 = 0x100, 32 | ROW_SHR0 = 0x110, 33 | BCAST15 = 0x142, 34 | BCAST31 = 0x143 35 | }; 36 | 37 | } // namespace mlir::triton::AMD 38 | 39 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_ 40 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonAMDGPU) 3 | add_public_tablegen_target(TritonAMDGPUTransformsIncGen) 4 | -------------------------------------------------------------------------------- /third_party/amd/include/TritonAMDGPUTransforms/TritonGPUConversion.h: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // 3 | // Defines utilities to use while converting to the TritonGPU dialect. 
4 | // 5 | //===----------------------------------------------------------------------===// 6 | 7 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_TRITONGPUCONVERSION_H_ 8 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_TRITONGPUCONVERSION_H_ 9 | 10 | #include "mlir/Transforms/DialectConversion.h" 11 | 12 | namespace mlir { 13 | 14 | class TritonGPUTypeConverter : public TypeConverter { 15 | public: 16 | TritonGPUTypeConverter(MLIRContext *context, int numWarps, int threadsPerWarp, 17 | int numCTAs); 18 | int getNumWarps() const { return numWarps; } 19 | int getThreadsPerWarp() const { return threadsPerWarp; } 20 | int getNumCTAs() const { return numCTAs; } 21 | 22 | private: 23 | MLIRContext *context; 24 | int numWarps; 25 | int threadsPerWarp; 26 | int numCTAs; 27 | }; 28 | 29 | class TritonGPUConversionTarget : public ConversionTarget { 30 | 31 | public: 32 | explicit TritonGPUConversionTarget(MLIRContext &ctx, 33 | TritonGPUTypeConverter &typeConverter); 34 | }; 35 | 36 | } // namespace mlir 37 | 38 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_TRITONGPUCONVERSION_H_ 39 | -------------------------------------------------------------------------------- /third_party/amd/language/hip/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import libdevice 2 | 3 | __all__ = ["libdevice"] 4 | -------------------------------------------------------------------------------- /third_party/amd/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDAnalysis 2 | RangeAnalysis.cpp 3 | 4 | DEPENDS 5 | TritonTableGen 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRAnalysis 9 | MLIRLLVMDialect 10 | TritonIR 11 | TritonGPUIR 12 | ) 13 | -------------------------------------------------------------------------------- /third_party/amd/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(TritonAMDGPUToLLVM) 4 | add_subdirectory(TritonAMDGPUDialectToLLVM) 5 | add_subdirectory(TritonAMDGPUTransforms) 6 | -------------------------------------------------------------------------------- /third_party/amd/lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonAMDGPU) 2 | -------------------------------------------------------------------------------- /third_party/amd/lib/Dialect/TritonAMDGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Utility) 3 | -------------------------------------------------------------------------------- /third_party/amd/lib/Dialect/TritonAMDGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUIR 2 | Dialect.cpp 3 | 4 | DEPENDS 5 | TritonAMDGPUTableGen 6 | TritonAMDGPUAttrDefsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRLLVMDialect 10 | TritonIR 11 | TritonGPUIR 12 | ) 13 | -------------------------------------------------------------------------------- /third_party/amd/lib/Dialect/TritonAMDGPU/Utility/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDUtils 2 | CommonUtils.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | MLIRLLVMDialect 6 | TritonIR 7 | TritonGPUIR 8 | ) 9 | -------------------------------------------------------------------------------- /third_party/amd/lib/Dialect/TritonAMDGPU/Utility/CommonUtils.cpp: -------------------------------------------------------------------------------- 1 | #include "third_party/amd/include/Dialect/TritonAMDGPU/Utility/CommonUtils.h" 2 | 3 | namespace mlir::triton::AMD { 4 | SmallVector<scf::ForOp> getLeafForOps(triton::FuncOp funcOp) { 5 | SmallVector<scf::ForOp> allOps; 6 | funcOp->walk([&](scf::ForOp forOp) { allOps.push_back(forOp); }); 7 | 8 | SmallVector<scf::ForOp> leafOps; 9 | for (scf::ForOp forOp : allOps) { 10 | auto searchResult = forOp.getBody()->walk( 11 | [](scf::ForOp) { return WalkResult::interrupt(); }); 12 | if (!searchResult.wasInterrupted()) 13 | leafOps.push_back(forOp); 14 | } 15 | return leafOps; 16 | } 17 | } // namespace mlir::triton::AMD 18 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUDialectToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUDialectToLLVM 2 | TritonAMDGPUToLLVMPatterns.cpp 3 | ExtractSliceOpToLLVM.cpp 4 | InThreadTransposeOpToTTG.cpp 5 | 6 | DEPENDS 7 | TritonAMDGPUIR 8 | ) 9 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUDialectToLLVM/InThreadTransposeOpToTTG.cpp: -------------------------------------------------------------------------------- 1 | #include "Dialect/TritonAMDGPU/IR/Dialect.h" 2 | #include "triton/Conversion/MLIRTypes.h" 3 | 4 | using namespace mlir; 5 | using namespace mlir::triton; 6 | namespace ttg = mlir::triton::gpu; 7 | 8 | namespace { 9 | 10 | struct InThreadTransposeOpConversion 11 | : public OpConversionPattern<triton::amdgpu::InThreadTransposeOp> { 
12 | public: 13 | explicit InThreadTransposeOpConversion(MLIRContext *ctx, 14 | PatternBenefit benefit) 15 | : OpConversionPattern(ctx, benefit) {} 16 | 17 | LogicalResult 18 | matchAndRewrite(triton::amdgpu::InThreadTransposeOp op, OpAdaptor adaptor, 19 | ConversionPatternRewriter &rewriter) const override { 20 | rewriter.replaceOpWithNewOp(op, op.getType(), 21 | op.getSrc()); 22 | return success(); 23 | } 24 | }; 25 | 26 | } // namespace 27 | 28 | namespace mlir::triton::AMD { 29 | 30 | void populateInThreadTransposeOpToTTGPatterns(RewritePatternSet &patterns, 31 | PatternBenefit benefit) { 32 | patterns.add(patterns.getContext(), benefit); 33 | } 34 | 35 | } // namespace mlir::triton::AMD 36 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUDialectToLLVM/TritonAMDGPUToLLVMPatterns.cpp: -------------------------------------------------------------------------------- 1 | #include "third_party/amd/include/TritonAMDGPUToLLVM/PatternTritonAMDGPUToLLVM.h" 2 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" 3 | 4 | namespace mlir::triton::AMD { 5 | void populateTritonAMDGPUToLLVMPatterns(LLVMTypeConverter &typeConverter, 6 | RewritePatternSet &patterns, 7 | PatternBenefit benefit) { 8 | populateExtractSliceOpToLLVMPatterns(typeConverter, patterns, benefit); 9 | populateInThreadTransposeOpToTTGPatterns(patterns, benefit); 10 | } 11 | } // namespace mlir::triton::AMD 12 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUToLLVM 2 | AtomicRMWOpsEmitter.cpp 3 | BufferOpsEmitter.cpp 4 | ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp 5 | ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp 6 | ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp 7 | ConvertLayoutOpToLLVM.cpp 8 | 
MemoryOpToLLVM.cpp 9 | DotOpToLLVM/FMA.cpp 10 | DotOpToLLVM/MFMA.cpp 11 | DotOpToLLVM/WMMA.cpp 12 | DotOpToLLVM.cpp 13 | ElementwiseOpToLLVM.cpp 14 | LoadStoreOpToLLVM.cpp 15 | GCNAsmFormat.cpp 16 | TritonGPUToLLVM.cpp 17 | BuiltinFuncToLLVM.cpp 18 | Utility.cpp 19 | TargetInfo.cpp 20 | TargetUtils.cpp 21 | OptimizeLDSUsage.cpp 22 | OptimizeLDSUtility.cpp 23 | SPMDOpToLLVM.cpp 24 | SchedInstructions.cpp 25 | UpcastMXFPToLLVM.cpp 26 | 27 | DEPENDS 28 | TritonAMDGPUConversionPassIncGen 29 | 30 | LINK_LIBS PUBLIC 31 | TritonGPUToLLVM 32 | TritonAMDGPUIR 33 | TritonProtonToLLVM 34 | ) 35 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_ 2 | #define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_ 3 | 4 | #include "mlir/IR/Types.h" 5 | #include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h" 6 | #include "triton/Dialect/Triton/IR/Dialect.h" 7 | #include "triton/Dialect/TritonGPU/IR/Dialect.h" 8 | 9 | // The following functions are used to collect and set side-channel information 10 | // during conversion/lowering to LLVM to facilitate instruction scheduling 11 | // controls. 
12 | namespace mlir::triton { 13 | template <typename DotOpType> 14 | void setNumGeneratedMMAs(DotOpType op, size_t mmaCount, unsigned m, unsigned n, 15 | unsigned k, Type elementType); 16 | 17 | template <typename LoadOpType> 18 | void setNumGeneratedGlobalLoads(LoadOpType op, size_t globalLoadsCount, 19 | Type type); 20 | void setNumGeneratedDsReads(gpu::LocalLoadOp op, size_t numDsReadsCount, 21 | Type type); 22 | void storeOpSchedAnnotations(triton::gpu::LocalStoreOp op, size_t llvmOpCount, 23 | Type type); 24 | triton::DotOp getSingleDotOpIfExists(scf::ForOp forOp); 25 | } // namespace mlir::triton 26 | 27 | #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_ 28 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonAMDGPUTransforms 2 | AccelerateAMDMatmul.cpp 3 | BlockPingpong.cpp 4 | CanonicalizePointers.cpp 5 | CoalesceAsyncCopy.cpp 6 | ConvertToBufferOps.cpp 7 | OptimizeEpilogue.cpp 8 | HoistLayoutConversions.cpp 9 | ReorderInstructions.cpp 10 | StreamPipeline.cpp 11 | MfmaGroup.cpp 12 | InThreadTranspose.cpp 13 | FoldTrueCmpIOp.cpp 14 | 15 | DEPENDS 16 | TritonAMDGPUIR 17 | TritonAMDGPUTransformsIncGen 18 | TritonGPUIR 19 | TritonAMDUtils 20 | TritonAMDAnalysis 21 | ) 22 | 23 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include) 24 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include) 25 | -------------------------------------------------------------------------------- /third_party/amd/lib/TritonAMDGPUTransforms/FoldTrueCmpIOp.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 2 | #include "mlir/Pass/Pass.h" 3 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 4 | #include 
"third_party/amd/include/Analysis/RangeAnalysis.h" 5 | #include "triton/Analysis/Utility.h" 6 | 7 | #define GEN_PASS_CLASSES 8 | #include "TritonAMDGPUTransforms/Passes.h" 9 | 10 | using namespace mlir; 11 | using namespace mlir::triton; 12 | 13 | namespace { 14 | 15 | struct TritonAMDFoldTrueCmpIOpPass 16 | : TritonAMDFoldTrueCmpIBase { 17 | 18 | void runOnOperation() override { 19 | DenseMap> assumptions = 20 | AMD::TritonIntegerRangeAnalysis::collectAssumptions(getOperation()); 21 | std::unique_ptr solver = createDataFlowSolver(); 22 | solver->load(assumptions); 23 | if (failed(solver->initializeAndRun(getOperation()))) 24 | return signalPassFailure(); 25 | 26 | ModuleOp mod = getOperation(); 27 | RewritePatternSet patterns(&getContext()); 28 | AMD::populateFoldTrueCmpIOpPatterns(patterns, solver.get()); 29 | (void)applyPatternsGreedily(mod, std::move(patterns)); 30 | } 31 | }; 32 | 33 | } // namespace 34 | 35 | std::unique_ptr mlir::createTritonAMDGPUFoldTrueCmpIPass() { 36 | return std::make_unique(); 37 | } 38 | -------------------------------------------------------------------------------- /third_party/amd/python/test/address_sanitizer_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | 5 | size = 4096 6 | x = torch.rand(size, device='cuda') 7 | y = torch.rand(size, device='cuda') 8 | output = torch.empty_like(x) 9 | n_elements = output.numel() 10 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) 11 | 12 | 13 | @triton.jit 14 | def add_kernel( 15 | x_ptr, 16 | y_ptr, 17 | output_ptr, 18 | n_elements, 19 | BLOCK_SIZE: tl.constexpr, 20 | ): 21 | pid = tl.program_id(axis=0) 22 | block_start = pid * BLOCK_SIZE 23 | #Set access to go out of bounds for ASAN test 24 | offsets = block_start + tl.arange(0, BLOCK_SIZE) + 1 25 | x = tl.load(x_ptr + offsets) 26 | y = tl.load(y_ptr + offsets) 27 | output = x + y 28 | tl.store(output_ptr + 
offsets, output) 29 | 30 | 31 | pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) 32 | amdgcn = pgm.asm['amdgcn'] 33 | print(amdgcn) 34 | -------------------------------------------------------------------------------- /third_party/amd/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(lib) 2 | -------------------------------------------------------------------------------- /third_party/amd/test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TritonAMDGPUTestAnalysis 2 | TestAMDRangeAnalysis.cpp 3 | 4 | DEPENDS 5 | TritonTableGen 6 | TritonGPUTableGen 7 | TritonGPUAttrDefsIncGen 8 | TritonGPUTypeInterfacesIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRPass 12 | ${triton_libs} 13 | ) 14 | -------------------------------------------------------------------------------- /third_party/amd/test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | -------------------------------------------------------------------------------- /third_party/amd/unittest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | -------------------------------------------------------------------------------- /third_party/amd/unittest/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_ut(NAME TestOptimizeLDS 2 | SRCS OptimizeLDSTest.cpp 3 | LIBS 4 | TritonAnalysis 5 | TritonIR 6 | TritonGPUIR) 7 | -------------------------------------------------------------------------------- /third_party/cpu/backend/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/cpu/backend/__init__.py -------------------------------------------------------------------------------- /third_party/cpu/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ScalarizePass) 2 | add_subdirectory(TritonCPUToLLVM) 3 | add_subdirectory(TritonCPUTransforms) 4 | add_subdirectory(TritonToTritonCPU) 5 | -------------------------------------------------------------------------------- /third_party/cpu/include/ScalarizePass/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS ScalarizeInterface.td) 2 | mlir_tablegen(ScalarizeInterface.h.inc -gen-op-interface-decls) 3 | mlir_tablegen(ScalarizeInterface.cpp.inc -gen-op-interface-defs) 4 | add_public_tablegen_target(ScalarizeInterfaceIncGen) 5 | -------------------------------------------------------------------------------- /third_party/cpu/include/ScalarizePass/ScalarizeInterface.h: -------------------------------------------------------------------------------- 1 | #ifndef MLIR_INTERFACES_SCALARIZE_INTERFACE_H_ 2 | #define MLIR_INTERFACES_SCALARIZE_INTERFACE_H_ 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/IR/Builders.h" 6 | #include "mlir/IR/BuiltinTypes.h" 7 | #include "mlir/IR/Operation.h" 8 | #include "mlir/Support/LLVM.h" 9 | 10 | #include "mlir/IR/OpDefinition.h" 11 | 12 | /// Include the ODS generated interface header files. 
13 | #include "cpu/include/ScalarizePass/ScalarizeInterface.h.inc" 14 | 15 | namespace mlir { 16 | namespace triton { 17 | namespace cpu { 18 | 19 | mlir::Value computeScalarValue(mlir::Operation *scalarizationOp, 20 | mlir::Value vals, 21 | mlir::ArrayRef indices, 22 | mlir::PatternRewriter &rewriter); 23 | 24 | mlir::Value computeScalarValue(mlir::Operation *scalarizationOp, 25 | mlir::Value vals, mlir::ValueRange indices, 26 | mlir::PatternRewriter &rewriter); 27 | 28 | bool canComputeScalarValue(mlir::Value vals); 29 | } // namespace cpu 30 | } // namespace triton 31 | } // namespace mlir 32 | 33 | #endif // MLIR_INTERFACES_SCALARIZE_INTERFACE_H_ 34 | -------------------------------------------------------------------------------- /third_party/cpu/include/ScalarizePass/ScalarizeInterfaceImpl.h: -------------------------------------------------------------------------------- 1 | #ifndef MLIR_DIALECT_TRITON_SCALARIZEINTERFACEIMPL_H 2 | #define MLIR_DIALECT_TRITON_SCALARIZEINTERFACEIMPL_H 3 | 4 | namespace mlir { 5 | class DialectRegistry; 6 | 7 | namespace triton { 8 | namespace cpu { 9 | 10 | void registerTritonOpScalarizeExternalModels(DialectRegistry &registry); 11 | 12 | } // namespace cpu 13 | } // namespace triton 14 | } // namespace mlir 15 | 16 | #endif // MLIR_DIALECT_TRITON_SCALARIZEINTERFACEIMPL_H 17 | -------------------------------------------------------------------------------- /third_party/cpu/include/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUToLLVM) 3 | add_public_tablegen_target(TritonCPUToLLVMConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/cpu/include/TritonCPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS 
Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUTransforms) 3 | add_public_tablegen_target(TritonCPUTransformsPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/cpu/include/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonCPU) 3 | add_public_tablegen_target(TritonToTritonCPUPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/cpu/language/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import vnni_decode 2 | 3 | __all__ = ["vnni_decode"] 4 | -------------------------------------------------------------------------------- /third_party/cpu/language/cpu/utils.py: -------------------------------------------------------------------------------- 1 | from triton import jit 2 | import triton.language as tl 3 | from triton.language.core import builtin 4 | 5 | 6 | @jit 7 | def _vnni_decode(arg0): 8 | tl.static_assert(len(arg0.shape) == 2) 9 | tmp = arg0.reshape((arg0.shape[0], arg0.shape[1] // 2, 2)) 10 | tmp1, tmp2 = tl.split(tmp) 11 | return tl.join(tmp1.T, tmp2.T).reshape((arg0.shape[1] // 2, arg0.shape[0] * 2)).T 12 | 13 | 14 | @builtin 15 | def vnni_decode(arg0, _builder=None, _generator=None): 16 | bitwidth = arg0.dtype.primitive_bitwidth 17 | if bitwidth > 16: 18 | raise ValueError("Expected 8-bit or 16-bit values for vnni_decode") 19 | decoded = _generator.call_JitFunction(_vnni_decode, (arg0, ), kwargs={}) 20 | if bitwidth == 8: 21 | decoded = _generator.call_JitFunction(_vnni_decode, (decoded, ), kwargs={}) 22 | return decoded 23 | -------------------------------------------------------------------------------- /third_party/cpu/lib/Analysis/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUAnalysis 2 | TensorPtrShapeInfo.cpp 3 | 4 | DEPENDS 5 | TritonCPUTableGen 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRAnalysis 9 | TritonIR 10 | TritonCPUIR 11 | ) 12 | -------------------------------------------------------------------------------- /third_party/cpu/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(TritonCPUToLLVM) 3 | add_subdirectory(TritonCPUTransforms) 4 | add_subdirectory(TritonToTritonCPU) 5 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonCPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUToLLVM 2 | AtomicOpsToLLVM.cpp 3 | DebugOpsToLLVM.cpp 4 | UkernelOpsToOneDNNLLVM.cpp 5 | UkernelOpsToXSMMLLVM.cpp 6 | FuncOpToLLVM.cpp 7 | GetProgramIdOpToLLVM.cpp 8 | LowerMultiReduction.cpp 9 | MathToVecLib.cpp 10 | MemoryOpToLLVM.cpp 11 | TypeConverter.cpp 12 | Utility.cpp 13 | 14 | DEPENDS 15 | TritonCPUToLLVMConversionPassIncGen 16 | 17 | LINK_LIBS PUBLIC 18 | MLIRVectorToLLVMPass 19 | ) 20 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonCPUToLLVM/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_TYPECONVERTER_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_TYPECONVERTER_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | #include "triton/Conversion/MLIRTypes.h" 7 | #include "triton/Dialect/Triton/IR/Types.h" 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton; 11 | 12 | class TritonCPUToLLVMTypeConverter : public LLVMTypeConverter { 13 | public: 14 | using 
TypeConverter::convertType; 15 | 16 | TritonCPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, 17 | const DataLayoutAnalysis *analysis = nullptr); 18 | 19 | Type convertTritonPointerType(triton::PointerType type); 20 | Type convertTritonTensorType(RankedTensorType type); 21 | }; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonCPUToLLVM/Utility.cpp: -------------------------------------------------------------------------------- 1 | #include "Utility.h" 2 | 3 | using namespace mlir; 4 | using namespace mlir::triton; 5 | 6 | namespace mlir::triton::cpu { 7 | 8 | Value getProgramId(mlir::FunctionOpInterface funcOp, int axis) { 9 | auto args = funcOp.getArguments(); 10 | assert(funcOp && args.size() >= 6); 11 | assert(axis >= 0 && axis < 3); 12 | 13 | // The first three of the last six args are x, y, z program ids. 14 | auto argIdx = args.size() - 6 + axis; 15 | assert(argIdx < args.size() && "out-of-bounds arg index"); 16 | assert(args[argIdx].getType().isInteger(32) && "unexpected arg type"); 17 | return args[argIdx]; 18 | } 19 | 20 | Value getNumPrograms(mlir::FunctionOpInterface funcOp, int axis) { 21 | auto args = funcOp.getArguments(); 22 | assert(funcOp && args.size() >= 6); 23 | assert(axis >= 0 && axis < 3); 24 | 25 | // The last three of the args are gridX, gridY, gridZ (bounds) of grid. 
26 | auto argIdx = args.size() - 3 + axis; 27 | assert(argIdx < args.size() && "out-of-bounds arg index"); 28 | assert(args[argIdx].getType().isInteger(32) && "unexpected arg type"); 29 | return args[argIdx]; 30 | } 31 | 32 | } // namespace mlir::triton::cpu 33 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonCPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 2 | #define TRITON_CONVERSION_TRITONCPU_TO_LLVM_UTILITY_H 3 | 4 | #include "mlir/Analysis/DataFlowFramework.h" 5 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 6 | 7 | namespace mlir::triton::cpu { 8 | 9 | Value getProgramId(mlir::FunctionOpInterface funcOp, int axis); 10 | Value getNumPrograms(mlir::FunctionOpInterface funcOp, int axis); 11 | 12 | } // namespace mlir::triton::cpu 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonCPUTransforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonCPUTransforms 2 | ConvertDotOp/ConvertDotCommon.cpp 3 | ConvertDotOp/ConvertDotGeneric.cpp 4 | ConvertDotOp/ConvertDotToAMX.cpp 5 | ConvertDotOp/ConvertDotToFMA.cpp 6 | ConvertDotOp/ConvertDotOpToUkernelOps.cpp 7 | Canonicalize.cpp 8 | ConvertDotProduct.cpp 9 | ConvertUnsupportedOps.cpp 10 | DecomposeFpConversions.cpp 11 | OptimizeMasks.cpp 12 | 13 | DEPENDS 14 | TritonCPUTransformsPassIncGen 15 | ) 16 | 17 | if (dnnl_FOUND) 18 | target_link_libraries(TritonCPUTransforms PRIVATE DNNL::dnnl) 19 | endif() 20 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonToTritonCPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonToTritonCPU 2 | ConvertAtomicOps.cpp 3 | 
ConvertControlFlowOps.cpp 4 | ConvertDebugOps.cpp 5 | ConvertDotOp.cpp 6 | ConvertElementwiseOps.cpp 7 | ConvertElemManipOps.cpp 8 | ConvertHistogramOp.cpp 9 | ScalarizeInterface.cpp 10 | ScalarizeUsingForOps.cpp 11 | ConvertMemoryOps.cpp 12 | ConvertPtrOps.cpp 13 | ConvertReductionOp.cpp 14 | ConvertScanOp.cpp 15 | TypeConverter.cpp 16 | 17 | DEPENDS 18 | TritonToTritonCPUPassIncGen 19 | ScalarizeInterfaceIncGen 20 | MLIRDialectUtils 21 | 22 | LINK_LIBS PUBLIC 23 | TritonCPUIR 24 | MLIRVectorDialect 25 | ) 26 | -------------------------------------------------------------------------------- /third_party/cpu/lib/TritonToTritonCPU/TypeConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITON_TO_TRITONCPU_TYPECONVERTER_H 2 | #define TRITON_CONVERSION_TRITON_TO_TRITONCPU_TYPECONVERTER_H 3 | 4 | #include "triton/Conversion/MLIRTypes.h" 5 | #include "triton/Dialect/Triton/IR/Types.h" 6 | 7 | using namespace mlir; 8 | using namespace mlir::triton; 9 | 10 | class TritonToTritonCPUTypeConverter : public TypeConverter { 11 | public: 12 | using TypeConverter::convertType; 13 | 14 | TritonToTritonCPUTypeConverter(); 15 | 16 | Type convertTritonPointerType(triton::PointerType type); 17 | }; 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /third_party/f2reduce/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(f2reduce 2 | f2reduce.cpp 3 | ) 4 | -------------------------------------------------------------------------------- /third_party/f2reduce/LICENCE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2023 Adam P. 
Goucher, Hatsya Limited 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /third_party/f2reduce/VERSION: -------------------------------------------------------------------------------- 1 | Cloned from https://gitlab.com/hatsya/open-source/f2reduce at revision 2 | 949b91d022c001bbce19157f806013d37f05fbf5. 3 | -------------------------------------------------------------------------------- /third_party/f2reduce/f2reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | // OpenAI change: Switched from `extern "C"` to `namespace f2reduce`. 5 | namespace f2reduce { 6 | 7 | /** 8 | * Converts a matrix over F_2 into row-reduced echelon form. 9 | * 10 | * The matrix should be in row-major format. The stride parameter specifies 11 | * the offset (in 64-bit words, *not* bytes!) 
between successive rows of the 12 | * matrix, and should obey the inequality: 13 | * 14 | * 64 |stride| >= cols 15 | * 16 | * i.e. that the rows occupy disjoint regions of memory. For best performance 17 | * the stride should be divisible by 16 words (128 bytes). 18 | * 19 | * We adopt 'little-endian' semantics: the element in row i and column j+64*k 20 | * of the matrix (zero-indexed) is given by (matrix[i * stride + k] >> j) & 1. 21 | * 22 | * The matrix is overwritten in place with its row-reduced echelon form. 23 | */ 24 | void inplace_rref_strided(uint64_t *matrix, uint64_t rows, uint64_t cols, uint64_t stride); 25 | 26 | uint64_t get_recommended_stride(uint64_t cols); 27 | 28 | } // namespace f2reduce 29 | -------------------------------------------------------------------------------- /third_party/nvidia/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonNVIDIA ${CMAKE_CURRENT_SOURCE_DIR}/triton_nvidia.cc LINK_LIBS TritonNVIDIAGPUToLLVM NVGPUToLLVM) 7 | target_link_libraries(TritonNVIDIA PRIVATE Python3::Module pybind11::headers) 8 | endif() 9 | if(TRITON_BUILD_UT) 10 | add_subdirectory(test) 11 | add_subdirectory(unittest) 12 | endif() 13 | -------------------------------------------------------------------------------- /third_party/nvidia/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/nvidia/backend/__init__.py -------------------------------------------------------------------------------- /third_party/nvidia/backend/lib/libdevice.10.bc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-lang/triton-cpu/e60f448f8f197073b75d6d3e77347414a5db3ee7/third_party/nvidia/backend/lib/libdevice.10.bc -------------------------------------------------------------------------------- /third_party/nvidia/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) 2 | add_subdirectory(TritonNVIDIAGPUToLLVM) 3 | add_subdirectory(NVGPUToLLVM) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(NVGPU) 2 | add_subdirectory(NVWS) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS NVGPUOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu) 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) 7 | mlir_tablegen(Ops.h.inc -gen-op-decls) 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 9 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 10 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 11 | add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc) 12 | add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc) 13 | add_public_tablegen_target(NVGPUTableGen) 14 | 15 | set(LLVM_TARGET_DEFINITIONS 
NVGPUAttrDefs.td) 16 | mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls) 17 | mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs) 18 | add_public_tablegen_target(NVGPUAttrDefsIncGen) 19 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/NVWS/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/NVWS/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS NVWSOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvws) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvws) 6 | mlir_tablegen(Ops.h.inc -gen-op-decls) 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=nvws) 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=nvws) 10 | add_mlir_doc(NVWSDialect NVWSDialect dialects/ -gen-dialect-doc) 11 | add_mlir_doc(NVWSOps NVWSOps dialects/ -gen-op-doc) 12 | add_public_tablegen_target(NVWSTableGen) 13 | 14 | set(LLVM_TARGET_DEFINITIONS NVWSAttrDefs.td) 15 | mlir_tablegen(NVWSAttrDefs.h.inc -gen-attrdef-decls) 16 | mlir_tablegen(NVWSAttrDefs.cpp.inc -gen-attrdef-defs) 17 | add_public_tablegen_target(NVWSAttrDefsIncGen) 18 | -------------------------------------------------------------------------------- /third_party/nvidia/include/Dialect/NVWS/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name NVWSTransforms) 3 | add_public_tablegen_target(NVWSTransformsIncGen) 4 | 
-------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name NVGPUToLLVM) 3 | add_public_tablegen_target(NVGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 2 | #define TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "mlir/IR/Operation.h" 10 | #include "mlir/IR/PatternMatch.h" 11 | #include "mlir/IR/Value.h" 12 | #include "mlir/Support/LogicalResult.h" 13 | 14 | namespace mlir { 15 | 16 | class ModuleOp; 17 | template class OperationPass; 18 | 19 | namespace triton { 20 | 21 | namespace nvgpu { 22 | 23 | using Constraints = std::vector; 24 | using OperandsAndConstraints = std::vector>; 25 | 26 | LogicalResult 27 | rewriteAsPtxAsm(mlir::Operation *op, mlir::PatternRewriter &rewriter, 28 | std::string ptxAsm, 29 | const OperandsAndConstraints &operandsAndConstraints = {}, 30 | const Constraints &outputConstraints = {}); 31 | 32 | } // namespace nvgpu 33 | 34 | std::unique_ptr> createConvertNVGPUToLLVMPass(); 35 | 36 | } // namespace triton 37 | 38 | } // namespace mlir 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES_H 2 | #define NVGPU_CONVERSION_PASSES_H 3 | 4 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include 
"nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h" 7 | 8 | namespace mlir { 9 | namespace triton { 10 | 11 | #define GEN_PASS_REGISTRATION 12 | #include "nvidia/include/NVGPUToLLVM/Passes.h.inc" 13 | 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /third_party/nvidia/include/NVGPUToLLVM/Passes.td: -------------------------------------------------------------------------------- 1 | #ifndef NVGPU_CONVERSION_PASSES 2 | #define NVGPU_CONVERSION_PASSES 3 | 4 | include "mlir/Pass/PassBase.td" 5 | 6 | def ConvertNVGPUToLLVM : Pass<"convert-nv-gpu-to-llvm", "mlir::ModuleOp"> { 7 | let summary = "Convert NVGPU to LLVM"; 8 | let description = [{ 9 | 10 | }]; 11 | let constructor = "mlir::triton::createConvertNVGPUToLLVMPass()"; 12 | 13 | let dependentDialects = ["mlir::arith::ArithDialect", 14 | "mlir::LLVM::LLVMDialect", 15 | "mlir::NVVM::NVVMDialect", 16 | "mlir::triton::nvgpu::NVGPUDialect"]; 17 | } 18 | 19 | #endif // NVGPU_CONVERSION_PASSES 20 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonNVIDIAGPUToLLVM) 3 | add_public_tablegen_target(TritonNVIDIAGPUConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | #include "mlir/Pass/Pass.h" 6 | #include "mlir/Transforms/DialectConversion.h" 7 | 8 | #include 9 | 10 | 
namespace mlir { 11 | 12 | class ModuleOp; 13 | template class OperationPass; 14 | 15 | namespace triton { 16 | 17 | #define GEN_PASS_DECL 18 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 19 | 20 | std::unique_ptr> createConvertTritonGPUToLLVMPass(); 21 | std::unique_ptr> 22 | createConvertTritonGPUToLLVMPass(int32_t computeCapability); 23 | std::unique_ptr> 24 | createConvertTritonGPUToLLVMPass(int32_t computeCapability, int32_t ptxVersion); 25 | 26 | #define GEN_PASS_REGISTRATION 27 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" 28 | 29 | } // namespace triton 30 | 31 | } // namespace mlir 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Utility.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H 3 | 4 | #include "mlir/IR/Operation.h" 5 | 6 | namespace mlir { 7 | namespace triton { 8 | namespace NVIDIA { 9 | 10 | /// Return true if we can skip a barrier synchronization between two operations 11 | /// even if they access the same shared memory. 12 | bool canSkipBarSync(Operation *before, Operation *after); 13 | } // namespace NVIDIA 14 | } // namespace triton 15 | } // namespace mlir 16 | 17 | #endif // TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H 18 | -------------------------------------------------------------------------------- /third_party/nvidia/language/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import libdevice 2 | 3 | from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80) 4 | 5 | from ._experimental_tma import * # noqa: F403 6 | from ._experimental_tma import __all__ as _tma_all 7 | 8 | __all__ = [ 9 | "libdevice", "globaltimer", "num_threads", "num_warps", "smid", "convert_custom_float8_sm70", 10 | "convert_custom_float8_sm80", *_tma_all 11 | ] 12 | 13 | del _tma_all 14 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) 2 | add_subdirectory(TritonNVIDIAGPUToLLVM) 3 | add_subdirectory(NVGPUToLLVM) 4 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(NVGPU) 2 | add_subdirectory(NVWS) 3 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/NVGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/NVGPU/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUIR 2 | Dialect.cpp 3 | 4 | DEPENDS 5 | NVGPUTableGen 6 | NVGPUAttrDefsIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRLLVMDialect 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/NVWS/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | 
-------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/NVWS/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVWSIR 2 | Dialect.cpp 3 | Ops.cpp 4 | 5 | DEPENDS 6 | NVWSTableGen 7 | NVWSAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | TritonIR 11 | TritonGPUIR 12 | ) 13 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/Dialect/NVWS/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVWSTransforms 2 | LowerWarpGroup.cpp 3 | 4 | DEPENDS 5 | NVWSTransformsIncGen 6 | 7 | LINK_LIBS PUBLIC 8 | TritonIR 9 | TritonGPUIR 10 | TritonNvidiaGPUIR 11 | NVWSIR 12 | MLIRTransformUtils 13 | ) 14 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/NVGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(NVGPUToLLVM 2 | NVGPUToLLVMPass.cpp 3 | 4 | DEPENDS 5 | NVGPUConversionPassIncGen 6 | NVGPUIR 7 | ) 8 | -------------------------------------------------------------------------------- /third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonNVIDIAGPUToLLVM 2 | ConvertLayoutOpToLLVM.cpp 3 | ConvertWarpSpecializeToLLVM.cpp 4 | MemoryOpToLLVM.cpp 5 | DotOpToLLVM/MMAv2.cpp 6 | DotOpToLLVM/MMAv5.cpp 7 | DotOpToLLVM/WGMMA.cpp 8 | DotOpToLLVM.cpp 9 | ElementwiseOpToLLVM.cpp 10 | LoadStoreOpToLLVM.cpp 11 | BarrierOpToLLVM.cpp 12 | TritonGPUToLLVM.cpp 13 | TMAToLLVM.cpp 14 | SPMDOpToLLVM.cpp 15 | TensorMemoryToLLVM.cpp 16 | TensorPtrOpsToLLVM.cpp 17 | ClusterOpsToLLVM.cpp 18 | PTXAsmFormat.cpp 19 | Utility.cpp 20 | Fp4ToFpOpToLLVM.cpp 21 | TargetInfo.cpp 22 | 23 | DEPENDS 24 | 
TritonNVIDIAGPUConversionPassIncGen 25 | NVGPUAttrDefsIncGen 26 | 27 | LINK_LIBS PUBLIC 28 | TritonGPUToLLVM 29 | TritonProtonToLLVM 30 | ) 31 | -------------------------------------------------------------------------------- /third_party/nvidia/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | llvm_canonicalize_cmake_booleans( 2 | MLIR_ENABLE_BINDINGS_PYTHON 3 | ) 4 | 5 | configure_lit_site_cfg( 6 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in 7 | ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py 8 | MAIN_CONFIG 9 | ${CMAKE_CURRENT_SOURCE_DIR}/test/lit.cfg.py 10 | ) 11 | 12 | set(TRITON_TEST_DEPENDS 13 | triton-opt 14 | triton-tensor-layout 15 | triton-llvm-opt 16 | ) 17 | 18 | set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck") 19 | set(LIT_ARGS "-Dfilecheck=${FILECHECK_PATH}") 20 | 21 | add_lit_testsuite(check-triton-nvidia-lit-tests "Running the triton nvidia regression tests" 22 | ${CMAKE_CURRENT_BINARY_DIR} 23 | ARGS ${LIT_ARGS} 24 | DEPENDS ${TRITON_TEST_DEPENDS} 25 | ) 26 | 27 | set_target_properties(check-triton-nvidia-lit-tests PROPERTIES FOLDER "Tests") 28 | 29 | add_lit_testsuites(TRITON-NVIDIA-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS}) 30 | -------------------------------------------------------------------------------- /third_party/nvidia/test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | @LIT_SITE_CFG_IN_HEADER@ 2 | 3 | import sys 4 | 5 | config.triton_obj_root = "@triton_BINARY_DIR@" 6 | config.llvm_src_root = "@LLVM_SOURCE_DIR@" 7 | config.llvm_obj_root = "@LLVM_BINARY_DIR@" 8 | config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" 9 | config.llvm_lib_dir = "@LLVM_LIBS_DIR@" 10 | config.llvm_shlib_dir = "@CMAKE_LIBRARY_OUTPUT_DIRECTORY@" 11 | config.llvm_shlib_ext = "@CMAKE_SHARED_LIBRARY_SUFFIX@" 12 | config.llvm_exe_ext = "@EXEEXT@" 13 | config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" 14 | config.mlir_binary_dir = 
"@MLIR_BINARY_DIR@" 15 | config.python_executable = "@Python3_EXECUTABLE@" 16 | config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ 17 | 18 | 19 | import lit.llvm 20 | lit.llvm.initialize(lit_config, config) 21 | 22 | # Let the main config do the real work 23 | lit_config.load_config(config, "@triton_SOURCE_DIR@/third_party/nvidia/test/lit.cfg.py") 24 | -------------------------------------------------------------------------------- /third_party/nvidia/tools/cuda/compile.h: -------------------------------------------------------------------------------- 1 | #ifndef TT_KERNEL_INCLUDES 2 | #define TT_KERNEL_INCLUDES 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #endif 10 | 11 | void unload_{kernel_name}(void); 12 | void load_{kernel_name}(void); 13 | // tt-linker: {kernel_name}:{full_signature}:{algo_info} 14 | CUresult{_placeholder} {kernel_name}(CUstream stream, {signature}); 15 | -------------------------------------------------------------------------------- /third_party/nvidia/unittest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | -------------------------------------------------------------------------------- /third_party/nvidia/unittest/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonGPUToLLVM) 2 | -------------------------------------------------------------------------------- /third_party/nvidia/unittest/Conversion/TritonGPUToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_ut( 2 | NAME TestPtxAsmFormat 3 | SRCS PTXAsmFormatTest.cpp 4 | LIBS TritonGPUToLLVM TritonNVIDIAGPUToLLVM 5 | ) 6 | -------------------------------------------------------------------------------- /third_party/proton/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 
| proton.egg-info 3 | proton/_C/libproton.so 4 | 5 | *.hatchet 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(Proton 2 | Proton.cpp 3 | ) 4 | 5 | add_subdirectory(lib) 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Python.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_PYTHON_H_ 2 | #define PROTON_CONTEXT_PYTHON_H_ 3 | 4 | #include "Context.h" 5 | 6 | namespace proton { 7 | 8 | /// Unwind the Python stack and early return a list of contexts. 9 | class PythonContextSource : public ContextSource { 10 | public: 11 | PythonContextSource() = default; 12 | 13 | size_t getDepth() override; 14 | 15 | private: 16 | std::vector getContextsImpl() override; 17 | }; 18 | 19 | } // namespace proton 20 | 21 | #endif // PROTON_CONTEXT_PYTHON_H_ 22 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Context/Shadow.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_CONTEXT_SHADOW_H_ 2 | #define PROTON_CONTEXT_SHADOW_H_ 3 | 4 | #include "Context.h" 5 | #include 6 | 7 | namespace proton { 8 | 9 | /// ShadowContextSource is designed to: 10 | /// 11 | /// - Maintain a main context stack for the main thread. 12 | /// - Provide thread-local context stacks for individual threads. 13 | /// - Allow threads to inherit and shadow the main context stack with their 14 | /// own user-defined scopes. 15 | /// 16 | /// This implementation is suited for use cases like PyTorch, where: 17 | /// 18 | /// - The main thread initializes the main context stack during session setup. 19 | /// - The backward phase spawns multiple CPU threads. 
20 | class ShadowContextSource : public ContextSource, public ScopeInterface { 21 | public: 22 | ShadowContextSource() = default; 23 | 24 | void enterScope(const Scope &scope) override; 25 | 26 | void exitScope(const Scope &scope) override; 27 | 28 | size_t getDepth() override; 29 | 30 | private: 31 | std::vector getContextsImpl() override; 32 | 33 | void initializeThreadContext(); 34 | 35 | std::vector *mainContextStack{}; 36 | static thread_local std::map 37 | threadContextInitialized; 38 | static thread_local std::map> 39 | threadContextStack; 40 | }; 41 | 42 | } // namespace proton 43 | 44 | #endif // PROTON_CONTEXT_SHADOW_H_ 45 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Data/TraceData.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DATA_TRACE_DATA_H_ 2 | #define PROTON_DATA_TRACE_DATA_H_ 3 | 4 | #include "Data.h" 5 | 6 | namespace proton { 7 | 8 | class TraceData : public Data { 9 | public: 10 | using Data::Data; 11 | virtual ~TraceData() = default; 12 | 13 | size_t addOp(size_t scopeId, const std::string &name) override; 14 | 15 | void addMetric(size_t scopeId, std::shared_ptr metric) override; 16 | 17 | void 18 | addMetrics(size_t scopeId, 19 | const std::map &metrics) override; 20 | 21 | void clear() override; 22 | 23 | protected: 24 | // ScopeInterface 25 | void enterScope(const Scope &scope) override final; 26 | 27 | void exitScope(const Scope &scope) override final; 28 | 29 | private: 30 | void doDump(std::ostream &os, OutputFormat outputFormat) const override; 31 | }; 32 | 33 | } // namespace proton 34 | 35 | #endif // PROTON_DATA_TRACE_DATA_H_ 36 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/Device.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_DEVICE_H_ 2 | #define
PROTON_DRIVER_DEVICE_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace proton { 8 | 9 | enum class DeviceType { HIP, CUDA, COUNT }; 10 | 11 | template struct DeviceTraits; 12 | 13 | template <> struct DeviceTraits { 14 | constexpr static DeviceType type = DeviceType::CUDA; 15 | constexpr static const char *name = "CUDA"; 16 | }; 17 | 18 | template <> struct DeviceTraits { 19 | constexpr static DeviceType type = DeviceType::HIP; 20 | constexpr static const char *name = "HIP"; 21 | }; 22 | 23 | struct Device { 24 | DeviceType type; 25 | uint64_t id; 26 | uint64_t clockRate; // khz 27 | uint64_t memoryClockRate; // khz 28 | uint64_t busWidth; 29 | uint64_t numSms; 30 | std::string arch; 31 | 32 | Device() = default; 33 | 34 | Device(DeviceType type, uint64_t id, uint64_t clockRate, 35 | uint64_t memoryClockRate, uint64_t busWidth, uint64_t numSms, 36 | std::string arch) 37 | : type(type), id(id), clockRate(clockRate), 38 | memoryClockRate(memoryClockRate), busWidth(busWidth), numSms(numSms), 39 | arch(arch) {} 40 | }; 41 | 42 | Device getDevice(DeviceType type, uint64_t index); 43 | 44 | const std::string getDeviceTypeString(DeviceType type); 45 | 46 | }; // namespace proton 47 | 48 | #endif // PROTON_DRIVER_DEVICE_H_ 49 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/GPU/CudaApi.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_GPU_CUDA_H_ 2 | #define PROTON_DRIVER_GPU_CUDA_H_ 3 | 4 | #include "Driver/Device.h" 5 | #include "cuda.h" 6 | 7 | namespace proton { 8 | 9 | namespace cuda { 10 | 11 | template CUresult init(int flags); 12 | 13 | template CUresult ctxSynchronize(); 14 | 15 | template CUresult ctxGetCurrent(CUcontext *pctx); 16 | 17 | template 18 | CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); 19 | 20 | template CUresult deviceGet(CUdevice *device, int ordinal); 21 | 22 | Device 
getDevice(uint64_t index); 23 | 24 | } // namespace cuda 25 | 26 | } // namespace proton 27 | 28 | #endif // PROTON_DRIVER_GPU_CUDA_H_ 29 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/GPU/HipApi.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_GPU_HIP_H_ 2 | #define PROTON_DRIVER_GPU_HIP_H_ 3 | 4 | #include "Driver/Device.h" 5 | #include "hip/hip_runtime_api.h" 6 | 7 | namespace proton { 8 | 9 | namespace hip { 10 | 11 | template hipError_t deviceSynchronize(); 12 | 13 | template 14 | hipError_t deviceGetAttribute(int *value, hipDeviceAttribute_t attribute, 15 | int deviceId); 16 | 17 | template hipError_t getDeviceCount(int *count); 18 | 19 | template 20 | hipError_t getDeviceProperties(hipDeviceProp_t *prop, int deviceId); 21 | 22 | Device getDevice(uint64_t index); 23 | 24 | const std::string getHipArchName(uint64_t index); 25 | 26 | const char *getKernelNameRef(const hipFunction_t f); 27 | const char *getKernelNameRefByPtr(const void *hostFunction, hipStream_t stream); 28 | 29 | } // namespace hip 30 | 31 | } // namespace proton 32 | 33 | #endif // PROTON_DRIVER_GPU_HIP_H_ 34 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Driver/GPU/HsaApi.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DRIVER_GPU_HSA_H_ 2 | #define PROTON_DRIVER_GPU_HSA_H_ 3 | 4 | #include "Driver/Device.h" 5 | #include "hsa/hsa_ext_amd.h" 6 | 7 | namespace proton { 8 | 9 | namespace hsa { 10 | 11 | template 12 | hsa_status_t agentGetInfo(hsa_agent_t agent, hsa_agent_info_t attribute, 13 | void *value); 14 | 15 | hsa_status_t iterateAgents(hsa_status_t (*callback)(hsa_agent_t agent, 16 | void *data), 17 | void *data); 18 | 19 | } // namespace hsa 20 | 21 | } // namespace proton 22 | 23 | #endif // PROTON_DRIVER_GPU_HSA_H_ 24 | 
-------------------------------------------------------------------------------- /third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_PROFILER_CUPTI_PROFILER_H_ 2 | #define PROTON_PROFILER_CUPTI_PROFILER_H_ 3 | 4 | #include "Profiler/GPUProfiler.h" 5 | 6 | namespace proton { 7 | 8 | class CuptiProfiler : public GPUProfiler { 9 | public: 10 | CuptiProfiler(); 11 | virtual ~CuptiProfiler(); 12 | 13 | private: 14 | struct CuptiProfilerPimpl; 15 | }; 16 | 17 | } // namespace proton 18 | 19 | #endif // PROTON_PROFILER_CUPTI_PROFILER_H_ 20 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_PROFILER_ROCTRACER_PROFILER_H_ 2 | #define PROTON_PROFILER_ROCTRACER_PROFILER_H_ 3 | 4 | #include "Profiler/GPUProfiler.h" 5 | 6 | namespace proton { 7 | 8 | class RoctracerProfiler : public GPUProfiler { 9 | public: 10 | RoctracerProfiler(); 11 | virtual ~RoctracerProfiler(); 12 | 13 | private: 14 | struct RoctracerProfilerPimpl; 15 | }; 16 | 17 | } // namespace proton 18 | 19 | #endif // PROTON_PROFILER_ROCTRACER_PROFILER_H_ 20 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Proton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_H_ 2 | #define PROTON_H_ 3 | 4 | #include "Context/Context.h" 5 | #include "Data/Data.h" 6 | #include "Data/Metric.h" 7 | #include "Session/Session.h" 8 | 9 | #endif // PROTON_H_ 10 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Atomic.h: -------------------------------------------------------------------------------- 1 | #ifndef 
PROTON_UTILITY_ATOMIC_H_ 2 | #define PROTON_UTILITY_ATOMIC_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace proton { 8 | 9 | template T atomicMax(std::atomic &target, T value) { 10 | T current = target.load(); 11 | while (current < value && !target.compare_exchange_weak(current, value)) 12 | ; 13 | return current; 14 | } 15 | 16 | template T atomicMin(std::atomic &target, T value) { 17 | T current = target.load(); 18 | while (current > value && !target.compare_exchange_weak(current, value)) 19 | ; 20 | return current; 21 | } 22 | 23 | template 24 | void doubleCheckedLock(Condition enterCondition, std::mutex &lock, 25 | Function function) { 26 | if (!enterCondition()) 27 | return; 28 | 29 | std::unique_lock guard(lock); 30 | 31 | if (!enterCondition()) 32 | return; 33 | 34 | function(); 35 | } 36 | 37 | } // namespace proton 38 | 39 | #endif // PROTON_UTILITY_ATOMIC_H_ 40 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Errors.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_ERRORS_H_ 2 | #define PROTON_UTILITY_ERRORS_H_ 3 | 4 | #include 5 | 6 | namespace proton { 7 | 8 | class NotImplemented : public std::logic_error { 9 | public: 10 | NotImplemented() : std::logic_error("Not yet implemented") {}; 11 | }; 12 | 13 | } // namespace proton 14 | 15 | #endif // PROTON_UTILITY_ERRORS_H_ 16 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Set.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_SET_H_ 2 | #define PROTON_UTILITY_SET_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace proton { 8 | 9 | /// A simple thread safe set with read/write lock. 
10 | template > 11 | class ThreadSafeSet { 12 | public: 13 | ThreadSafeSet() = default; 14 | 15 | void insert(const Key &key) { 16 | std::unique_lock lock(mutex); 17 | set.insert(key); 18 | } 19 | 20 | bool contain(const Key &key) { 21 | std::shared_lock lock(mutex); 22 | auto it = set.find(key); 23 | if (it == set.end()) 24 | return false; 25 | return true; 26 | } 27 | 28 | bool erase(const Key &key) { 29 | std::unique_lock lock(mutex); 30 | return set.erase(key) > 0; 31 | } 32 | 33 | void clear() { 34 | std::unique_lock lock(mutex); 35 | set.clear(); 36 | } 37 | 38 | private: 39 | Container set; 40 | std::shared_mutex mutex; 41 | }; 42 | 43 | } // namespace proton 44 | 45 | #endif // PROTON_UTILITY_MAP_H_ 46 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Singleton.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_SINGLETON_H_ 2 | #define PROTON_UTILITY_SINGLETON_H_ 3 | 4 | namespace proton { 5 | 6 | template class Singleton { 7 | public: 8 | Singleton(const Singleton &) = delete; 9 | Singleton &operator=(const Singleton &) = delete; 10 | 11 | static T &instance() { 12 | static T _; 13 | return _; 14 | } 15 | 16 | protected: 17 | Singleton() = default; 18 | }; 19 | 20 | } // namespace proton 21 | 22 | #endif // PROTON_UTILITY_SINGLETON_H_ 23 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/String.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_STRING_H_ 2 | #define PROTON_UTILITY_STRING_H_ 3 | 4 | #include 5 | 6 | namespace proton { 7 | 8 | inline std::string toLower(const std::string &str) { 9 | std::string lower; 10 | for (auto c : str) { 11 | lower += tolower(c); 12 | } 13 | return lower; 14 | } 15 | 16 | inline std::string replace(const std::string &str, const std::string &src, 17 | const 
std::string &dst) { 18 | std::string replaced = str; 19 | size_t pos = replaced.find(src); 20 | while (pos != std::string::npos) { 21 | replaced.replace(pos, src.length(), dst); 22 | pos += dst.length(); 23 | pos = replaced.find(src, pos); 24 | } 25 | return replaced; 26 | } 27 | 28 | inline bool endWith(const std::string &str, const std::string &sub) { 29 | if (str.length() < sub.length()) { 30 | return false; 31 | } 32 | return str.compare(str.length() - sub.length(), sub.length(), sub) == 0; 33 | } 34 | 35 | inline std::string trim(const std::string &str) { 36 | size_t start = 0; 37 | size_t end = str.length(); 38 | while (start < end && isspace(str[start])) { 39 | start++; 40 | } 41 | while (end > start && isspace(str[end - 1])) { 42 | end--; 43 | } 44 | return str.substr(start, end - start); 45 | } 46 | 47 | } // namespace proton 48 | 49 | #endif // PROTON_UTILITY_STRING_H_ 50 | -------------------------------------------------------------------------------- /third_party/proton/csrc/include/Utility/Traits.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_UTILITY_TRAITS_H_ 2 | #define PROTON_UTILITY_TRAITS_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace proton { 8 | template 9 | struct is_one_of : std::disjunction...> {}; 10 | } // namespace proton 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Context) 2 | add_subdirectory(Data) 3 | add_subdirectory(Driver) 4 | add_subdirectory(Profiler) 5 | add_subdirectory(Session) 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Context/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(ProtonContext 2 | Context.cpp 3 | Python.cpp 4 | 
Shadow.cpp 5 | ) 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Context/Context.cpp: -------------------------------------------------------------------------------- 1 | #include "Context/Context.h" 2 | 3 | namespace proton { 4 | 5 | /*static*/ thread_local std::optional ContextSource::state = 6 | std::nullopt; 7 | 8 | std::atomic Scope::scopeIdCounter{1}; 9 | 10 | /*static*/ thread_local std::map 11 | ThreadLocalOpInterface::opInProgress; 12 | 13 | } // namespace proton 14 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Data/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(ProtonData 2 | Data.cpp 3 | TraceData.cpp 4 | TreeData.cpp 5 | ) 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Data/Data.cpp: -------------------------------------------------------------------------------- 1 | #include "Data/Data.h" 2 | #include "Utility/String.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace proton { 11 | 12 | void Data::dump(OutputFormat outputFormat) { 13 | std::shared_lock lock(mutex); 14 | 15 | std::unique_ptr out; 16 | if (path.empty() || path == "-") { 17 | out.reset(new std::ostream(std::cout.rdbuf())); // Redirecting to cout 18 | } else { 19 | out.reset(new std::ofstream( 20 | path + "." 
+ 21 | outputFormatToString(outputFormat))); // Opening a file for output 22 | } 23 | doDump(*out, outputFormat); 24 | } 25 | 26 | OutputFormat parseOutputFormat(const std::string &outputFormat) { 27 | if (toLower(outputFormat) == "hatchet") { 28 | return OutputFormat::Hatchet; 29 | } 30 | throw std::runtime_error("Unknown output format: " + outputFormat); 31 | } 32 | 33 | const std::string outputFormatToString(OutputFormat outputFormat) { 34 | if (outputFormat == OutputFormat::Hatchet) { 35 | return "hatchet"; 36 | } 37 | throw std::runtime_error("Unknown output format: " + 38 | std::to_string(static_cast(outputFormat))); 39 | } 40 | 41 | } // namespace proton 42 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Data/TraceData.cpp: -------------------------------------------------------------------------------- 1 | #include "Data/TraceData.h" 2 | #include "Utility/Errors.h" 3 | 4 | #include 5 | 6 | namespace proton { 7 | 8 | void TraceData::enterScope(const Scope &scope) { throw NotImplemented(); } 9 | 10 | void TraceData::exitScope(const Scope &scope) { throw NotImplemented(); } 11 | 12 | size_t TraceData::addOp(size_t scopeId, const std::string &name) { 13 | throw NotImplemented(); 14 | } 15 | 16 | void TraceData::addMetric(size_t scopeId, std::shared_ptr metric) { 17 | throw NotImplemented(); 18 | } 19 | 20 | void TraceData::addMetrics( 21 | size_t scopeId, const std::map &metrics) { 22 | throw NotImplemented(); 23 | } 24 | 25 | void TraceData::clear() { throw NotImplemented(); } 26 | 27 | void TraceData::doDump(std::ostream &os, OutputFormat outputFormat) const { 28 | throw NotImplemented(); 29 | } 30 | 31 | } // namespace proton 32 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Driver/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(ProtonDriver 2 | Device.cpp 3 | 
GPU/CudaApi.cpp 4 | GPU/CuptiApi.cpp 5 | GPU/HipApi.cpp 6 | GPU/HsaApi.cpp 7 | GPU/RoctracerApi.cpp 8 | ) 9 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Driver/Device.cpp: -------------------------------------------------------------------------------- 1 | #include "Driver/Device.h" 2 | #include "Driver/GPU/CudaApi.h" 3 | #include "Driver/GPU/HipApi.h" 4 | 5 | #include "Utility/Errors.h" 6 | 7 | namespace proton { 8 | 9 | Device getDevice(DeviceType type, uint64_t index) { 10 | if (type == DeviceType::CUDA) { 11 | return cuda::getDevice(index); 12 | } 13 | if (type == DeviceType::HIP) { 14 | return hip::getDevice(index); 15 | } 16 | throw std::runtime_error("DeviceType not supported"); 17 | } 18 | 19 | const std::string getDeviceTypeString(DeviceType type) { 20 | if (type == DeviceType::CUDA) { 21 | return DeviceTraits::name; 22 | } else if (type == DeviceType::HIP) { 23 | return DeviceTraits::name; 24 | } 25 | throw std::runtime_error("DeviceType not supported"); 26 | } 27 | 28 | } // namespace proton 29 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Driver/GPU/HsaApi.cpp: -------------------------------------------------------------------------------- 1 | #include "Driver/GPU/HsaApi.h" 2 | #include "Driver/Dispatch.h" 3 | 4 | namespace proton { 5 | 6 | namespace hsa { 7 | 8 | struct ExternLibHsa : public ExternLibBase { 9 | using RetType = hsa_status_t; 10 | static constexpr const char *name = "libhsa-runtime64.so"; 11 | static constexpr const char *defaultDir = ""; 12 | static constexpr RetType success = HSA_STATUS_SUCCESS; 13 | static void *lib; 14 | }; 15 | 16 | void *ExternLibHsa::lib = nullptr; 17 | 18 | DEFINE_DISPATCH(ExternLibHsa, agentGetInfo, hsa_agent_get_info, hsa_agent_t, 19 | hsa_agent_info_t, void *); 20 | 21 | hsa_status_t iterateAgents(hsa_status_t (*callback)(hsa_agent_t agent, 22 | void *data), 23 | void *data) { 
24 | typedef hsa_status_t (*hsa_iterate_agents_t)( 25 | hsa_status_t (*)(hsa_agent_t, void *), void *data); 26 | static hsa_iterate_agents_t func = nullptr; 27 | Dispatch::init(ExternLibHsa::name, &ExternLibHsa::lib); 28 | if (func == nullptr) 29 | func = reinterpret_cast( 30 | dlsym(ExternLibHsa::lib, "hsa_iterate_agents")); 31 | return (func ? func(callback, data) : HSA_STATUS_ERROR_FATAL); 32 | } 33 | 34 | } // namespace hsa 35 | 36 | } // namespace proton 37 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Profiler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(ProtonProfiler 2 | Cupti/CuptiPCSampling.cpp 3 | Cupti/CuptiProfiler.cpp 4 | RocTracer/RoctracerProfiler.cpp 5 | ) 6 | -------------------------------------------------------------------------------- /third_party/proton/csrc/lib/Session/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_proton_library(ProtonSession 2 | Session.cpp 3 | ) 4 | -------------------------------------------------------------------------------- /third_party/proton/dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) 3 | add_subdirectory(include) 4 | add_subdirectory(lib) 5 | if(TRITON_BUILD_PYTHON_MODULE) 6 | add_triton_plugin(TritonProton ${CMAKE_CURRENT_SOURCE_DIR}/triton_proton.cc) 7 | target_link_libraries(TritonProton PRIVATE ProtonIR Python3::Module pybind11::headers) 8 | endif() 9 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) 2 | 
-------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Proton) 2 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 2 | 3 | set(LLVM_TARGET_DEFINITIONS ProtonOps.td) 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=proton) 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=proton) 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) 7 | mlir_tablegen(Ops.h.inc -gen-op-decls) 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs) 9 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) 10 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) 11 | add_mlir_doc(ProtonDialect ProtonDialect dialects/ -gen-dialect-doc) 12 | add_mlir_doc(ProtonOps ProtonOps dialects/ -gen-op-doc) 13 | add_public_tablegen_target(ProtonTableGen) 14 | 15 | set(LLVM_TARGET_DEFINITIONS ProtonAttrDefs.td) 16 | mlir_tablegen(ProtonAttrDefs.h.inc -gen-attrdef-decls) 17 | mlir_tablegen(ProtonAttrDefs.cpp.inc -gen-attrdef-defs) 18 | add_public_tablegen_target(ProtonAttrDefsIncGen) 19 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_DIALECT_PROTON_IR_DIALECT_H_ 2 | #define TRITON_DIALECT_PROTON_IR_DIALECT_H_ 3 | 4 | #include 
"mlir/Dialect/LLVMIR/LLVMDialect.h" 5 | #include "mlir/IR/BuiltinOps.h" 6 | #include "mlir/IR/Dialect.h" 7 | #include "mlir/IR/PatternMatch.h" 8 | #include "proton/dialect/include/Dialect/Proton/IR/Dialect.h.inc" 9 | #include "proton/dialect/include/Dialect/Proton/IR/OpsEnums.h.inc" 10 | 11 | #define GET_ATTRDEF_CLASSES 12 | #include "proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.h.inc" 13 | 14 | #define GET_OP_CLASSES 15 | #include "proton/dialect/include/Dialect/Proton/IR/Ops.h.inc" 16 | 17 | namespace mlir { 18 | namespace triton { 19 | namespace proton {} // namespace proton 20 | } // namespace triton 21 | } // namespace mlir 22 | 23 | #endif // TRITON_DIALECT_PROTON_IR_DIALECT_H_ 24 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_ATTRDEFS 2 | #define PROTON_ATTRDEFS 3 | 4 | include "mlir/IR/AttrTypeBase.td" 5 | include "ProtonDialect.td" 6 | 7 | class Proton_Attr traits = [], 8 | string baseCppClass = "::mlir::Attribute"> 9 | : AttrDef { 10 | } 11 | 12 | #endif // PROTON_ATTRDEFS 13 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td: -------------------------------------------------------------------------------- 1 | #ifndef PROTON_DIALECT 2 | #define PROTON_DIALECT 3 | 4 | include "mlir/IR/OpBase.td" 5 | 6 | def Proton_Dialect : Dialect { 7 | let name = "proton"; 8 | let cppNamespace = "::mlir::triton::proton"; 9 | 10 | let description = [{ 11 | Proton Dialect provides core ops for building third-party compiler-based 12 | performance profiling and analysis tools. 
13 | }]; 14 | 15 | let dependentDialects = []; 16 | } 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /third_party/proton/dialect/include/TritonProtonToLLVM/PatternTritonProtonOpToLLVM.h: -------------------------------------------------------------------------------- 1 | #ifndef TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H 2 | #define TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H 3 | 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h" 5 | 6 | namespace mlir::triton { 7 | class TargetInfoBase; 8 | namespace proton { 9 | void populateRecordOpToLLVMPattern(LLVMTypeConverter &typeConverter, 10 | RewritePatternSet &patterns, 11 | const TargetInfoBase &targetInfo, 12 | PatternBenefit benefit); 13 | } // namespace proton 14 | } // namespace mlir::triton 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) 2 | add_subdirectory(TritonProtonToLLVM) 3 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Proton) 2 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(ProtonIR 2 | Dialect.cpp 3 | Ops.cpp 4 | 5 | DEPENDS 6 | 
ProtonTableGen 7 | ProtonAttrDefsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRLLVMDialect 11 | TritonIR 12 | TritonGPUIR 13 | ) 14 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp: -------------------------------------------------------------------------------- 1 | #include "mlir/IR/DialectImplementation.h" 2 | #include "mlir/IR/OpImplementation.h" 3 | 4 | // clang-format off 5 | #include "Dialect/Proton/IR/Dialect.h" 6 | #include "Dialect/Proton/IR/Dialect.cpp.inc" 7 | // clang-format on 8 | 9 | using namespace mlir; 10 | using namespace mlir::triton::proton; 11 | 12 | void mlir::triton::proton::ProtonDialect::initialize() { 13 | addAttributes< 14 | #define GET_ATTRDEF_LIST 15 | #include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc" 16 | >(); 17 | 18 | addOperations< 19 | #define GET_OP_LIST 20 | #include "Dialect/Proton/IR/Ops.cpp.inc" 21 | >(); 22 | } 23 | 24 | #define GET_ATTRDEF_CLASSES 25 | #include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc" 26 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp: -------------------------------------------------------------------------------- 1 | #include "Dialect/Proton/IR/Dialect.h" 2 | #include "mlir/IR/Builders.h" 3 | #include "mlir/IR/BuiltinAttributes.h" 4 | #include "mlir/IR/BuiltinTypes.h" 5 | #include "mlir/IR/OperationSupport.h" 6 | #include "mlir/Interfaces/FunctionImplementation.h" 7 | #include "mlir/Interfaces/FunctionInterfaces.h" 8 | #include "mlir/Support/LLVM.h" 9 | #include "triton/Dialect/Triton/IR/Dialect.h" 10 | #include "triton/Dialect/Triton/IR/Types.h" 11 | #include "triton/Dialect/Triton/IR/Utility.h" 12 | 13 | #define GET_OP_CLASSES 14 | #include "Dialect/Proton/IR/Ops.cpp.inc" 15 | #include "Dialect/Proton/IR/OpsEnums.cpp.inc" 16 | 17 | namespace mlir { 18 | namespace triton { 19 | namespace proton { 20 | 21 | // -- RecordOp 
-- 22 | void RecordOp::getEffects( 23 | SmallVectorImpl> 24 | &effects) { 25 | effects.emplace_back(MemoryEffects::Write::get(), 26 | SideEffects::DefaultResource::get()); 27 | effects.emplace_back(MemoryEffects::Read::get(), 28 | SideEffects::DefaultResource::get()); 29 | } 30 | 31 | } // namespace proton 32 | } // namespace triton 33 | } // namespace mlir 34 | -------------------------------------------------------------------------------- /third_party/proton/dialect/lib/TritonProtonToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_library(TritonProtonToLLVM 2 | RecordOpToLLVM.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | ProtonIR 6 | ) 7 | -------------------------------------------------------------------------------- /third_party/proton/dialect/triton_proton.cc: -------------------------------------------------------------------------------- 1 | #include "Dialect/Proton/IR/Dialect.h" 2 | #include "mlir/Pass/PassManager.h" 3 | #include "passes.h" 4 | #include 5 | #include 6 | #include 7 | 8 | namespace py = pybind11; 9 | 10 | void init_triton_proton(py::module &&m) { 11 | auto passes = m.def_submodule("passes"); 12 | 13 | // load dialects 14 | m.def("load_dialects", [](mlir::MLIRContext &context) { 15 | mlir::DialectRegistry registry; 16 | registry.insert(); 17 | context.appendDialectRegistry(registry); 18 | context.loadAllAvailableDialects(); 19 | }); 20 | } 21 | -------------------------------------------------------------------------------- /third_party/proton/proton/_C/include: -------------------------------------------------------------------------------- 1 | ../../csrc/include/ -------------------------------------------------------------------------------- /third_party/proton/proton/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .scope import scope, cpu_timed_scope, enter_scope, exit_scope 3 | from .state import state, 
enter_state, exit_state 4 | from .profile import ( 5 | start, 6 | activate, 7 | deactivate, 8 | finalize, 9 | profile, 10 | DEFAULT_PROFILE_NAME, 11 | ) 12 | from . import context 13 | -------------------------------------------------------------------------------- /third_party/proton/proton/context.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from triton._C.libproton import proton as libproton 3 | from .flags import get_profiling_on 4 | 5 | 6 | def depth(session: Optional[int] = 0) -> Optional[int]: 7 | """ 8 | Get the depth of the context. 9 | 10 | Args: 11 | session (int): The session ID of the profiling session. Defaults to 0. 12 | 13 | Returns: 14 | depth (int or None): The depth of the context. If profiling is off, returns None. 15 | """ 16 | if not get_profiling_on(): 17 | return None 18 | return libproton.get_context_depth(session) 19 | -------------------------------------------------------------------------------- /third_party/proton/proton/flags.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the global flags used in the proton package. 3 | """ 4 | 5 | # Whether to enable profiling. Default is False. 6 | profiling_on = False 7 | # Whether the script is run from the command line. Default is False. 
8 | command_line = False 9 | 10 | 11 | def set_profiling_on(): 12 | global profiling_on 13 | profiling_on = True 14 | 15 | 16 | def set_profiling_off(): 17 | global profiling_on 18 | profiling_on = False 19 | 20 | 21 | def get_profiling_on(): 22 | global profiling_on 23 | return profiling_on 24 | 25 | 26 | def set_command_line(): 27 | global command_line 28 | command_line = True 29 | 30 | 31 | def is_command_line(): 32 | global command_line 33 | return command_line 34 | -------------------------------------------------------------------------------- /third_party/proton/proton/hook.py: -------------------------------------------------------------------------------- 1 | from .state import enter_state, exit_state 2 | from .scope import enter_scope, exit_scope 3 | from triton.compiler import CompiledKernel, LazyDict 4 | 5 | COMPUTE_METADATA_SCOPE_NAME = "__proton_launch_metadata" 6 | 7 | 8 | class TritonHook: 9 | flops_width = [8, 16, 32, 64] 10 | metrics = [f"flops{width}" for width in flops_width] + ["bytes"] + ["flops"] 11 | 12 | @staticmethod 13 | def enter(lazy_dict: LazyDict) -> None: 14 | enter_state(COMPUTE_METADATA_SCOPE_NAME) 15 | metadata = lazy_dict.get() 16 | exit_state() 17 | fn_metrics = {k: metadata[k] for k in TritonHook.metrics if k in metadata} 18 | enter_scope(metadata["name"], triton_op=True, metrics=fn_metrics) 19 | 20 | @staticmethod 21 | def exit(lazy_dict: LazyDict) -> None: 22 | exit_scope(triton_op=True) 23 | 24 | 25 | def register_triton_hook() -> None: 26 | if CompiledKernel.launch_enter_hook is None: 27 | CompiledKernel.launch_enter_hook = TritonHook.enter 28 | CompiledKernel.launch_exit_hook = TritonHook.exit 29 | 30 | 31 | def unregister_triton_hook() -> None: 32 | if CompiledKernel.launch_enter_hook == TritonHook.enter: 33 | CompiledKernel.launch_enter_hook = None 34 | CompiledKernel.launch_exit_hook = None 35 | -------------------------------------------------------------------------------- /third_party/proton/proton/language.py: 
-------------------------------------------------------------------------------- 1 | from triton.language import core as tl 2 | from triton.language.core import builtin 3 | import warnings 4 | 5 | 6 | @builtin 7 | def record(isStart: bool, regionId: int, _builder=None): 8 | warnings.warn( 9 | "\nWarning the proton language module within Proton contains under development features that are not intended to be used outside of the core development team" 10 | ) 11 | return tl.tensor(_builder.create_proton_record(isStart, regionId), tl.void) 12 | -------------------------------------------------------------------------------- /third_party/proton/test/examples/frame.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "children": [ 4 | { 5 | "children": [ 6 | { 7 | "children": [], 8 | "frame": { 9 | "name": "/home/user/projects/example.py/test.py:1@foo", 10 | "type": "function" 11 | }, 12 | "metrics": { 13 | "count": 1, 14 | "device_id": "0", 15 | "device_type": "HIP", 16 | "time (ns)": 204800 17 | } 18 | } 19 | ], 20 | "frame": { 21 | "name": "test0" 22 | }, 23 | "metrics": {} 24 | }, 25 | { 26 | "children": [], 27 | "frame": { 28 | "name": "test1" 29 | }, 30 | "metrics": { 31 | "count": 1, 32 | "device_id": "0", 33 | "device_type": "HIP", 34 | "time (ns)": 204800 35 | } 36 | } 37 | ], 38 | "frame": { 39 | "name": "ROOT", 40 | "type": "function" 41 | }, 42 | "metrics": { 43 | "count": 0, 44 | "time (ns)": 0 45 | } 46 | }, 47 | { 48 | "HIP": { 49 | "0": { 50 | "arch": "gfx90a", 51 | "bus_width": 4096, 52 | "clock_rate": 1700000, 53 | "memory_clock_rate": 1600000, 54 | "num_sms": 104 55 | } 56 | } 57 | } 58 | ] 59 | -------------------------------------------------------------------------------- /third_party/proton/test/helper.py: -------------------------------------------------------------------------------- 1 | import triton.profiler as proton 2 | 3 | import torch 4 | import sys 5 | 6 | from helper_kernels import 
custom_add 7 | 8 | 9 | def main(): 10 | a = torch.zeros(1, device="cuda") 11 | with proton.scope("test"): 12 | custom_add[(1, )](a) 13 | 14 | 15 | def test_main(): 16 | main() 17 | 18 | 19 | if __name__ == "__main__": 20 | if sys.argv[1] == "test": 21 | main() 22 | -------------------------------------------------------------------------------- /third_party/proton/test/helper_kernels.py: -------------------------------------------------------------------------------- 1 | import triton.language as tl 2 | import triton 3 | 4 | 5 | @triton.jit 6 | def custom_add(a_ptr): 7 | tl.store(a_ptr, 1.0) 8 | -------------------------------------------------------------------------------- /third_party/proton/test/test_record.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pathlib 3 | 4 | import triton 5 | import triton.language as tl 6 | import triton.profiler.language as pl 7 | 8 | 9 | def test_proton_record(tmp_path: pathlib.Path): 10 | 11 | @triton.jit 12 | def add_kernel( 13 | x_ptr, 14 | y_ptr, 15 | output_ptr, 16 | n_elements, 17 | BLOCK_SIZE: tl.constexpr, 18 | ): 19 | pid = tl.program_id(axis=0) 20 | block_start = pid * BLOCK_SIZE 21 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 22 | mask = offsets < n_elements 23 | x = tl.load(x_ptr + offsets, mask=mask) 24 | pl.record(True, 0) 25 | y = tl.load(y_ptr + offsets, mask=mask) 26 | pl.record(False, 0) 27 | output = x + y 28 | tl.store(output_ptr + offsets, output, mask=mask) 29 | 30 | torch.manual_seed(0) 31 | size = 2**12 32 | x = torch.rand(size, device='cuda') 33 | y = torch.rand(size, device='cuda') 34 | output = torch.empty_like(x) 35 | n_elements = output.numel() 36 | grid = (1, 1, 1) 37 | pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) 38 | ttir = pgm.asm['ttir'] 39 | assert "proton.record() {isStart = true, regionId = 0 : i32}" in ttir 40 | assert "proton.record() {isStart = false, regionId = 0 : i32}" in ttir 41 | 
-------------------------------------------------------------------------------- /unittest/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_ut( 2 | NAME TestTritonAnalysis 3 | SRCS UtilityTest.cpp 4 | LIBS 5 | TritonAnalysis 6 | TritonIR 7 | TritonGPUIR 8 | ) 9 | -------------------------------------------------------------------------------- /unittest/Analysis/UtilityTest.cpp: -------------------------------------------------------------------------------- 1 | #include "triton/Dialect/Triton/IR/Utility.h" 2 | 3 | #include "llvm/Support/Signals.h" 4 | #include 5 | 6 | namespace mlir { 7 | 8 | TEST(Analysis, reorder) { 9 | SmallVector shape({10, 20, 30}); 10 | { 11 | SmallVector order({2, 1, 0}); 12 | auto reordered = triton::applyPermutation(shape, order); 13 | EXPECT_EQ(reordered[0], 30); 14 | EXPECT_EQ(reordered[1], 20); 15 | EXPECT_EQ(reordered[2], 10); 16 | } 17 | { 18 | SmallVector order({1, 0, 2}); 19 | auto reordered = triton::applyPermutation(shape, order); 20 | EXPECT_EQ(reordered[0], 20); 21 | EXPECT_EQ(reordered[1], 10); 22 | EXPECT_EQ(reordered[2], 30); 23 | } 24 | } 25 | 26 | } // namespace mlir 27 | 28 | int main(int argc, char *argv[]) { 29 | llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); 30 | testing::InitGoogleTest(&argc, argv); 31 | return RUN_ALL_TESTS(); 32 | } 33 | -------------------------------------------------------------------------------- /unittest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Tools) 4 | -------------------------------------------------------------------------------- /unittest/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TritonGPU) 2 | -------------------------------------------------------------------------------- 
/unittest/Dialect/TritonGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_ut( 2 | NAME TestSwizzling 3 | SRCS SwizzleTest.cpp 4 | LIBS TritonGPUIR TritonNvidiaGPUIR 5 | ) 6 | add_triton_ut( 7 | NAME Dialect 8 | SRCS DialectTest.cpp 9 | LIBS TritonGPUIR 10 | ) 11 | add_triton_ut( 12 | NAME LinearLayoutConversions 13 | SRCS LinearLayoutConversionsTest.cpp 14 | LIBS TritonGPUIR 15 | ) 16 | 17 | add_triton_ut( 18 | NAME DumpLayoutTest 19 | SRCS DumpLayoutTest.cpp 20 | LIBS TritonGPUIR 21 | ) 22 | -------------------------------------------------------------------------------- /unittest/Tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_triton_ut( 2 | NAME LinearLayout 3 | SRCS LayoutUtilsTest.cpp LinearLayoutTest.cpp 4 | LIBS TritonTools 5 | ) 6 | -------------------------------------------------------------------------------- /unittest/googletest.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against") 4 | 5 | if(GOOGLETEST_DIR) 6 | set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override") 7 | endif() 8 | 9 | FetchContent_Declare( 10 | googletest 11 | GIT_REPOSITORY https://github.com/google/googletest.git 12 | GIT_TAG release-1.12.1 13 | ) 14 | 15 | FetchContent_GetProperties(googletest) 16 | 17 | if(NOT googletest_POPULATED) 18 | FetchContent_Populate(googletest) 19 | if (MSVC) 20 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 21 | endif() 22 | add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL) 23 | endif() 24 | -------------------------------------------------------------------------------- /utils/nightly.pypirc: -------------------------------------------------------------------------------- 1 | 
[distutils] 2 | Index-servers = 3 | Triton-Nightly 4 | 5 | [Triton-Nightly] 6 | Repository = https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/upload/ 7 | --------------------------------------------------------------------------------