├── .clang-format
├── .editorconfig
├── .git-blame-ignore-revs
├── .github
    ├── CODEOWNERS
    ├── ISSUE_TEMPLATE
    │   ├── bug.yml
    │   ├── config.yml
    │   └── performance.yml
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── build-macos.yml
    │   ├── ci.yml
    │   ├── create_release.yml
    │   ├── documentation.yml
    │   ├── integration-tests-amd.yml
    │   ├── integration-tests-nvidia.yml
    │   ├── llvm-build.yml
    │   ├── llvm-build
    │       ├── almalinux.Dockerfile
    │       └── centos.Dockerfile
    │   ├── pre-commit.yml
    │   ├── runner-preparation.yml
    │   └── wheels.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── RELEASE.md
├── bin
    ├── CMakeLists.txt
    ├── RegisterTritonDialects.h
    ├── triton-llvm-opt.cpp
    ├── triton-lsp.cpp
    ├── triton-opt.cpp
    ├── triton-reduce.cpp
    └── triton-tensor-layout.cpp
├── cmake
    ├── AddTritonUnitTest.cmake
    ├── FindLLVM.cmake
    ├── json-version.txt
    ├── llvm-hash.txt
    └── nvidia-toolchain-version.json
├── docs
    ├── Makefile
    ├── _templates
    │   └── versions.html
    ├── backend
    │   ├── ldmatrixOperand0.svg
    │   └── ldmatrixOperand1.svg
    ├── conf.py
    ├── getting-started
    │   ├── installation.rst
    │   └── tutorials
    │   │   ├── grouped_vs_row_major_ordering.png
    │   │   ├── parallel_reduction.png
    │   │   └── random_bits.png
    ├── index.rst
    ├── meetups
    │   ├── 01-24-2024
    │   │   └── notes.md
    │   ├── 02-20-2024
    │   │   ├── Proton.pdf
    │   │   └── notes.md
    │   ├── 03-12-2025
    │   │   └── notes.md
    │   ├── 04-02-2024
    │   │   └── notes.md
    │   ├── 05-01-2025
    │   │   └── notes.md
    │   ├── 05-07-2024
    │   │   └── notes.md
    │   ├── 07-18-2023
    │   │   └── notes.md
    │   ├── 08-06-2024
    │   │   └── notes.md
    │   ├── 08-22-2023
    │   │   ├── amd-update.pdf
    │   │   ├── intel-xpu-update.pptx
    │   │   └── notes.md
    │   ├── 10-25-2023
    │   │   ├── intel-xpu-update.pdf
    │   │   ├── notes.md
    │   │   └── triton-shared.pptx
    │   ├── 12-13-2023
    │   │   └── notes.md
    │   ├── dev-meetup-2023.md
    │   └── dev_conference_2024.md
    ├── programming-guide
    │   ├── chapter-1
    │   │   ├── cuda-parallel-matmul.png
    │   │   ├── introduction.rst
    │   │   └── triton-parallel-matmul.png
    │   ├── chapter-2
    │   │   ├── halide-iteration.png
    │   │   ├── polyhedral-iteration.png
    │   │   └── related-work.rst
    │   └── chapter-3
    │   │   └── debugging.rst
    └── python-api
    │   ├── triton-semantics.rst
    │   ├── triton.language.extra.cuda.rst
    │   ├── triton.language.rst
    │   ├── triton.rst
    │   └── triton.testing.rst
├── include
    ├── CMakeLists.txt
    └── triton
    │   ├── Analysis
    │       ├── Alias.h
    │       ├── Allocation.h
    │       ├── AxisInfo.h
    │       ├── Membar.h
    │       └── Utility.h
    │   ├── CMakeLists.txt
    │   ├── Conversion
    │       ├── CMakeLists.txt
    │       ├── MLIRTypes.h
    │       ├── TritonGPUToLLVM
    │       │   ├── AllocateSharedMemoryUtility.h
    │       │   ├── AsmFormat.h
    │       │   ├── CMakeLists.txt
    │       │   ├── ElementwiseOpToLLVMBase.h
    │       │   ├── FMADotUtility.h
    │       │   ├── Passes.h
    │       │   ├── Passes.td
    │       │   ├── PatternTritonGPUOpToLLVM.h
    │       │   ├── TargetInfoBase.h
    │       │   ├── TypeConverter.h
    │       │   └── Utility.h
    │       └── TritonToTritonGPU
    │       │   ├── CMakeLists.txt
    │       │   ├── Passes.h
    │       │   └── Passes.td
    │   ├── Dialect
    │       ├── CMakeLists.txt
    │       ├── Triton
    │       │   ├── CMakeLists.txt
    │       │   ├── IR
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── Dialect.h
    │       │   │   ├── DiscardableAttributes.h
    │       │   │   ├── Interfaces.h
    │       │   │   ├── OpInterfaces.h
    │       │   │   ├── Traits.h
    │       │   │   ├── TritonAttrDefs.td
    │       │   │   ├── TritonDialect.td
    │       │   │   ├── TritonInterfaces.td
    │       │   │   ├── TritonOpInterfaces.td
    │       │   │   ├── TritonOps.td
    │       │   │   ├── TritonTypes.td
    │       │   │   ├── Types.h
    │       │   │   └── Utility.h
    │       │   └── Transforms
    │       │   │   ├── ArithTypeConversion.h
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── FunctionTypeConversion.h
    │       │   │   ├── LoopPeeling.h
    │       │   │   ├── Passes.h
    │       │   │   └── Passes.td
    │       ├── TritonGPU
    │       │   ├── CMakeLists.txt
    │       │   ├── IR
    │       │   │   ├── Attributes.h
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── Dialect.h
    │       │   │   ├── LayoutUtility.h
    │       │   │   ├── LinearLayoutConversions.h
    │       │   │   ├── Traits.h
    │       │   │   ├── TritonGPUAttrDefs.td
    │       │   │   ├── TritonGPUDialect.td
    │       │   │   ├── TritonGPUInterfaces.h
    │       │   │   ├── TritonGPUOps.td
    │       │   │   ├── TritonGPUTypeInterfaces.td
    │       │   │   ├── TritonGPUTypes.td
    │       │   │   └── Types.h
    │       │   └── Transforms
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── DecomposeScaledBlocked.h
    │       │   │   ├── MMAv5PipelineUtility.h
    │       │   │   ├── Partition.h
    │       │   │   ├── Passes.h
    │       │   │   ├── Passes.td
    │       │   │   ├── PipelineExpander.h
    │       │   │   ├── PipeliningUtility.h
    │       │   │   ├── Schedule.h
    │       │   │   ├── TritonGPUConversion.h
    │       │   │   ├── Utility.h
    │       │   │   └── WarpSpecialization.h
    │       ├── TritonInstrument
    │       │   ├── CMakeLists.txt
    │       │   ├── IR
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── Dialect.h
    │       │   │   ├── TritonInstrumentDialect.td
    │       │   │   └── TritonInstrumentOps.td
    │       │   └── Transforms
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── Passes.h
    │       │   │   └── Passes.td
    │       └── TritonNvidiaGPU
    │       │   ├── CMakeLists.txt
    │       │   ├── IR
    │       │       ├── CMakeLists.txt
    │       │       ├── Dialect.h
    │       │       ├── TritonNvidiaGPUAttrDefs.td
    │       │       ├── TritonNvidiaGPUDialect.td
    │       │       ├── TritonNvidiaGPUOpInterfaces.td
    │       │       └── TritonNvidiaGPUOps.td
    │       │   └── Transforms
    │       │       ├── CMakeLists.txt
    │       │       ├── Passes.h
    │       │       ├── Passes.td
    │       │       └── TMAUtilities.h
    │   ├── Target
    │       ├── CMakeLists.txt
    │       └── LLVMIR
    │       │   ├── CMakeLists.txt
    │       │   ├── Passes.h
    │       │   └── Passes.td
    │   └── Tools
    │       ├── GenericSwizzling.h
    │       ├── LayoutUtils.h
    │       ├── LinearLayout.h
    │       ├── StrUtil.h
    │       └── Sys
    │           └── GetEnv.hpp
├── lib
    ├── Analysis
    │   ├── Alias.cpp
    │   ├── Allocation.cpp
    │   ├── AxisInfo.cpp
    │   ├── CMakeLists.txt
    │   ├── Membar.cpp
    │   └── Utility.cpp
    ├── CMakeLists.txt
    ├── Conversion
    │   ├── CMakeLists.txt
    │   ├── TritonGPUToLLVM
    │   │   ├── AllocateSharedMemory.cpp
    │   │   ├── AllocateSharedMemoryUtility.cpp
    │   │   ├── AllocateWarpGroups.cpp
    │   │   ├── AssertOpToLLVM.cpp
    │   │   ├── CMakeLists.txt
    │   │   ├── ControlFlowOpToLLVM.cpp
    │   │   ├── ConvertLayoutOpToLLVM.cpp
    │   │   ├── DotOpToLLVM
    │   │   │   ├── FMA.cpp
    │   │   │   └── FMADotUtility.cpp
    │   │   ├── ElementwiseOpToLLVM.cpp
    │   │   ├── FuncOpToLLVM.cpp
    │   │   ├── GatherOpToLLVM.cpp
    │   │   ├── GlobalScratchMemoryAllocation.cpp
    │   │   ├── HistogramOpToLLVM.cpp
    │   │   ├── MakeRangeOpToLLVM.cpp
    │   │   ├── MemoryOpToLLVM.cpp
    │   │   ├── PrintOpToLLVM.cpp
    │   │   ├── ReduceOpToLLVM.cpp
    │   │   ├── ReduceScanCommon.h
    │   │   ├── SPMDOpToLLVM.cpp
    │   │   ├── ScanOpToLLVM.cpp
    │   │   ├── TypeConverter.cpp
    │   │   ├── Utility.cpp
    │   │   └── ViewOpToLLVM.cpp
    │   ├── TritonInstrumentToLLVM
    │   │   ├── CMakeLists.txt
    │   │   └── InstrumentationToLLVM.cpp
    │   └── TritonToTritonGPU
    │   │   ├── CMakeLists.txt
    │   │   ├── RelayoutTritonGPU.cpp
    │   │   ├── TritonGPUConversion.cpp
    │   │   └── TritonToTritonGPUPass.cpp
    ├── Dialect
    │   ├── CMakeLists.txt
    │   ├── Triton
    │   │   ├── CMakeLists.txt
    │   │   ├── IR
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── Canonicalize.td
    │   │   │   ├── Dialect.cpp
    │   │   │   ├── DiscardableAttributes.cpp
    │   │   │   ├── OpInterfaces.cpp
    │   │   │   ├── Ops.cpp
    │   │   │   ├── Traits.cpp
    │   │   │   ├── Types.cpp
    │   │   │   └── Utility.cpp
    │   │   └── Transforms
    │   │   │   ├── ArithTypeConversion.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── Combine.cpp
    │   │   │   ├── Combine.td
    │   │   │   ├── FunctionTypeConversion.cpp
    │   │   │   ├── LoopAwareCSE.cpp
    │   │   │   ├── LoopInvariantCodeMotion.cpp
    │   │   │   ├── LoopPeeling.cpp
    │   │   │   ├── LoopUnroll.cpp
    │   │   │   ├── ReorderBroadcast.cpp
    │   │   │   ├── RewriteTensorDescriptorToPointer.cpp
    │   │   │   └── RewriteTensorPointer.cpp
    │   ├── TritonGPU
    │   │   ├── CMakeLists.txt
    │   │   ├── IR
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── Dialect.cpp
    │   │   │   ├── LayoutUtility.cpp
    │   │   │   ├── LinearLayoutConversions.cpp
    │   │   │   ├── Ops.cpp
    │   │   │   └── Types.cpp
    │   │   └── Transforms
    │   │   │   ├── AccelerateMatmul.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── Canonicalize.cpp
    │   │   │   ├── Coalesce.cpp
    │   │   │   ├── CoalesceAsyncCopy.cpp
    │   │   │   ├── CombineTensorSelectAndIf.cpp
    │   │   │   ├── DecomposeScaledBlocked.cpp
    │   │   │   ├── F32DotTC.cpp
    │   │   │   ├── FuseNestedLoops.cpp
    │   │   │   ├── HoistTMEMAlloc.cpp
    │   │   │   ├── OptimizeAccumulatorInit.cpp
    │   │   │   ├── OptimizeDotOperands.cpp
    │   │   │   ├── OptimizeThreadLocality.cpp
    │   │   │   ├── Pipeliner
    │   │   │       ├── AssignLatencies.cpp
    │   │   │       ├── LowerLoops.cpp
    │   │   │       ├── MMAv5PipelineUtility.cpp
    │   │   │       ├── PipelineExpander.cpp
    │   │   │       ├── PipeliningUtility.cpp
    │   │   │       ├── Schedule.cpp
    │   │   │       ├── ScheduleLoops.cpp
    │   │   │       ├── SoftwarePipeliner.cpp
    │   │   │       ├── TMAStoresPipeline.cpp
    │   │   │       ├── TestPipelineLowerLoop.cpp
    │   │   │       └── WGMMAPipeline.cpp
    │   │   │   ├── Prefetch.cpp
    │   │   │   ├── ReduceDataDuplication.cpp
    │   │   │   ├── RemoveLayoutConversions.cpp
    │   │   │   ├── ReorderInstructions.cpp
    │   │   │   ├── Utility.cpp
    │   │   │   └── WarpSpecialization
    │   │   │       ├── AutomaticWarpSpecialization.cpp
    │   │   │       ├── LoadMMASpecialization.cpp
    │   │   │       ├── OptimizePartitionWarps.cpp
    │   │   │       ├── Partition.cpp
    │   │   │       ├── PartitionBuilder.cpp
    │   │   │       ├── PartitionBuilder.h
    │   │   │       ├── PartitionLoops.cpp
    │   │   │       ├── PartitionScheduling.cpp
    │   │   │       └── RewritePartitionDependencies.cpp
    │   ├── TritonInstrument
    │   │   ├── CMakeLists.txt
    │   │   ├── IR
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── Dialect.cpp
    │   │   │   └── Ops.cpp
    │   │   └── Transforms
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── ConcurrencySanitizer.cpp
    │   └── TritonNvidiaGPU
    │   │   ├── CMakeLists.txt
    │   │   ├── IR
    │   │       ├── CMakeLists.txt
    │   │       ├── Dialect.cpp
    │   │       └── Ops.cpp
    │   │   └── Transforms
    │   │       ├── CMakeLists.txt
    │   │       ├── FenceInsertion.cpp
    │   │       ├── InterleaveTMem.cpp
    │   │       ├── MMALowering.cpp
    │   │       ├── OptimizeDescriptorEncoding.cpp
    │   │       ├── OptimizeTMemLayouts.cpp
    │   │       ├── PlanCTA.cpp
    │   │       ├── PromoteLHSToTMem.cpp
    │   │       ├── ProxFenceInsertion.cpp
    │   │       ├── RemoveTMEMTokens.cpp
    │   │       ├── TMALowering.cpp
    │   │       ├── TMAUtilities.cpp
    │   │       └── TensorMemoryAllocation.cpp
    ├── Instrumentation
    │   ├── CMakeLists.txt
    │   └── PrintLoadStoreMemSpaces.cpp
    ├── Target
    │   ├── CMakeLists.txt
    │   └── LLVMIR
    │   │   ├── CMakeLists.txt
    │   │   ├── LLVMDIScope.cpp
    │   │   ├── LLVMIRBreakPhiStruct.cpp
    │   │   └── LLVMPasses.h
    └── Tools
    │   ├── CMakeLists.txt
    │   ├── GenericSwizzling.cpp
    │   ├── LayoutUtils.cpp
    │   └── LinearLayout.cpp
├── pyproject.toml
├── python
    ├── build_helpers.py
    ├── requirements.txt
    ├── src
    │   ├── gluon_ir.cc
    │   ├── interpreter.cc
    │   ├── ir.cc
    │   ├── ir.h
    │   ├── llvm.cc
    │   ├── main.cc
    │   ├── passes.cc
    │   └── passes.h
    ├── test-requirements.txt
    ├── test
    │   ├── backend
    │   │   ├── extension_backend.c
    │   │   └── test_device_backend.py
    │   ├── conftest.py
    │   ├── gluon
    │   │   ├── test_consan.py
    │   │   ├── test_core.py
    │   │   └── test_frontend.py
    │   ├── kernel_comparison
    │   │   └── kernels.yml
    │   ├── regression
    │   │   ├── test_cast_matmul.py
    │   │   └── test_functional_regressions.py
    │   └── unit
    │   │   ├── blackwell
    │   │       └── test_tmem.py
    │   │   ├── cuda
    │   │       ├── __init__.py
    │   │       ├── test_mixed_io.py
    │   │       ├── test_tensor_descriptor.py
    │   │       ├── test_tma_descriptor.py
    │   │       └── test_tma_store_gemm.py
    │   │   ├── instrumentation
    │   │       └── test_gpuhello.py
    │   │   ├── language
    │   │       ├── print_helper.py
    │   │       ├── test_annotations.py
    │   │       ├── test_block_pointer.py
    │   │       ├── test_compile_errors.py
    │   │       ├── test_compile_only.py
    │   │       ├── test_conversions.py
    │   │       ├── test_core.py
    │   │       ├── test_decorator.py
    │   │       ├── test_frontend.py
    │   │       ├── test_libdevice.py
    │   │       ├── test_line_info.py
    │   │       ├── test_matmul.py
    │   │       ├── test_module.py
    │   │       ├── test_mxfp.py
    │   │       ├── test_pipeliner.py
    │   │       ├── test_random.py
    │   │       ├── test_reproducer.py
    │   │       ├── test_standard.py
    │   │       ├── test_subprocess.py
    │   │       ├── test_tensor_descriptor.py
    │   │       ├── test_tuple.py
    │   │       └── test_warp_specialization.py
    │   │   ├── runtime
    │   │       ├── test_autotuner.py
    │   │       ├── test_bindings.py
    │   │       ├── test_build.py
    │   │       ├── test_cache.py
    │   │       ├── test_compilation_listener.py
    │   │       ├── test_cublas.py
    │   │       ├── test_driver.py
    │   │       ├── test_jit.py
    │   │       ├── test_launch.py
    │   │       └── test_subproc.py
    │   │   ├── test_debug.py
    │   │   ├── test_debug_dump.py
    │   │   ├── test_filecheck.py
    │   │   ├── test_knobs.py
    │   │   ├── test_perf_warning.py
    │   │   └── tools
    │   │       ├── test_aot.py
    │   │       ├── test_disasm.py
    │   │       └── test_irsource.py
    ├── triton
    │   ├── __init__.py
    │   ├── _filecheck.py
    │   ├── _internal_testing.py
    │   ├── _utils.py
    │   ├── backends
    │   │   ├── __init__.py
    │   │   ├── compiler.py
    │   │   └── driver.py
    │   ├── compiler
    │   │   ├── __init__.py
    │   │   ├── code_generator.py
    │   │   ├── compiler.py
    │   │   ├── errors.py
    │   │   └── make_launcher.py
    │   ├── errors.py
    │   ├── experimental
    │   │   ├── __init__.py
    │   │   └── gluon
    │   │   │   ├── __init__.py
    │   │   │   ├── _compiler.py
    │   │   │   ├── _runtime.py
    │   │   │   ├── language
    │   │   │       ├── __init__.py
    │   │   │       ├── _core.py
    │   │   │       ├── _layouts.py
    │   │   │       ├── _math.py
    │   │   │       ├── _semantic.py
    │   │   │       ├── _standard.py
    │   │   │       └── nvidia
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── ampere
    │   │   │       │       ├── __init__.py
    │   │   │       │       ├── async_copy.py
    │   │   │       │       └── mbarrier.py
    │   │   │       │   ├── blackwell
    │   │   │       │       ├── __init__.py
    │   │   │       │       └── tma.py
    │   │   │       │   └── hopper
    │   │   │       │       ├── __init__.py
    │   │   │       │       ├── mbarrier.py
    │   │   │       │       └── tma.py
    │   │   │   └── nvidia
    │   │   │       ├── __init__.py
    │   │   │       ├── blackwell.py
    │   │   │       └── hopper.py
    │   ├── knobs.py
    │   ├── language
    │   │   ├── __init__.py
    │   │   ├── core.py
    │   │   ├── extra
    │   │   │   ├── __init__.py
    │   │   │   └── libdevice.py
    │   │   ├── math.py
    │   │   ├── random.py
    │   │   ├── semantic.py
    │   │   └── standard.py
    │   ├── runtime
    │   │   ├── __init__.py
    │   │   ├── _allocation.py
    │   │   ├── _async_compile.py
    │   │   ├── autotuner.py
    │   │   ├── build.py
    │   │   ├── cache.py
    │   │   ├── driver.py
    │   │   ├── errors.py
    │   │   ├── interpreter.py
    │   │   └── jit.py
    │   ├── testing.py
    │   └── tools
    │   │   ├── __init__.py
    │   │   ├── build_extern.py
    │   │   ├── compile.py
    │   │   ├── disasm.py
    │   │   ├── link.py
    │   │   ├── mxfp.py
    │   │   └── tensor_descriptor.py
    ├── triton_kernels
    │   ├── .gitignore
    │   ├── bench
    │   │   └── bench_mlp.py
    │   ├── pyproject.toml
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   ├── test_compaction.py
    │   │   ├── test_matmul.py
    │   │   ├── test_mxfp.py
    │   │   ├── test_routing.py
    │   │   └── test_swiglu.py
    │   └── triton_kernels
    │   │   ├── __init__.py
    │   │   ├── compaction.py
    │   │   ├── compaction_details
    │   │       └── _masked_compaction.py
    │   │   ├── datastruct.py
    │   │   ├── matmul_ogs.py
    │   │   ├── matmul_ogs_details
    │   │       ├── _common.py
    │   │       ├── _finalize_matmul.py
    │   │       ├── _matmul_ogs.py
    │   │       ├── _p_matmul_ogs.py
    │   │       ├── _weight_transpose.py
    │   │       ├── fast_contiguous.py
    │   │       ├── opt_flags.py
    │   │       ├── opt_flags_amd.py
    │   │       └── opt_flags_nvidia.py
    │   │   ├── numerics.py
    │   │   ├── numerics_details
    │   │       ├── __init__.py
    │   │       ├── flexpoint.py
    │   │       └── mxfp.py
    │   │   ├── proton_opts.py
    │   │   ├── reduction_details
    │   │       └── reduce_bitmatrix.py
    │   │   ├── routing.py
    │   │   ├── routing_details
    │   │       ├── _expt_data.py
    │   │       └── _routing_compute.py
    │   │   ├── specialize.py
    │   │   ├── swiglu.py
    │   │   ├── swiglu_details
    │   │       └── _swiglu.py
    │   │   ├── target_info.py
    │   │   ├── testing.py
    │   │   ├── topk.py
    │   │   └── topk_details
    │   │       ├── __init__.py
    │   │       ├── _topk_backward.py
    │   │       └── _topk_forward.py
    └── tutorials
    │   ├── 01-vector-add.py
    │   ├── 02-fused-softmax.py
    │   ├── 03-matrix-multiplication.py
    │   ├── 04-low-memory-dropout.py
    │   ├── 05-layer-norm.py
    │   ├── 06-fused-attention.py
    │   ├── 07-extern-functions.py
    │   ├── 08-grouped-gemm.py
    │   ├── 09-persistent-matmul.py
    │   ├── 10-block-scaled-matmul.py
    │   ├── 11-programmatic-dependent-launch.py
    │   ├── README.rst
    │   └── gluon
    │       └── 01-attention-forward.py
├── scripts
    └── build-llvm-project.sh
├── setup.py
├── test
    ├── Analysis
    │   ├── amd
    │   │   └── test-alignment.mlir
    │   ├── test-alias.mlir
    │   ├── test-alignment.mlir
    │   ├── test-allocation.mlir
    │   ├── test-membar-ttng.mlir
    │   └── test-membar.mlir
    ├── CMakeLists.txt
    ├── Conversion
    │   ├── allocate_shared_memory.mlir
    │   ├── allocate_warp_groups.mlir
    │   ├── amd
    │   │   ├── allocate_shared_memory.mlir
    │   │   ├── amdgpu_membar.mlir
    │   │   ├── async-ops-alias-scopes.mlir
    │   │   ├── async_ops_to_llvm.mlir
    │   │   ├── async_ops_to_llvm_invalid.mlir
    │   │   ├── buffer_load_store.mlir
    │   │   ├── buffer_load_to_local_to_llvm.mlir
    │   │   ├── builtin_func_to_llvm.mlir
    │   │   ├── compute-base-ptr.mlir
    │   │   ├── dedup-by-constancy.mlir
    │   │   ├── ds_transpose.mlir
    │   │   ├── fp_to_fp.mlir
    │   │   ├── in_thread_transpose.mlir
    │   │   ├── invalid_concat_op.mlir
    │   │   ├── invalid_extractslice_to_llvm.mlir
    │   │   ├── load_store.mlir
    │   │   ├── math-denorm-handling.mlir
    │   │   ├── mfma-shortcut.mlir
    │   │   ├── minmax.mlir
    │   │   ├── tritongpu_to_llvm.mlir
    │   │   ├── tritongpu_to_llvm_rdna.mlir
    │   │   └── tritongpu_wmma_dot_to_llvm.mlir
    │   ├── atomic_ldst.mlir
    │   ├── cvt_to_llvm.mlir.unsupported
    │   ├── dedup-by-constancy.mlir
    │   ├── divide-by-0.mlir
    │   ├── gather_to_llvm.mlir.unsupported
    │   ├── nvgpu_to_llvm.mlir
    │   ├── reduce_to_llvm.mlir
    │   ├── relayout_tritongpu.mlir
    │   ├── scan_to_llvm.mlir
    │   ├── tma_to_llvm.mlir
    │   ├── triton_to_tritongpu.mlir
    │   ├── tritongpu_to_llvm.mlir
    │   ├── tritongpu_to_llvm_blackwell.mlir
    │   ├── tritongpu_to_llvm_block_dot_shortcut.mlir
    │   ├── tritongpu_to_llvm_debug.mlir
    │   ├── tritongpu_to_llvm_hopper.mlir
    │   ├── tritongpu_to_llvm_hopper_ptx80.mlir
    │   ├── tritongpu_to_llvm_volta.mlir
    │   ├── tritongpu_to_ptx.mlir
    │   ├── tritonnvidiagpu_to_llvm.mlir
    │   └── warp_specialize_to_llvm.mlir
    ├── Hopper
    │   ├── CMakeLists.txt
    │   └── WarpSpecialization
    │   │   ├── ws_code_partition.mlir
    │   │   ├── ws_data_partition.mlir
    │   │   ├── ws_task_id_propagation.mlir
    │   │   └── ws_task_partition.mlir
    ├── LLVMIR
    │   └── break-phi-struct.ll
    ├── NVWS
    │   ├── invalid.mlir
    │   ├── lower_aref.mlir
    │   ├── lower_warp_group.mlir
    │   └── ops.mlir
    ├── Proton
    │   └── ops.mlir
    ├── Tools
    │   └── tensor_layout_print.mlir
    ├── Triton
    │   ├── canonicalize.mlir
    │   ├── combine.mlir
    │   ├── invalid.mlir
    │   ├── loop-invariant-code-motion.mlir
    │   ├── loop-peeling.mlir
    │   ├── loop-unroll.mlir
    │   ├── loop_cse.mlir
    │   ├── ops.mlir
    │   ├── reorder-broadcast.mlir
    │   ├── reproducer.mlir
    │   ├── rewrite-tensor-descriptor-to-pointer.mlir
    │   ├── rewrite-tensor-pointer.mlir
    │   ├── vecadd.mlir
    │   └── verify-make-range.mlir
    ├── TritonGPU
    │   ├── accelerate-matmul.mlir
    │   ├── accumulator-init.mlir
    │   ├── amd
    │   │   ├── accelerate-amd-matmul-chain-dot.mlir
    │   │   ├── accelerate-amd-matmul-fma.mlir
    │   │   ├── accelerate-amd-matmul-mfma-gfx950.mlir
    │   │   ├── accelerate-amd-matmul-mfma.mlir
    │   │   ├── accelerate-amd-matmul-wmma-gen1.mlir
    │   │   ├── accelerate-amd-matmul-wmma-gen2.mlir
    │   │   ├── amd-block-pingpong.mlir
    │   │   ├── amd-canonicalize-extract-slice.mlir
    │   │   ├── amd-canonicalize-pointers-dont-run-mlir-canonicalizer.mlir
    │   │   ├── amd-canonicalize-pointers.mlir
    │   │   ├── amd-coalesce-async-copy.mlir
    │   │   ├── amd-concat-op.mlir
    │   │   ├── amd-conditional-barrier.mlir
    │   │   ├── amd-convert-buffer-ops-range-analysis.mlir
    │   │   ├── amd-convert-buffer-ops.mlir
    │   │   ├── amd-extractslice-op.mlir
    │   │   ├── amd-fold-true-cmpi.mlir
    │   │   ├── amd-hoist-cvtToDotOp.mlir
    │   │   ├── amd-instruction-sched.mlir
    │   │   ├── amd-optimize-epilogue.mlir
    │   │   ├── amd-range-analysis.mlir
    │   │   ├── amd-reorder-instructions.mlir
    │   │   ├── amd-schedule-hint.mlir
    │   │   ├── amd-stream-loop-assume.mlir
    │   │   ├── amd-stream-prefetch.mlir
    │   │   ├── amd-update-async-wait-count.mlir
    │   │   ├── in-thread-transpose.mlir
    │   │   ├── invalid.mlir
    │   │   ├── mfma-double-rate.mlir
    │   │   ├── mfma-xf32.mlir
    │   │   ├── optimize-lds-usage.mlir
    │   │   └── sink-setprio-mfma.mlir
    │   ├── atomic-cas.mlir
    │   ├── automatic-warp-specialization.mlir
    │   ├── canonicalize.mlir
    │   ├── coalesce-async-copy.mlir
    │   ├── coalesce.mlir
    │   ├── combine-select-if.mlir
    │   ├── combine.mlir
    │   ├── consan-negative.mlir
    │   ├── consan.mlir
    │   ├── dot-operands.mlir
    │   ├── fence-inserstion.mlir
    │   ├── fuse-nested-loops.mlir
    │   ├── global_scratch_alloc.mlir
    │   ├── global_scratch_to_llvm.mlir
    │   ├── hoist-tmem-alloc.mlir
    │   ├── inline.mlir
    │   ├── invalid-attributes.mlir
    │   ├── invalid.mlir
    │   ├── load-mma-specialization.mlir
    │   ├── loop-pipeline-async-latencies.mlir
    │   ├── loop-pipeline-blackwell.mlir
    │   ├── loop-pipeline-cuda.mlir
    │   ├── loop-pipeline-expand.mlir
    │   ├── loop-pipeline-hip.mlir
    │   ├── loop-pipeline-hopper-remove-wait.mlir
    │   ├── loop-pipeline-hopper.mlir
    │   ├── loop-pipeline-indirect-load.mlir
    │   ├── loop-pipeline.mlir
    │   ├── loop-schedule.mlir
    │   ├── matmul-loop-pipeline.mlir
    │   ├── matmul.mlir
    │   ├── memdesc-subview-split.mlir
    │   ├── ops.mlir
    │   ├── optimize-locality.mlir
    │   ├── optimize-partition-warps.mlir
    │   ├── optimize_epilogue.mlir
    │   ├── partition-loops.mlir
    │   ├── partition-scheduling.mlir
    │   ├── pipeline-assign-latencies.mlir
    │   ├── pipeline-loop-nest.mlir
    │   ├── pipeline-lower-loop.mlir
    │   ├── pipeline-schedule-loop.mlir
    │   ├── prefetch.mlir
    │   ├── promote-lhs-to-tmem.mlir
    │   ├── prox_fence_insertion.mlir
    │   ├── reduce-data-duplication.mlir
    │   ├── reorder-instructions.mlir
    │   ├── rewrite-partition-dependencies.mlir
    │   ├── samples
    │   │   ├── descriptor-matmul-pipeline.mlir
    │   │   ├── descriptor-matmul-pipeline.mlir.in
    │   │   ├── simulated-grouped-gemm.mlir
    │   │   └── simulated-grouped-gemm.mlir.in
    │   ├── tf32x3-matmul.mlir
    │   └── verify-blocked-layout.mlir
    ├── TritonNvidiaGPU
    │   ├── bf16-atomics.mlir
    │   ├── canonicalize.mlir
    │   ├── inline.mlir
    │   ├── interleave_tmem.mlir
    │   ├── invalid.mlir
    │   ├── membar.mlir
    │   ├── mma_lowering.mlir
    │   ├── ops.mlir
    │   ├── optimize_descriptor_encoding.mlir
    │   ├── test_promotion_to_tensor_memory.mlir
    │   ├── test_tensor_memory_allocation.mlir
    │   ├── tma_lowering.mlir
    │   └── tmem_layouts.mlir
    ├── include
    │   └── Analysis
    │   │   └── TestAxisInfo.h
    ├── lib
    │   ├── Analysis
    │   │   ├── CMakeLists.txt
    │   │   ├── TestAlias.cpp
    │   │   ├── TestAllocation.cpp
    │   │   ├── TestAxisInfo.cpp
    │   │   └── TestMembar.cpp
    │   ├── CMakeLists.txt
    │   ├── Dialect
    │   │   ├── CMakeLists.txt
    │   │   └── TestLoopPeeling.cpp
    │   └── Instrumentation
    │   │   ├── CMakeLists.txt
    │   │   └── GPUHello.cpp
    ├── lit.cfg.py
    └── lit.site.cfg.py.in
├── third_party
    ├── amd
    │   ├── CMakeLists.txt
    │   ├── backend
    │   │   ├── __init__.py
    │   │   ├── compiler.py
    │   │   ├── driver.c
    │   │   ├── driver.py
    │   │   ├── include
    │   │   │   ├── hip
    │   │   │   │   ├── amd_detail
    │   │   │   │   │   ├── amd_channel_descriptor.h
    │   │   │   │   │   ├── amd_device_functions.h
    │   │   │   │   │   ├── amd_hip_atomic.h
    │   │   │   │   │   ├── amd_hip_bf16.h
    │   │   │   │   │   ├── amd_hip_bfloat16.h
    │   │   │   │   │   ├── amd_hip_common.h
    │   │   │   │   │   ├── amd_hip_complex.h
    │   │   │   │   │   ├── amd_hip_cooperative_groups.h
    │   │   │   │   │   ├── amd_hip_fp16.h
    │   │   │   │   │   ├── amd_hip_fp8.h
    │   │   │   │   │   ├── amd_hip_gl_interop.h
    │   │   │   │   │   ├── amd_hip_math_constants.h
    │   │   │   │   │   ├── amd_hip_runtime.h
    │   │   │   │   │   ├── amd_hip_runtime_pt_api.h
    │   │   │   │   │   ├── amd_hip_unsafe_atomics.h
    │   │   │   │   │   ├── amd_hip_vector_types.h
    │   │   │   │   │   ├── amd_math_functions.h
    │   │   │   │   │   ├── amd_surface_functions.h
    │   │   │   │   │   ├── amd_warp_functions.h
    │   │   │   │   │   ├── amd_warp_sync_functions.h
    │   │   │   │   │   ├── concepts.hpp
    │   │   │   │   │   ├── device_library_decls.h
    │   │   │   │   │   ├── functional_grid_launch.hpp
    │   │   │   │   │   ├── grid_launch.h
    │   │   │   │   │   ├── grid_launch.hpp
    │   │   │   │   │   ├── grid_launch_GGL.hpp
    │   │   │   │   │   ├── helpers.hpp
    │   │   │   │   │   ├── hip_api_trace.hpp
    │   │   │   │   │   ├── hip_assert.h
    │   │   │   │   │   ├── hip_cooperative_groups_helper.h
    │   │   │   │   │   ├── hip_fp16_gcc.h
    │   │   │   │   │   ├── hip_fp16_math_fwd.h
    │   │   │   │   │   ├── hip_ldg.h
    │   │   │   │   │   ├── hip_prof_str.h
    │   │   │   │   │   ├── hip_runtime_prof.h
    │   │   │   │   │   ├── host_defines.h
    │   │   │   │   │   ├── hsa_helpers.hpp
    │   │   │   │   │   ├── macro_based_grid_launch.hpp
    │   │   │   │   │   ├── math_fwd.h
    │   │   │   │   │   ├── ockl_image.h
    │   │   │   │   │   ├── program_state.hpp
    │   │   │   │   │   ├── texture_fetch_functions.h
    │   │   │   │   │   └── texture_indirect_functions.h
    │   │   │   │   ├── channel_descriptor.h
    │   │   │   │   ├── device_functions.h
    │   │   │   │   ├── driver_types.h
    │   │   │   │   ├── hip_bf16.h
    │   │   │   │   ├── hip_bfloat16.h
    │   │   │   │   ├── hip_common.h
    │   │   │   │   ├── hip_complex.h
    │   │   │   │   ├── hip_cooperative_groups.h
    │   │   │   │   ├── hip_deprecated.h
    │   │   │   │   ├── hip_ext.h
    │   │   │   │   ├── hip_fp16.h
    │   │   │   │   ├── hip_fp8.h
    │   │   │   │   ├── hip_gl_interop.h
    │   │   │   │   ├── hip_hcc.h
    │   │   │   │   ├── hip_math_constants.h
    │   │   │   │   ├── hip_profile.h
    │   │   │   │   ├── hip_runtime.h
    │   │   │   │   ├── hip_runtime_api.h
    │   │   │   │   ├── hip_texture_types.h
    │   │   │   │   ├── hip_vector_types.h
    │   │   │   │   ├── hip_version.h
    │   │   │   │   ├── hiprtc.h
    │   │   │   │   ├── library_types.h
    │   │   │   │   ├── math_functions.h
    │   │   │   │   ├── surface_types.h
    │   │   │   │   └── texture_types.h
    │   │   │   ├── hsa
    │   │   │   │   ├── Brig.h
    │   │   │   │   ├── amd_hsa_common.h
    │   │   │   │   ├── amd_hsa_elf.h
    │   │   │   │   ├── amd_hsa_kernel_code.h
    │   │   │   │   ├── amd_hsa_queue.h
    │   │   │   │   ├── amd_hsa_signal.h
    │   │   │   │   ├── hsa.h
    │   │   │   │   ├── hsa_amd_tool.h
    │   │   │   │   ├── hsa_api_trace.h
    │   │   │   │   ├── hsa_api_trace_version.h
    │   │   │   │   ├── hsa_ext_amd.h
    │   │   │   │   ├── hsa_ext_finalize.h
    │   │   │   │   ├── hsa_ext_image.h
    │   │   │   │   ├── hsa_ven_amd_aqlprofile.h
    │   │   │   │   ├── hsa_ven_amd_loader.h
    │   │   │   │   └── hsa_ven_amd_pc_sampling.h
    │   │   │   └── roctracer
    │   │   │   │   ├── ext
    │   │   │   │       └── prof_protocol.h
    │   │   │   │   ├── hip_ostream_ops.h
    │   │   │   │   ├── hsa_ostream_ops.h
    │   │   │   │   ├── hsa_prof_str.h
    │   │   │   │   ├── roctracer.h
    │   │   │   │   ├── roctracer_ext.h
    │   │   │   │   ├── roctracer_hcc.h
    │   │   │   │   ├── roctracer_hip.h
    │   │   │   │   ├── roctracer_hsa.h
    │   │   │   │   ├── roctracer_plugin.h
    │   │   │   │   ├── roctracer_roctx.h
    │   │   │   │   └── roctx.h
    │   │   └── lib
    │   │   │   ├── asanrtl.bc
    │   │   │   ├── ockl.bc
    │   │   │   └── ocml.bc
    │   ├── include
    │   │   ├── Analysis
    │   │   │   ├── AMDGPUAllocation.h
    │   │   │   ├── AxisInfoExt.h
    │   │   │   └── RangeAnalysis.h
    │   │   ├── CMakeLists.txt
    │   │   ├── Dialect
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── TritonAMDGPU
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── IR
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       ├── Dialect.h
    │   │   │   │       ├── TritonAMDGPUAttrDefs.td
    │   │   │   │       ├── TritonAMDGPUDialect.td
    │   │   │   │       └── TritonAMDGPUOps.td
    │   │   │   │   └── Utility
    │   │   │   │       └── CommonUtils.h
    │   │   ├── TritonAMDGPUToLLVM
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── GCNAsmFormat.h
    │   │   │   ├── MembarUtility.h
    │   │   │   ├── Passes.h
    │   │   │   ├── Passes.td
    │   │   │   ├── PatternTritonAMDGPUToLLVM.h
    │   │   │   └── TargetUtils.h
    │   │   ├── TritonAMDGPUTransforms
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── MfmaGroup.h
    │   │   │   ├── Passes.h
    │   │   │   ├── Passes.td
    │   │   │   ├── TritonGPUConversion.h
    │   │   │   └── WmmaGroup.h
    │   │   └── Utils
    │   │   │   └── Utility.h
    │   ├── language
    │   │   └── hip
    │   │   │   ├── __init__.py
    │   │   │   ├── libdevice.py
    │   │   │   └── utils.py
    │   ├── lib
    │   │   ├── Analysis
    │   │   │   ├── AMDGPUAllocation.cpp
    │   │   │   ├── AxisInfoExt.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── RangeAnalysis.cpp
    │   │   ├── CMakeLists.txt
    │   │   ├── Dialect
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── TritonAMDGPU
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── IR
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       └── Dialect.cpp
    │   │   │   │   └── Utility
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       └── CommonUtils.cpp
    │   │   ├── TritonAMDGPUDialectToLLVM
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── ConcatOpToLLVM.cpp
    │   │   │   ├── ExtractSliceOpToLLVM.cpp
    │   │   │   ├── InThreadTransposeOpToTTG.cpp
    │   │   │   ├── TritonAMDGPUToLLVMPatterns.cpp
    │   │   │   ├── Utility.cpp
    │   │   │   └── Utility.h
    │   │   ├── TritonAMDGPUToLLVM
    │   │   │   ├── AllocateSharedMemory.cpp
    │   │   │   ├── AsyncUtility.cpp
    │   │   │   ├── AsyncUtility.h
    │   │   │   ├── AtomicRMWOpsEmitter.cpp
    │   │   │   ├── AtomicRMWOpsEmitter.h
    │   │   │   ├── BufferOpsEmitter.cpp
    │   │   │   ├── BufferOpsEmitter.h
    │   │   │   ├── BuiltinFuncToLLVM.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── ConvertLayoutOpToLLVM.cpp
    │   │   │   ├── ConvertLayoutOpToLLVM
    │   │   │   │   ├── SharedToDotOperandHelper.cpp
    │   │   │   │   ├── SharedToDotOperandHelper.h
    │   │   │   │   ├── SharedToDotOperandMFMA.cpp
    │   │   │   │   └── SharedToDotOperandWMMA.cpp
    │   │   │   ├── DotOpToLLVM.cpp
    │   │   │   ├── DotOpToLLVM
    │   │   │   │   ├── FMA.cpp
    │   │   │   │   ├── MFMA.cpp
    │   │   │   │   └── WMMA.cpp
    │   │   │   ├── ElementwiseOpToLLVM.cpp
    │   │   │   ├── GCNAsmFormat.cpp
    │   │   │   ├── LoadStoreOpToLLVM.cpp
    │   │   │   ├── MembarUtility.cpp
    │   │   │   ├── MemoryOpToLLVM.cpp
    │   │   │   ├── OptimizeLDSUsage.cpp
    │   │   │   ├── OptimizeLDSUtility.cpp
    │   │   │   ├── OptimizeLDSUtility.h
    │   │   │   ├── PatternTritonGPUOpToLLVM.h
    │   │   │   ├── SPMDOpToLLVM.cpp
    │   │   │   ├── ScalarizePackedFOps.cpp
    │   │   │   ├── SchedInstructions.cpp
    │   │   │   ├── SchedInstructions.h
    │   │   │   ├── TargetInfo.cpp
    │   │   │   ├── TargetInfo.h
    │   │   │   ├── TargetUtils.cpp
    │   │   │   ├── TritonGPUToLLVM.cpp
    │   │   │   ├── UpcastMXFPToLLVM.cpp
    │   │   │   ├── Utility.cpp
    │   │   │   └── Utility.h
    │   │   └── TritonAMDGPUTransforms
    │   │   │   ├── AccelerateAMDMatmul.cpp
    │   │   │   ├── BlockPingpong.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── CanonicalizePointers.cpp
    │   │   │   ├── CoalesceAsyncCopy.cpp
    │   │   │   ├── ConvertToBufferOps.cpp
    │   │   │   ├── FoldTrueCmpIOp.cpp
    │   │   │   ├── HoistLayoutConversions.cpp
    │   │   │   ├── InThreadTranspose.cpp
    │   │   │   ├── MfmaGroup.cpp
    │   │   │   ├── OptimizeEpilogue.cpp
    │   │   │   ├── ReorderInstructions.cpp
    │   │   │   ├── StreamPipeline.cpp
    │   │   │   ├── UpdateAsyncWaitCount.cpp
    │   │   │   ├── Utility.cpp
    │   │   │   ├── Utility.h
    │   │   │   └── WmmaGroup.cpp
    │   ├── python
    │   │   ├── test
    │   │   │   ├── address_sanitizer_helper.py
    │   │   │   ├── attn_fwd.ttir
    │   │   │   ├── test_address_sanitizer.py
    │   │   │   ├── test_extract_slice_concat_op.py
    │   │   │   └── test_scalarize_packed_fops.py
    │   │   └── triton_amd.cc
    │   ├── test
    │   │   ├── CMakeLists.txt
    │   │   └── lib
    │   │   │   ├── Analysis
    │   │   │       ├── CMakeLists.txt
    │   │   │       ├── TestAMDGPUMembar.cpp
    │   │   │       ├── TestAMDRangeAnalysis.cpp
    │   │   │       └── TestAxisInfo.cpp
    │   │   │   └── CMakeLists.txt
    │   ├── tools
    │   │   └── hip
    │   │   │   ├── compile.cpp
    │   │   │   └── compile.h
    │   └── unittest
    │   │   ├── CMakeLists.txt
    │   │   └── Conversion
    │   │       ├── CMakeLists.txt
    │   │       └── OptimizeLDSTest.cpp
    ├── f2reduce
    │   ├── CMakeLists.txt
    │   ├── LICENCE.txt
    │   ├── README.md
    │   ├── VERSION
    │   ├── f2reduce.cpp
    │   └── f2reduce.h
    ├── nvidia
    │   ├── CMakeLists.txt
    │   ├── backend
    │   │   ├── __init__.py
    │   │   ├── compiler.py
    │   │   ├── driver.c
    │   │   ├── driver.py
    │   │   └── lib
    │   │   │   └── libdevice.10.bc
    │   ├── hopper
    │   │   ├── CMakeLists.txt
    │   │   ├── include
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── Transforms
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── Passes.h
    │   │   │   │   └── Passes.td
    │   │   └── lib
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── Transforms
    │   │   │       ├── CMakeLists.txt
    │   │   │       ├── WarpSpecialization.cpp
    │   │   │       └── WarpSpecialization
    │   │   │           ├── CodePartitionUtility.cpp
    │   │   │           ├── CodePartitionUtility.h
    │   │   │           ├── TaskIdPropagation.cpp
    │   │   │           ├── TaskIdPropagation.h
    │   │   │           ├── Utility.cpp
    │   │   │           ├── Utility.h
    │   │   │           ├── WSBuffer.cpp
    │   │   │           ├── WSCodePartition.cpp
    │   │   │           ├── WSDataPartition.cpp
    │   │   │           ├── WSLowerMem.cpp
    │   │   │           ├── WSLowerToken.cpp
    │   │   │           ├── WSSpecialize.cpp
    │   │   │           ├── WSTaskIdPropagate.cpp
    │   │   │           └── WSTaskPartition.cpp
    │   ├── include
    │   │   ├── CMakeLists.txt
    │   │   ├── Dialect
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── NVGPU
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   └── IR
    │   │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   │   ├── Dialect.h
    │   │   │   │   │   ├── NVGPUAttrDefs.td
    │   │   │   │   │   ├── NVGPUDialect.td
    │   │   │   │   │   └── NVGPUOps.td
    │   │   │   └── NVWS
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── IR
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       ├── Dialect.h
    │   │   │   │       ├── NVWSAttrDefs.td
    │   │   │   │       ├── NVWSDialect.td
    │   │   │   │       ├── NVWSOps.td
    │   │   │   │       └── NVWSTypes.td
    │   │   │   │   └── Transforms
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       ├── Passes.h
    │   │   │   │       └── Passes.td
    │   │   ├── NVGPUToLLVM
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── NVGPUToLLVMPass.h
    │   │   │   ├── Passes.h
    │   │   │   └── Passes.td
    │   │   ├── TritonNVIDIAGPUToLLVM
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── PTXAsmFormat.h
    │   │   │   ├── Passes.h
    │   │   │   ├── Passes.td
    │   │   │   └── Utility.h
    │   │   ├── cublas_instance.h
    │   │   └── cublas_types.h
    │   ├── language
    │   │   └── cuda
    │   │   │   ├── __init__.py
    │   │   │   ├── gdc.py
    │   │   │   ├── libdevice.py
    │   │   │   └── utils.py
    │   ├── lib
    │   │   ├── CMakeLists.txt
    │   │   ├── Dialect
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── NVGPU
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   └── IR
    │   │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   │   └── Dialect.cpp
    │   │   │   └── NVWS
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── IR
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       ├── Dialect.cpp
    │   │   │   │       └── Ops.cpp
    │   │   │   │   └── Transforms
    │   │   │   │       ├── CMakeLists.txt
    │   │   │   │       ├── LowerAref.cpp
    │   │   │   │       └── LowerWarpGroup.cpp
    │   │   ├── NVGPUToLLVM
    │   │   │   ├── CMakeLists.txt
    │   │   │   └── NVGPUToLLVMPass.cpp
    │   │   └── TritonNVIDIAGPUToLLVM
    │   │   │   ├── BarrierOpToLLVM.cpp
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── ClusterOpsToLLVM.cpp
    │   │   │   ├── ConvertLayoutOpToLLVM.cpp
    │   │   │   ├── ConvertWarpSpecializeToLLVM.cpp
    │   │   │   ├── DotOpToLLVM.cpp
    │   │   │   ├── DotOpToLLVM
    │   │   │       ├── MMAHelpers.h
    │   │   │       ├── MMAv2.cpp
    │   │   │       ├── MMAv5.cpp
    │   │   │       └── WGMMA.cpp
    │   │   │   ├── ElementwiseOpToLLVM.cpp
    │   │   │   ├── Fp4ToFpOpToLLVM.cpp
    │   │   │   ├── LoadStoreOpToLLVM.cpp
    │   │   │   ├── MemoryOpToLLVM.cpp
    │   │   │   ├── PTXAsmFormat.cpp
    │   │   │   ├── PatternTritonGPUOpToLLVM.h
    │   │   │   ├── SPMDOpToLLVM.cpp
    │   │   │   ├── TMAToLLVM.cpp
    │   │   │   ├── TargetInfo.cpp
    │   │   │   ├── TargetInfo.h
    │   │   │   ├── TensorMemoryToLLVM.cpp
    │   │   │   ├── TensorPtrOpsToLLVM.cpp
    │   │   │   ├── TritonGPUToLLVM.cpp
    │   │   │   ├── Utility.cpp
    │   │   │   └── Utility.h
    │   ├── tools
    │   │   └── cuda
    │   │   │   ├── compile.c
    │   │   │   └── compile.h
    │   ├── triton_nvidia.cc
    │   └── unittest
    │   │   ├── CMakeLists.txt
    │   │   └── Conversion
    │   │       ├── CMakeLists.txt
    │   │       └── TritonGPUToLLVM
    │   │           ├── CMakeLists.txt
    │   │           └── PTXAsmFormatTest.cpp
    └── proton
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── csrc
    │       ├── CMakeLists.txt
    │       ├── Proton.cpp
    │       ├── include
    │       │   ├── Context
    │       │   │   ├── Context.h
    │       │   │   ├── Python.h
    │       │   │   └── Shadow.h
    │       │   ├── Data
    │       │   │   ├── Data.h
    │       │   │   ├── Metric.h
    │       │   │   ├── TraceData.h
    │       │   │   └── TreeData.h
    │       │   ├── Driver
    │       │   │   ├── Device.h
    │       │   │   ├── Dispatch.h
    │       │   │   └── GPU
    │       │   │   │   ├── CudaApi.h
    │       │   │   │   ├── CuptiApi.h
    │       │   │   │   ├── HipApi.h
    │       │   │   │   ├── HsaApi.h
    │       │   │   │   └── RoctracerApi.h
    │       │   ├── Profiler
    │       │   │   ├── Cupti
    │       │   │   │   ├── CuptiPCSampling.h
    │       │   │   │   └── CuptiProfiler.h
    │       │   │   ├── GPUProfiler.h
    │       │   │   ├── Profiler.h
    │       │   │   └── Roctracer
    │       │   │   │   └── RoctracerProfiler.h
    │       │   ├── Proton.h
    │       │   ├── Session
    │       │   │   └── Session.h
    │       │   └── Utility
    │       │   │   ├── Atomic.h
    │       │   │   ├── Errors.h
    │       │   │   ├── Map.h
    │       │   │   ├── Set.h
    │       │   │   ├── Singleton.h
    │       │   │   ├── String.h
    │       │   │   └── Traits.h
    │       └── lib
    │       │   ├── CMakeLists.txt
    │       │   ├── Context
    │       │       ├── CMakeLists.txt
    │       │       ├── Context.cpp
    │       │       ├── Python.cpp
    │       │       └── Shadow.cpp
    │       │   ├── Data
    │       │       ├── CMakeLists.txt
    │       │       ├── Data.cpp
    │       │       ├── TraceData.cpp
    │       │       └── TreeData.cpp
    │       │   ├── Driver
    │       │       ├── CMakeLists.txt
    │       │       ├── Device.cpp
    │       │       └── GPU
    │       │       │   ├── CudaApi.cpp
    │       │       │   ├── CuptiApi.cpp
    │       │       │   ├── HipApi.cpp
    │       │       │   ├── HsaApi.cpp
    │       │       │   └── RoctracerApi.cpp
    │       │   ├── Profiler
    │       │       ├── CMakeLists.txt
    │       │       ├── Cupti
    │       │       │   ├── CuptiPCSampling.cpp
    │       │       │   └── CuptiProfiler.cpp
    │       │       └── RocTracer
    │       │       │   └── RoctracerProfiler.cpp
    │       │   └── Session
    │       │       ├── CMakeLists.txt
    │       │       └── Session.cpp
    │   ├── dialect
    │       ├── CMakeLists.txt
    │       ├── include
    │       │   ├── CMakeLists.txt
    │       │   ├── Dialect
    │       │   │   ├── CMakeLists.txt
    │       │   │   └── Proton
    │       │   │   │   ├── CMakeLists.txt
    │       │   │   │   └── IR
    │       │   │   │       ├── CMakeLists.txt
    │       │   │   │       ├── Dialect.h
    │       │   │   │       ├── ProtonAttrDefs.td
    │       │   │   │       ├── ProtonDialect.td
    │       │   │   │       └── ProtonOps.td
    │       │   └── TritonProtonToLLVM
    │       │   │   └── PatternTritonProtonOpToLLVM.h
    │       ├── lib
    │       │   ├── CMakeLists.txt
    │       │   ├── Dialect
    │       │   │   ├── CMakeLists.txt
    │       │   │   └── Proton
    │       │   │   │   ├── CMakeLists.txt
    │       │   │   │   └── IR
    │       │   │   │       ├── CMakeLists.txt
    │       │   │   │       ├── Dialect.cpp
    │       │   │   │       └── Ops.cpp
    │       │   └── TritonProtonToLLVM
    │       │   │   ├── CMakeLists.txt
    │       │   │   └── RecordOpToLLVM.cpp
    │       └── triton_proton.cc
    │   ├── proton
    │       ├── __init__.py
    │       ├── context.py
    │       ├── flags.py
    │       ├── hook.py
    │       ├── language.py
    │       ├── profile.py
    │       ├── proton.py
    │       ├── scope.py
    │       ├── specs.py
    │       ├── state.py
    │       └── viewer.py
    │   ├── test
    │       ├── examples
    │       │   ├── cuda.json
    │       │   ├── frame.json
    │       │   ├── hip.json
    │       │   ├── leaf_nodes.json
    │       │   └── triton.json
    │       ├── helper.py
    │       ├── helper_kernels.py
    │       ├── instrument.py
    │       ├── test_api.py
    │       ├── test_cmd.py
    │       ├── test_lib.py
    │       ├── test_profile.py
    │       ├── test_record.py
    │       └── test_viewer.py
    │   └── tutorials
    │       ├── dynamic_net.py
    │       └── matmul.py
├── unittest
    ├── Analysis
    │   ├── CMakeLists.txt
    │   └── UtilityTest.cpp
    ├── CMakeLists.txt
    ├── Dialect
    │   ├── CMakeLists.txt
    │   └── TritonGPU
    │   │   ├── CMakeLists.txt
    │   │   ├── DialectTest.cpp
    │   │   ├── DumpLayoutTest.cpp
    │   │   ├── LinearLayoutConversionsTest.cpp
    │   │   └── SwizzleTest.cpp
    ├── Tools
    │   ├── CMakeLists.txt
    │   ├── LayoutUtilsTest.cpp
    │   └── LinearLayoutTest.cpp
    └── googletest.cmake
└── utils
    ├── generate-test-checks.py
    └── nightly.pypirc


/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: LLVM
2 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # https://editorconfig.org/
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | charset = utf-8
 7 | end_of_line = lf
 8 | indent_style = space
 9 | indent_size = 4
10 | trim_trailing_whitespace = true
11 | insert_final_newline = true
12 | 
13 | [*.py]
14 | indent_size = 4
15 | src_paths=python
16 | 
17 | [*.{yaml,yml}]
18 | indent_size = 2
19 | 
20 | [*.md]
21 | indent_size = 2
22 | x-soft-wrap-text = true
23 | 
24 | [*.rst]
25 | indent_size = 4
26 | x-soft-wrap-text = true
27 | 
28 | [CMakeLists.txt,*.cmake]
29 | indent_size = 2
30 | 
31 | [Makefile]
32 | indent_style = tab
33 | 
34 | [*.{c,cc,cpp,h,hpp,cu,cuh}]
35 | indent_size = 2
36 | 
37 | [*.mlir]
38 | indent_size = 2
39 | 
40 | [*.td]
41 | indent_size = 4
42 | 


--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
 1 | # Commits listed here are ignored by `git blame`.  Add "big and uninteresting
 2 | # changes" here.  Don't forget that it has to be a separate commit (and, because
 3 | # our automation squashes PRs, a separate PR)!
 4 | #
 5 | # Run the following command to teach your `git blame` to pick up this file.
 6 | #
 7 | #  $ git config blame.ignoreRevsFile .git-blame-ignore-revs
 8 | 
 9 | 841a77d1b5961b43e1b64e5265bdfe52c133574d
10 | cb68a0d9d501657258ed9f7ad7610d0784c9be9a
11 | 03184de8b535bb24fb1f49cc1f5e008bcbaa73ef
12 | bc4a8e66da036fafc01b87ee9e210df7ee8fb738
13 | 846d6e7e77891706d179b20f27b1278ac3b9a9ac
14 | 0327b9d32db6d1d63d207ccab722bd45e00a6678
15 | df08301e76a56d9ab3f36ff00ab7133672baa8d3
16 | f88b01f558df06f010a869e01473253a5f5cd8db
17 | 312cf97e147e962562877026fd82c928cf6eaa30
18 | 53d868113a706988394134ca1f7f85cb3016cc81
19 | 539fbe5049570f29e73dc6843f984cd4913c5505
20 | 053af4e9f8f005e1bc3f8ac9bf285eaf0ac9bf72
21 | 5b36cb48ad9ce566dd24ff7183f207a1cb9358b5
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Community help
4 |     url: https://discord.gg/gpumode
5 |     about: GPU-mode discord community has a triton channel which is a great resource for help writing/learning triton
6 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   # Enable version updates for GitHub Actions
 9 |   - package-ecosystem: "github-actions"
10 |     # Look for GitHub Actions workflows in the `root` directory
11 |     directory: "/"
12 |     # Check for updates once a week
13 |     schedule:
14 |       interval: "weekly"
15 | 


--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | name: Pre-Commit Check
 2 | 
 3 | on:
 4 |   workflow_call:
 5 | 
 6 | jobs:
 7 |   pre-commit:
 8 |     name: pre-commit (code formatting)
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - name: Checkout
12 |         uses: actions/checkout@v4
13 |       - uses: actions/setup-python@v5
14 |         with:
15 |           python-version: '3.12'
16 |           cache: 'pip'
17 |       - name: Compute hash of pre-commit config
18 |         id: cache-key
19 |         run: |
20 |           echo "pre_commit_hash=$(sha256sum .pre-commit-config.yaml | cut -d ' ' -f 1)" >> $GITHUB_OUTPUT
21 |         shell: bash
22 |       - name: Cache pre-commit's cache dir
23 |         uses: actions/cache@v4
24 |         with:
25 |           # Note that we cannot use environment variables here given there is
26 |           # no shell to interpret them in the paths.
27 |           path: |
28 |             ~/.cache/pre-commit
29 |           key: ${{ runner.os }}-${{ steps.cache-key.outputs.pre_commit_hash }}
30 |       - name: Check pre-commit
31 |         run: |
32 |           python3 -m pip install --upgrade pre-commit
33 |           python3 -m pre_commit run --all-files --verbose
34 |       - name: Print diff of changes if pre-commit failed
35 |         if: failure()
36 |         run: |
37 |           git diff
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2018-2020 Philippe Tillet
 3 | * Copyright 2020-2022 OpenAI
 4 | *
 5 | * Permission is hereby granted, free of charge, to any person obtaining
 6 | * a copy of this software and associated documentation files
 7 | * (the "Software"), to deal in the Software without restriction,
 8 | * including without limitation the rights to use, copy, modify, merge,
 9 | * publish, distribute, sublicense, and/or sell copies of the Software,
10 | * and to permit persons to whom the Software is furnished to do so,
11 | * subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be
14 | * included in all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | */
24 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | graft bin
 2 | graft cmake
 3 | graft docs
 4 | graft include
 5 | graft lib
 6 | graft python/src
 7 | graft python/test
 8 | graft python/triton
 9 | graft test
10 | graft third_party
11 | graft unittest
12 | include CMakeLists.txt
13 | include Makefile
14 | include python/build_helpers.py
15 | include python/requirements.txt
16 | include python/test-requirements.txt
17 | 


--------------------------------------------------------------------------------
/bin/triton-lsp.cpp:
--------------------------------------------------------------------------------
 1 | #include "./RegisterTritonDialects.h"
 2 | 
 3 | #include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h"
 4 | 
 5 | int main(int argc, char **argv) {
 6 |   mlir::DialectRegistry registry;
 7 |   registerTritonDialects(registry);
 8 | 
 9 |   return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));
10 | }
11 | 


--------------------------------------------------------------------------------
/bin/triton-opt.cpp:
--------------------------------------------------------------------------------
 1 | #include "./RegisterTritonDialects.h"
 2 | 
 3 | #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 4 | 
 5 | int main(int argc, char **argv) {
 6 |   mlir::DialectRegistry registry;
 7 |   registerTritonDialects(registry);
 8 | 
 9 |   return mlir::asMainReturnCode(mlir::MlirOptMain(
10 |       argc, argv, "Triton (GPU) optimizer driver\n", registry));
11 | }
12 | 


--------------------------------------------------------------------------------
/bin/triton-reduce.cpp:
--------------------------------------------------------------------------------
 1 | #include "./RegisterTritonDialects.h"
 2 | 
 3 | #include "mlir/Tools/mlir-reduce/MlirReduceMain.h"
 4 | 
 5 | int main(int argc, char **argv) {
 6 |   mlir::DialectRegistry registry;
 7 |   registerTritonDialects(registry);
 8 | 
 9 |   mlir::MLIRContext context(registry);
10 |   return mlir::failed(mlir::mlirReduceMain(argc, argv, context));
11 | }
12 | 


--------------------------------------------------------------------------------
/cmake/AddTritonUnitTest.cmake:
--------------------------------------------------------------------------------
 1 | include(${PROJECT_SOURCE_DIR}/unittest/googletest.cmake)
 2 | 
 3 | include(GoogleTest)
 4 | enable_testing()
 5 | 
 6 | function(add_triton_ut)
 7 |   set(options)
 8 |   set(oneValueArgs NAME)
 9 |   set(multiValueArgs SRCS LIBS DEFS)
10 |   cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
11 | 
12 |   add_test(NAME ${__NAME}
13 |           COMMAND ${__NAME})
14 |   add_executable(
15 |           ${__NAME}
16 |           ${__SRCS})
17 |   target_link_libraries(
18 |           ${__NAME}
19 |           PRIVATE
20 |           GTest::gtest_main
21 |           gmock
22 |           ${__LIBS})
23 | 
24 |   if(NOT MSVC)
25 |     target_compile_options(${__NAME} PRIVATE -fno-rtti)
26 |   endif()
27 | 
28 |   target_compile_definitions(${__NAME} PRIVATE ${__DEFS})
29 | 
30 |   # Without the DISCOVERY_TIMEOUT, the tests randomly time out on my mac
31 |   # laptop.  I think the issue may be that the very first time you run a program
32 |   # it's a bit slow.
33 |   gtest_discover_tests(${__NAME} DISCOVERY_TIMEOUT 60)
34 | 
35 |   # Add the unit test to the top-level unit test target.
36 |   add_dependencies(TritonUnitTests ${__NAME})
37 | endfunction()
38 | 


--------------------------------------------------------------------------------
/cmake/json-version.txt:
--------------------------------------------------------------------------------
1 | v3.11.3
2 | 


--------------------------------------------------------------------------------
/cmake/llvm-hash.txt:
--------------------------------------------------------------------------------
1 | 570885128351868c1308bb22e8ca351d318bc4a1
2 | 


--------------------------------------------------------------------------------
/cmake/nvidia-toolchain-version.json:
--------------------------------------------------------------------------------
1 | {
2 |   "ptxas": "12.8.93",
3 |   "cuobjdump": "12.8.55",
4 |   "nvdisasm": "12.8.55",
5 |   "cudacrt": "12.8.61",
6 |   "cudart": "12.8.57",
7 |   "cupti": "12.8.90"
8 | }
9 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = Triton
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_templates/versions.html:
--------------------------------------------------------------------------------
 1 | {%- if current_version %}
 2 | <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
 3 |     <span class="rst-current-version" data-toggle="rst-current-version">
 4 |         <span class="fa fa-book"> Other Versions</span>
 5 |         v: {{ current_version.name }}
 6 |         <span class="fa fa-caret-down"></span>
 7 |     </span>
 8 |     <div class="rst-other-versions">
 9 |         {%- if versions.tags %}
10 |         <dl>
11 |             <dt>Tags</dt>
12 |             {%- for item in versions.tags %}
13 |             <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
14 |             {%- endfor %}
15 |         </dl>
16 |         {%- endif %}
17 |         {%- if versions.branches %}
18 |         <dl>
19 |             <dt>Branches</dt>
20 |             {%- for item in versions.branches %}
21 |             <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
22 |             {%- endfor %}
23 |         </dl>
24 |         {%- endif %}
25 |     </div>
26 | </div>
27 | {%- endif %}
28 | 


--------------------------------------------------------------------------------
/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png


--------------------------------------------------------------------------------
/docs/getting-started/tutorials/parallel_reduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/getting-started/tutorials/parallel_reduction.png


--------------------------------------------------------------------------------
/docs/getting-started/tutorials/random_bits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/getting-started/tutorials/random_bits.png


--------------------------------------------------------------------------------
/docs/meetups/02-20-2024/Proton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/meetups/02-20-2024/Proton.pdf


--------------------------------------------------------------------------------
/docs/meetups/02-20-2024/notes.md:
--------------------------------------------------------------------------------
 1 | #### Agenda:
 2 | 
 3 | ##### Items:
 4 | 1. Intel update
 5 | 2. AMD update
 6 | 3. Profiler update
 7 | 4. We are in the process of transitioning to a pro slack plan, so everybody will be able to see history. Expect this to take a few more weeks.
 8 | 5. We are still working on finalizing a document about our technical governance structure. Expect this to take a few more weeks too.
 9 | 6. Open discussion.
10 | ##### Minutes:
11 | Recording link [here](https://youtu.be/JDQCdj18Snc)
12 | 
13 | 1. Intel GPU integration with Triton and PyTorch:
14 |    - No strong requirement from PyTorch for specific backends to be part of Triton official release.
15 |    - Can use a separate branch/fork for CI/CD and testing.
16 |    - Intel team will work with PyTorch offline to close.
17 | 2. AMD GPU backend update:
18 |    - AMD team shared the refactored design for AMD backend.
19 |    - The new design is modularized and reduces clutter and duplication in upstream Triton.
20 |    - Further work needed for regression testing and secure runners.
21 | 3. Proton profiler update:
22 |    - Keren from the OpenAI team presented a new profiler tool for Triton kernels, which supports multiple vendors, metrics, and formats.
23 |    - Outlined the plan for open-sourcing, integrating, and extending the tool.
24 | 


--------------------------------------------------------------------------------
/docs/meetups/08-06-2024/notes.md:
--------------------------------------------------------------------------------
 1 | #### Agenda:
 2 | 1. Triton-CPU Update
 3 | 2. Intel GPU backend update
 4 | 
 5 | ##### Items:
 6 | Meeting notes:
 7 | 1. Triton-CPU Update: Intel and Meta jointly presented the work on Triton-CPU, highlighting good progress on coverage and performance improvements. They also covered some of the optimizations they leveraged to get performance comparable to torch-native and torch-inductor. More details are in their slides.
 8 | 2. Intel GPU Backend: Intel GPU backend shows good performance close to expert-tuned kernels and the use of block pointers for performance gains. There were questions around the future of block pointers and their importance for performance gains. With block-pointer deprecation there is a need for a more generic interface to support various backends including Intel GPU.
 9 | 3. The 2024 Triton conference is on September 17th 2024 in Fremont California! Please register [here](README.md).
10 | ##### Minutes:
11 | Recording link [here](https://youtu.be/dfL3L4_3ujg)
12 | 
13 | Presentations repo [here](https://drive.google.com/drive/folders/1fQ3zVrM7DT8W8FGJWKx1wNr2X53tYbeT?usp=sharing)
14 | 


--------------------------------------------------------------------------------
/docs/meetups/08-22-2023/amd-update.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/meetups/08-22-2023/amd-update.pdf


--------------------------------------------------------------------------------
/docs/meetups/08-22-2023/intel-xpu-update.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/meetups/08-22-2023/intel-xpu-update.pptx


--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/intel-xpu-update.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/meetups/10-25-2023/intel-xpu-update.pdf


--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/notes.md:
--------------------------------------------------------------------------------
 1 | #### Agenda:
 2 | 
 3 | ##### Items:
 4 | 1. H100 updates
 5 | 2. Triton-Shared layer updates
 6 | 3. Intel update
 7 | 4. Open discussion
 8 | 
 9 | ##### Minutes:
10 | Recording link [here](https://youtu.be/KZAzpKx1ebI)
11 | 
12 | 1. H100 updates
13 |    - Enabled WGMMA by default, now any matmul can reuse it.
14 |    - fp8 formats enabled – 1.3 Petaflops on dense matmul on H100 (gemm performance)
15 |    - Enabled Flash Attention using wgmma, resulting in 450 teraflop on fwd pass and 250 on backward pass – still working on perf for flash attention
16 |    - fp8 numbers with flash attention running in fp8 with matmul is tricky, because the fp8 layout is significantly different than what is returned by wgmma, still wip
17 | 
18 | 2. Triton-Shared layer
19 |    - Please refer to slides for more details
20 |    - Created a repo where you can find the middle layer
21 |    - Available as a plugin into triton
22 | 
23 | 3. Intel Update
24 |    - Please refer to slides for more details
25 | 


--------------------------------------------------------------------------------
/docs/meetups/10-25-2023/triton-shared.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/meetups/10-25-2023/triton-shared.pptx


--------------------------------------------------------------------------------
/docs/meetups/12-13-2023/notes.md:
--------------------------------------------------------------------------------
 1 | #### Agenda:
 2 | 
 3 | ##### Items:
 4 | 1. Refactoring plan for 3rd party backends
 5 | 2. Front end refactoring (AMD)
 6 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton shared for GPU development?
 7 | 
 8 | ##### Minutes:
 9 | Recording link [here](https://youtu.be/Lo43DQYkOWM)
10 | 
11 | 1. Refactoring plan for 3rd party backends
12 |    - Refactoring to be completed by end of the year so that all GPU backends can be individual passes on Triton GPU IR instead of being completely out of tree. The goal is for users to get other GPUs besides Cuda when they install Triton. Non-GPU Triton IR expected to stay as is.
13 | 2. Front end refactoring (AMD)
14 |    - Will work with Phil for AMD related refactoring. Will share more details in next meetup about where AMD has diverged from Triton GPU IR and in the codeflow.
15 | 3. Things like block pointers, ptr_analysis, and mask_analysis can be used for GPUs; is there a plan to incrementally include components from Triton shared for GPU development?
16 |    - Can look at it on a case by case basis.
17 | 


--------------------------------------------------------------------------------
/docs/meetups/dev_conference_2024.md:
--------------------------------------------------------------------------------
1 | The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link)
2 | 
3 | The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz).
4 | 


--------------------------------------------------------------------------------
/docs/programming-guide/chapter-1/cuda-parallel-matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/programming-guide/chapter-1/cuda-parallel-matmul.png


--------------------------------------------------------------------------------
/docs/programming-guide/chapter-1/triton-parallel-matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/programming-guide/chapter-1/triton-parallel-matmul.png


--------------------------------------------------------------------------------
/docs/programming-guide/chapter-2/halide-iteration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/programming-guide/chapter-2/halide-iteration.png


--------------------------------------------------------------------------------
/docs/programming-guide/chapter-2/polyhedral-iteration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/docs/programming-guide/chapter-2/polyhedral-iteration.png


--------------------------------------------------------------------------------
/docs/python-api/triton.language.extra.cuda.rst:
--------------------------------------------------------------------------------
 1 | triton.language.extra.cuda
 2 | ==========================
 3 | 
 4 | .. currentmodule:: triton.language.extra.cuda
 5 | 
 6 | Programmatic Dependent Launch
 7 | -----------------------------
 8 | 
 9 | .. autosummary::
10 |     :toctree: generated
11 |     :nosignatures:
12 | 
13 |     gdc_wait
14 |     gdc_launch_dependents
15 | 


--------------------------------------------------------------------------------
/docs/python-api/triton.rst:
--------------------------------------------------------------------------------
 1 | triton
 2 | ======
 3 | 
 4 | .. currentmodule:: triton
 5 | 
 6 | .. autosummary::
 7 |     :toctree: generated
 8 |     :nosignatures:
 9 | 
10 |     jit
11 |     autotune
12 |     heuristics
13 |     Config
14 | 


--------------------------------------------------------------------------------
/docs/python-api/triton.testing.rst:
--------------------------------------------------------------------------------
 1 | triton.testing
 2 | ==============
 3 | 
 4 | .. currentmodule:: triton.testing
 5 | 
 6 | .. autosummary::
 7 |     :toctree: generated
 8 |     :nosignatures:
 9 | 
10 |     Benchmark
11 |     do_bench
12 |     do_bench_cudagraph
13 |     perf_report
14 |     assert_close
15 | 


--------------------------------------------------------------------------------
/include/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(triton)
2 | 


--------------------------------------------------------------------------------
/include/triton/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Conversion)
2 | add_subdirectory(Dialect)
3 | add_subdirectory(Target)
4 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonGPUToLLVM)
2 | add_subdirectory(TritonToTritonGPU)
3 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_
 2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_
 3 | 
 4 | #include "mlir/IR/BuiltinOps.h"
 5 | #include "triton/Analysis/Allocation.h"
 6 | 
 7 | namespace mlir::triton::gpu {
 8 | 
 9 | /// Attach shared memory related attributes to module and operations inside it.
10 | /// This includes total shared memory consumption in module and shared memory
11 | /// offsets of buffers associated with operations.
12 | void attachAllocationSizeAndOffsetAttr(ModuleOp mod,
13 |                                        ModuleAllocation &allocation);
14 | 
15 | } // namespace mlir::triton::gpu
16 | 
17 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_
18 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/AsmFormat.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
 2 | #define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
 3 | 
 4 | #include "mlir/IR/Value.h"
 5 | #include "triton/Dialect/Triton/IR/Dialect.h"
 6 | #include "llvm/ADT/SmallVector.h"
 7 | #include "llvm/ADT/StringExtras.h"
 8 | #include "llvm/ADT/StringRef.h"
 9 | #include <memory>
10 | #include <string>
11 | 
12 | namespace mlir {
13 | class ConversionPatternRewriter;
14 | class Location;
15 | 
16 | namespace triton {
17 | using llvm::StringRef;
18 | 
19 | inline std::string strJoin(llvm::ArrayRef<std::string> strs,
20 |                            llvm::StringRef delimiter) {
21 |   return llvm::join(strs.begin(), strs.end(), delimiter);
22 | }
23 | 
24 | } // namespace triton
25 | } // namespace mlir
26 | 
27 | #endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ASM_FORMAT_H_
28 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonGPUToLLVM)
3 | add_public_tablegen_target(TritonGPUConversionPassIncGen)
4 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonGPUToLLVM/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
 2 | #define TRITONGPU_CONVERSION_TRITONGPUTOLLVM_PASSES_H
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | 
 6 | #include <memory>
 7 | 
 8 | namespace mlir {
 9 | 
10 | class ModuleOp;
11 | template <typename T> class OperationPass;
12 | 
13 | namespace triton::gpu {
14 | 
15 | #define GEN_PASS_DECL
16 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
17 | 
18 | #define GEN_PASS_REGISTRATION
19 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
20 | 
21 | } // namespace triton::gpu
22 | 
23 | } // namespace mlir
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonToTritonGPU)
3 | add_public_tablegen_target(TritonConversionPassIncGen)
4 | 


--------------------------------------------------------------------------------
/include/triton/Conversion/TritonToTritonGPU/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_CONVERSION_PASSES_H
 2 | #define TRITON_CONVERSION_PASSES_H
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | 
 6 | namespace mlir::triton {
 7 | 
 8 | #define GEN_PASS_DECL
 9 | #include "triton/Conversion/TritonToTritonGPU/Passes.h.inc"
10 | #define GEN_PASS_REGISTRATION
11 | #include "triton/Conversion/TritonToTritonGPU/Passes.h.inc"
12 | 
13 | } // namespace mlir::triton
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Triton)
2 | add_subdirectory(TritonGPU)
3 | add_subdirectory(TritonNvidiaGPU)
4 | add_subdirectory(TritonInstrument)
5 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 2 | 
 3 | set(LLVM_TARGET_DEFINITIONS TritonOps.td)
 4 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 5 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 6 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
 7 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
 8 | add_mlir_doc(TritonOps TritonOps dialects/ -gen-op-doc)
 9 | 
10 | set(LLVM_TARGET_DEFINITIONS TritonDialect.td)
11 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls)
12 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs)
13 | add_mlir_doc(TritonDialect TritonDialect dialects/ -gen-dialect-doc)
14 | 
15 | set(LLVM_TARGET_DEFINITIONS TritonTypes.td)
16 | mlir_tablegen(Types.h.inc -gen-typedef-decls)
17 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs)
18 | 
19 | set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td)
20 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls)
21 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs)
22 | 
23 | set(LLVM_TARGET_DEFINITIONS TritonOpInterfaces.td)
24 | mlir_tablegen(OpInterfaces.h.inc -gen-op-interface-decls)
25 | mlir_tablegen(OpInterfaces.cpp.inc -gen-op-interface-defs)
26 | 
27 | add_public_tablegen_target(TritonTableGen)
28 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/DiscardableAttributes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITON_IR_DISCARDABLE_ATTRIBUTES_H_
 2 | #define TRITON_DIALECT_TRITON_IR_DISCARDABLE_ATTRIBUTES_H_
 3 | 
 4 | #include "mlir/Support/LLVM.h"
 5 | #include "triton/Dialect/Triton/IR/Dialect.h"
 6 | 
 7 | namespace mlir::triton {
 8 | 
 9 | // Filter out attributes from the given operation that are not present in
10 | // the allowList.
11 | [[nodiscard]] SmallVector<NamedAttribute>
12 | filterDiscardableAttrs(Operation *op, ArrayRef<StringRef> allowList);
13 | 
14 | } // namespace mlir::triton
15 | #endif // TRITON_DIALECT_TRITON_IR_DISCARDABLE_ATTRIBUTES_H_
16 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/OpInterfaces.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_IR_OP_INTERFACES_H_
 2 | #define TRITON_IR_OP_INTERFACES_H_
 3 | 
 4 | #include "mlir/IR/OpDefinition.h"
 5 | #include "triton/Dialect/Triton/IR/Types.h"
 6 | 
 7 | namespace mlir {
 8 | 
 9 | namespace triton {
10 | 
11 | namespace impl {
12 | 
13 | LogicalResult verifyTransposeOpInterface(Operation *op);
14 | 
15 | LogicalResult verifyDotOpInterface(Operation *op);
16 | 
17 | } // namespace impl
18 | 
19 | } // namespace triton
20 | } // namespace mlir
21 | 
22 | #include "triton/Dialect/Triton/IR/OpInterfaces.h.inc"
23 | 
24 | #endif // TRITON_IR_OP_INTERFACES_H_
25 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/IR/Types.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_IR_TYPES_H_
 2 | #define TRITON_IR_TYPES_H_
 3 | 
 4 | #include "mlir/IR/BuiltinTypes.h"
 5 | #include "mlir/IR/TypeSupport.h"
 6 | #include "mlir/IR/Types.h"
 7 | 
 8 | #define GET_TYPEDEF_CLASSES
 9 | #include "triton/Dialect/Triton/IR/Types.h.inc"
10 | 
11 | namespace mlir {
12 | 
13 | namespace triton {
14 | 
15 | bool isTensorPointerType(Type type);
16 | 
17 | bool isTensorOrTensorPointerType(Type type);
18 | 
19 | unsigned getPointeeBitWidth(Type type);
20 | 
21 | Type getPointeeType(Type type);
22 | 
23 | Type getPointerType(Type type, int addressSpace = 1);
24 | 
25 | int getAddressSpace(Type type);
26 | 
27 | Type getElementTypeOfTensorPointerType(Type type);
28 | 
29 | Type getI1SameShape(Type type);
30 | 
31 | Type getI32SameShape(Type type);
32 | 
33 | Type getPointerTypeSameShape(Type type);
34 | 
35 | Type getPointerTypeToElement(Type type);
36 | 
37 | } // namespace triton
38 | 
39 | } // namespace mlir
40 | 
41 | #endif // TRITON_IR_TYPES_H_
42 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/ArithTypeConversion.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_ARITH_TYPE_CONVERSION_H_
 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_ARITH_TYPE_CONVERSION_H_
 3 | #include "mlir/Transforms/DialectConversion.h"
 4 | 
 5 | namespace mlir::triton {
 6 | 
 7 | /**
 8 |  * @brief Provides helper patterns for converting arith operations using a type
 9 |  * converter.
10 |  *
11 |  * Note: as of the time of writing, this isn't provided in upstream MLIR.
12 |  */
13 | void populateArithTypeConversions(const TypeConverter &converter,
14 |                                   RewritePatternSet &patterns);
15 | 
16 | } // namespace mlir::triton
17 | 
18 | #endif // TRITON_DIALECT_TRITON_TRANSFORMS_ARITH_TYPE_CONVERSION_H_
19 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton)
3 | add_public_tablegen_target(TritonTransformsIncGen)
4 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/FunctionTypeConversion.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_FUNCTION_TYPE_CONVERSION_H_
 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_FUNCTION_TYPE_CONVERSION_H_
 3 | #include "mlir/Transforms/DialectConversion.h"
 4 | 
 5 | namespace mlir::triton {
 6 | 
 7 | /**
 8 |  * @brief Provides helper patterns for converting triton function operations
 9 |  * using a type converter.
10 |  *
11 |  * Note we cannot use upstream passes for this because they are unaware of
12 |  * tt.call and tt.return.
13 |  */
14 | void populateFunctionTypeConversions(const TypeConverter &converter,
15 |                                      RewritePatternSet &patterns);
16 | 
17 | } // namespace mlir::triton
18 | 
19 | #endif // TRITON_DIALECT_TRITON_TRANSFORMS_FUNCTION_TYPE_CONVERSION_H_
20 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/LoopPeeling.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_LOOP_PEELING_H_
 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_LOOP_PEELING_H_
 3 | 
 4 | #include "mlir/Dialect/SCF/IR/SCF.h"
 5 | 
 6 | namespace mlir {
 7 | namespace triton {
 8 | 
 9 | // Peel the single last iteration of the loop.
10 | void peelLoopEpilogue(
11 |     scf::ForOp forOp,
12 |     function_ref<Operation *(RewriterBase &, Operation *, bool)>
13 |         processPeeledOp = nullptr);
14 | 
15 | } // namespace triton
16 | } // namespace mlir
17 | 
18 | #endif // TRITON_DIALECT_TRITON_TRANSFORMS_LOOP_PEELING_H_
19 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/Triton/Transforms/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_
 2 | #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | 
 6 | namespace mlir {
 7 | namespace triton {
 8 | 
 9 | // Generate the pass class declarations.
10 | #define GEN_PASS_DECL
11 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc"
12 | 
13 | #define GEN_PASS_REGISTRATION
14 | #include "triton/Dialect/Triton/Transforms/Passes.h.inc"
15 | 
16 | } // namespace triton
17 | } // namespace mlir
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/Attributes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_
 2 | #define TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_
 3 | 
 4 | #include "mlir/IR/Attributes.h"
 5 | #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 6 | 
 7 | #define GET_ATTRDEF_CLASSES
 8 | #include "triton/Dialect/TritonGPU/IR/AttrDefs.h.inc"
 9 | 
10 | #endif // TRITON_DIALECT_TRITONGPU_IR_ATTRIBUTES_H_
11 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 2 | 
 3 | set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttg)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttg)
 6 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=ttg)
 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=ttg)
10 | add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
11 | add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
12 | add_public_tablegen_target(TritonGPUTableGen)
13 | 
14 | set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td)
15 | mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls)
16 | mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs)
17 | mlir_tablegen(AttrDefs.h.inc -gen-attrdef-decls)
18 | mlir_tablegen(AttrDefs.cpp.inc -gen-attrdef-defs)
19 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
20 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
21 | add_public_tablegen_target(TritonGPUAttrDefsIncGen)
22 | 
23 | set(LLVM_TARGET_DEFINITIONS TritonGPUTypeInterfaces.td)
24 | mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls)
25 | mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs)
26 | add_public_tablegen_target(TritonGPUTypeInterfacesIncGen)
27 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/LayoutUtility.h:
--------------------------------------------------------------------------------
 1 | #include <llvm/Support/LogicalResult.h>
 2 | #include <triton/Dialect/TritonGPU/IR/Dialect.h>
 3 | 
 4 | namespace mlir::triton::gpu {
 5 | 
 6 | CTALayoutAttr permuteCTALayout(MLIRContext *ctx, CTALayoutAttr layout,
 7 |                                ArrayRef<int> order);
 8 | 
 9 | } // namespace mlir::triton::gpu
10 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/Traits.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_IR_TRAITS_H_
 2 | #define TRITONGPU_IR_TRAITS_H_
 3 | 
 4 | #include "mlir/IR/BuiltinTypes.h"
 5 | #include "mlir/IR/OpDefinition.h"
 6 | #include "mlir/Interfaces/InferTypeOpInterface.h"
 7 | #include "mlir/Support/LogicalResult.h"
 8 | #include "triton/Dialect/Triton/IR/Types.h"
 9 | 
10 | namespace mlir {
11 | namespace OpTrait {
12 | 
13 | template <typename ConcreteType>
14 | class MemDescViewTrait
15 |     : public mlir::OpTrait::TraitBase<ConcreteType, MemDescViewTrait> {
16 |   // Optional: Add methods or verification logic here
17 | };
18 | 
19 | } // namespace OpTrait
20 | } // namespace mlir
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_DIALECT
 2 | #define TRITONGPU_DIALECT
 3 | 
 4 | include "mlir/IR/OpBase.td"
 5 | 
 6 | def TritonGPU_Dialect : Dialect {
 7 |   let name = "ttg";
 8 | 
 9 |   let cppNamespace = "::mlir::triton::gpu";
10 | 
11 |   let hasOperationAttrVerify = 1;
12 | 
13 |   let description = [{
14 |     Triton GPU Dialect.
15 |   }];
16 | 
17 |   let dependentDialects = [
18 |     "triton::TritonDialect",
19 |     "mlir::gpu::GPUDialect",
20 |   ];
21 | 
22 |   let extraClassDeclaration = [{
23 |     void registerTypes();
24 | 
25 |     LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
26 |     LinearEncodingAttr toLinearEncoding(ArrayRef<int64_t> shape, Attribute layout);
27 | 
28 |     static int getNumCTAs(ModuleOp mod);
29 |     static int getThreadsPerWarp(ModuleOp mod);
30 | 
31 |     private:
32 |       LinearLayoutCache llCache;
33 |       LinearEncodingCache leCache;
34 |   }];
35 | 
36 |   let useDefaultTypePrinterParser = 1;
37 |   let useDefaultAttributePrinterParser = 1;
38 |   let usePropertiesForAttributes = 1;
39 | }
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_GPU_DIALECT_INTERFACES_H
 2 | #define TRITON_GPU_DIALECT_INTERFACES_H
 3 | 
 4 | // clang-format off
 5 | #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 6 | #include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc"
 7 | // clang-format on
 8 | 
 9 | #endif // TRITON_GPU_DIALECT_INTERFACES_H
10 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_GPU_TYPE_INTERFACES
 2 | #define TRITON_GPU_TYPE_INTERFACES
 3 | 
 4 | include "mlir/IR/OpBase.td"
 5 | 
 6 | // Interface dynamically attached to RankedTensorType and MemDescType.
 7 | def TTG_TensorOrMemDesc : TypeInterface<"TensorOrMemDesc"> {
 8 |   let cppNamespace = "::mlir::triton::gpu";
 9 |   let methods = [
10 |     InterfaceMethod<"Returns the encoding of the tensor or memory descriptor",
11 |       "mlir::Attribute", "getEncoding", (ins)>,
12 |     InterfaceMethod<"Returns element type",
13 |       "mlir::Type", "getElementType", (ins)>,
14 |     InterfaceMethod<"Returns the type shape",
15 |       "llvm::ArrayRef<int64_t>", "getShape", (ins)>,
16 |     InterfaceMethod<"Returns the tensor or buffer rank",
17 |       "int64_t", "getRank", (ins)>,
18 |     InterfaceMethod<"Returns the element type bit width",
19 |       "int64_t", "getElementTypeBitWidth", (ins)>,
20 |   ];
21 | }
22 | 
23 | #endif // TRITON_GPU_TYPE_INTERFACES
24 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/IR/Types.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_IR_TYPES_H_
 2 | #define TRITONGPU_IR_TYPES_H_
 3 | 
 4 | #include "mlir/IR/BuiltinTypes.h"
 5 | #include "mlir/IR/TypeSupport.h"
 6 | #include "mlir/IR/Types.h"
 7 | #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 8 | 
 9 | #define GET_TYPEDEF_CLASSES
10 | #include "triton/Dialect/TritonGPU/IR/Types.h.inc"
11 | 
12 | #include "triton/Dialect/TritonGPU/IR/TypeInterfaces.h.inc"
13 | 
14 | #endif // TRITONGPU_IR_TYPES_H_
15 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU)
3 | add_public_tablegen_target(TritonGPUTransformsIncGen)
4 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.h:
--------------------------------------------------------------------------------
1 | #include "mlir/IR/PatternMatch.h"
2 | 
3 | namespace mlir::triton::gpu {
4 | 
5 | void populateDecomposeScaledBlockedPatterns(mlir::RewritePatternSet &patterns,
6 |                                             int benefit);
7 | 
8 | } // namespace mlir::triton::gpu
9 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_
 2 | #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 6 | 
 7 | namespace mlir {
 8 | namespace triton {
 9 | namespace gpu {
10 | 
11 | // Generate the pass class declarations.
12 | #define GEN_PASS_DECL
13 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
14 | 
15 | /// Generate the code for registering passes.
16 | #define GEN_PASS_REGISTRATION
17 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
18 | 
19 | } // namespace gpu
20 | } // namespace triton
21 | } // namespace mlir
22 | #endif
23 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_
 2 | #define TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_
 3 | 
 4 | #include "mlir/Support/LogicalResult.h"
 5 | 
 6 | namespace mlir {
 7 | namespace scf {
 8 | class ForOp;
 9 | } // namespace scf
10 | namespace triton::gpu {
11 | // This is the final step to prepare a loop for warp specialization. This takes
12 | // a loop with a partition schedule and rewrites the loop such that all SSA
13 | // dependencies between partitions are passed through shared memory and
14 | // multibuffers them according to partition stages.
15 | LogicalResult rewritePartitionDependencies(scf::ForOp &loop);
16 | // Given a loop where the partitions' inputs and outputs have been fully
17 | // rewritten to be reference semantic, partition the loop into a
18 | // `ttg.warp_specialize` by duplicating the loop for each partition and
19 | // rematerializing, as necessary, operations in the root partition.
20 | LogicalResult partitionLoop(scf::ForOp loop);
21 | } // namespace triton::gpu
22 | } // namespace mlir
23 | 
24 | #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_WARPSPECIALIZATION_H_
25 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 2 | 
 3 | set(LLVM_TARGET_DEFINITIONS TritonInstrumentDialect.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=tti)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=tti)
 6 | add_mlir_doc(TritonInstrumentDialect TritonInstrumentDialect dialects/ -gen-dialect-doc)
 7 | 
 8 | set(LLVM_TARGET_DEFINITIONS TritonInstrumentOps.td)
 9 | mlir_tablegen(Ops.h.inc -gen-op-decls)
10 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
11 | add_mlir_doc(TritonInstrumentOps TritonInstrumentOps dialects/ -gen-op-doc)
12 | 
13 | add_public_tablegen_target(TritonInstrumentTableGen)
14 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/IR/Dialect.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITONINSTRUMENT_IR_DIALECT_H_
 2 | #define TRITON_DIALECT_TRITONINSTRUMENT_IR_DIALECT_H_
 3 | 
 4 | // TritonInstrument depends on Triton and TritonGPU
 5 | #include "triton/Dialect/Triton/IR/Dialect.h"
 6 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 7 | 
 8 | #define GET_OP_CLASSES
 9 | #include "triton/Dialect/TritonInstrument/IR/Dialect.h.inc"
10 | #include "triton/Dialect/TritonInstrument/IR/Ops.h.inc"
11 | 
12 | #endif // TRITON_DIALECT_TRITONINSTRUMENT_IR_DIALECT_H_
13 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/IR/TritonInstrumentDialect.td:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONINSTRUMENT_DIALECT
 2 | #define TRITONINSTRUMENT_DIALECT
 3 | 
 4 | include "mlir/IR/OpBase.td"
 5 | 
 6 | def TritonInstrument_Dialect : Dialect {
 7 |   let name = "tti";
 8 |   let cppNamespace = "::mlir::triton::instrument";
 9 | }
10 | 
11 | #endif // TRITONINSTRUMENT_DIALECT
12 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # TableGen input for the mlir_tablegen call below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonInstrument)  # emits the pass declarations
3 | add_public_tablegen_target(TritonInstrumentTransformsIncGen)  # target that libraries list under DEPENDS
4 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/Transforms/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_TRITONINSTRUMENT_TRANSFORMS_PASSES_H_
 2 | #define TRITON_DIALECT_TRITONINSTRUMENT_TRANSFORMS_PASSES_H_
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | #include "triton/Dialect/TritonInstrument/IR/Dialect.h"
 6 | 
 7 | namespace mlir {
 8 | namespace triton {
 9 | namespace instrument {
10 | 
11 | // Pull in the GEN_PASS_DECL section of the TableGen-generated Passes.h.inc.
12 | #define GEN_PASS_DECL
13 | #include "triton/Dialect/TritonInstrument/Transforms/Passes.h.inc"
14 | 
15 | // Pull in the GEN_PASS_REGISTRATION section of the same generated file.
16 | #define GEN_PASS_REGISTRATION
17 | #include "triton/Dialect/TritonInstrument/Transforms/Passes.h.inc"
18 | 
19 | } // namespace instrument
20 | } // namespace triton
21 | } // namespace mlir
22 | #endif // TRITON_DIALECT_TRITONINSTRUMENT_TRANSFORMS_PASSES_H_
23 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonInstrument/Transforms/Passes.td:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONINSTRUMENT_PASSES
 2 | #define TRITONINSTRUMENT_PASSES
 3 | 
 4 | include "mlir/Pass/PassBase.td"
 5 | 
 6 | def TritonInstrumentConcurrencySanitizer: Pass<"tritoninstrument-concurrency-sanitizer", "mlir::ModuleOp"> {
 7 |   let summary = "Add runtime verification of asynchronous operations";
 8 | 
 9 |   let description = "Instrument the program with runtime verification of asynchronous operations.";
10 | 
11 |   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
12 |                            "mlir::triton::TritonDialect",
13 |                            "mlir::triton::instrument::TritonInstrumentDialect"];
14 | }
15 | 
16 | #endif // TRITONINSTRUMENT_PASSES
17 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 2 | 
 3 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOps.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttng)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttng)
 6 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 8 | add_mlir_doc(TritonNvidiaGPUDialect TritonNvidiaGPUDialect dialects/ -gen-dialect-doc)
 9 | add_mlir_doc(TritonNvidiaGPUOps TritonNvidiaGPUOps dialects/ -gen-op-doc)
10 | add_public_tablegen_target(TritonNvidiaGPUTableGen)
11 | 
12 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUAttrDefs.td)
13 | mlir_tablegen(TritonNvidiaGPUAttrDefs.h.inc -gen-attrdef-decls)
14 | mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs)
15 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
16 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
17 | add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen)
18 | 
19 | set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOpInterfaces.td)
20 | mlir_tablegen(TritonNvidiaGPUOpInterfaces.h.inc -gen-op-interface-decls)
21 | mlir_tablegen(TritonNvidiaGPUOpInterfaces.cpp.inc -gen-op-interface-defs)
22 | add_public_tablegen_target(TritonNvidiaGPUOpInterfacesIncGen)
23 | 


--------------------------------------------------------------------------------
/include/triton/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonNvidiaGPU)
3 | add_public_tablegen_target(TritonNvidiaGPUTransformsIncGen)
4 | 


--------------------------------------------------------------------------------
/include/triton/Target/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(LLVMIR)
2 | 


--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # TableGen input for the mlir_tablegen call below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name LLVMIR)  # emits the pass declarations
3 | add_public_tablegen_target(LLVMIRIncGen)  # target that libraries list under DEPENDS
4 | 


--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_TARGET_LLVM_IR_PASSES_H
 2 | #define TRITON_TARGET_LLVM_IR_PASSES_H
 3 | 
 4 | #include "mlir/Pass/Pass.h"
 5 | 
 6 | namespace mlir {
 7 | 
 8 | // Generate the pass class declarations.
 9 | #define GEN_PASS_DECL
10 | #include "triton/Target/LLVMIR/Passes.h.inc"
11 | 
12 | // Generate the code for registering conversion passes.
13 | #define GEN_PASS_REGISTRATION
14 | #include "triton/Target/LLVMIR/Passes.h.inc"
15 | 
16 | } // namespace mlir
17 | 
18 | #endif // TRITON_TARGET_LLVM_IR_PASSES_H
19 | 


--------------------------------------------------------------------------------
/include/triton/Target/LLVMIR/Passes.td:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_TARGET_LLVMIR_PASSES
 2 | #define TRITON_TARGET_LLVMIR_PASSES
 3 | 
 4 | include "mlir/Pass/PassBase.td"
 5 | 
 6 | def LLVMDIScope: Pass<"enable-line-info", "mlir::ModuleOp"> {
 7 |   let summary = "Materialize LLVM line info";
 8 |   let description = [{
 9 |     This pass materializes line mapping information for LLVM IR dialect operations.
10 |   }];
11 | }
12 | 
13 | #endif // TRITON_TARGET_LLVMIR_PASSES
14 | 


--------------------------------------------------------------------------------
/include/triton/Tools/GenericSwizzling.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_GENERIC_SWIZZLING_H
 2 | #define TRITON_GENERIC_SWIZZLING_H
 3 | 
 4 | #include "llvm/ADT/ArrayRef.h"
 5 | #include "llvm/ADT/SmallVector.h"
 6 | #include <cstdint>
 7 | #include <utility> // std::pair
 8 | 
 9 | namespace mlir::triton {
10 | // Forward declaration only. optimalSwizzling returns a LinearLayout by value,
11 | // so its callers need the complete type from the defining header.
12 | class LinearLayout;
13 | } // namespace mlir::triton
14 | 
15 | namespace mlir::triton::gpu {
16 | LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
17 |                               int32_t bitwidth);
18 | 
19 | std::pair<int, int> logBankConflicts(const LinearLayout &src,
20 |                                      const LinearLayout &dst,
21 |                                      const LinearLayout &smem,
22 |                                      int32_t bitwidth);
23 | } // namespace mlir::triton::gpu
24 | 
25 | #endif // TRITON_GENERIC_SWIZZLING_H
26 | 


--------------------------------------------------------------------------------
/lib/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAnalysis
 2 |   AxisInfo.cpp
 3 |   Allocation.cpp
 4 |   Membar.cpp
 5 |   Alias.cpp
 6 |   Utility.cpp
 7 | 
 8 |   DEPENDS
 9 |   TritonTableGen
10 |   TritonGPUTableGen
11 |   TritonGPUAttrDefsIncGen
12 |   TritonGPUTypeInterfacesIncGen
13 | 
14 |   LINK_LIBS PUBLIC
15 |   MLIRAnalysis
16 |   MLIRLLVMDialect
17 |   TritonIR
18 |   TritonGPUIR
19 |   TritonNvidiaGPUIR
20 | )
21 | 


--------------------------------------------------------------------------------
/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Analysis)
2 | add_subdirectory(Conversion)
3 | add_subdirectory(Dialect)
4 | add_subdirectory(Target)
5 | add_subdirectory(Tools)
6 | add_subdirectory(Instrumentation)
7 | 


--------------------------------------------------------------------------------
/lib/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonToTritonGPU)
2 | add_subdirectory(TritonGPUToLLVM)
3 | add_subdirectory(TritonInstrumentToLLVM)
4 | 


--------------------------------------------------------------------------------
/lib/Conversion/TritonGPUToLLVM/AllocateSharedMemory.cpp:
--------------------------------------------------------------------------------
 1 | #include "triton/Analysis/Allocation.h"
 2 | #include "triton/Analysis/Utility.h"
 3 | #include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
 4 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h"
 5 | #include "triton/Dialect/Triton/IR/Dialect.h"
 6 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 7 | 
 8 | using namespace mlir;
 9 | using namespace mlir::triton;
10 | 
11 | namespace mlir::triton::gpu {
12 | #define GEN_PASS_DEF_ALLOCATESHAREDMEMORY
13 | #include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
14 | } // namespace mlir::triton::gpu
15 | 
16 | namespace {
17 | struct AllocateSharedMemory
18 |     : public mlir::triton::gpu::impl::AllocateSharedMemoryBase<
19 |           AllocateSharedMemory> {
20 |   void runOnOperation() override {
21 |     ModuleOp mod = getOperation();
22 |     ModuleAllocation allocation(mod);
23 | 
24 |     mlir::triton::gpu::attachAllocationSizeAndOffsetAttr(mod, allocation);
25 |   }
26 | };
27 | } // namespace
28 | 


--------------------------------------------------------------------------------
/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonGPUToLLVM
 2 |     DotOpToLLVM/FMA.cpp
 3 |     DotOpToLLVM/FMADotUtility.cpp
 4 |     AllocateSharedMemory.cpp
 5 |     AllocateSharedMemoryUtility.cpp
 6 |     AllocateWarpGroups.cpp
 7 |     AssertOpToLLVM.cpp
 8 |     ControlFlowOpToLLVM.cpp
 9 |     ConvertLayoutOpToLLVM.cpp
10 |     ElementwiseOpToLLVM.cpp
11 |     FuncOpToLLVM.cpp
12 |     GatherOpToLLVM.cpp
13 |     GlobalScratchMemoryAllocation.cpp
14 |     HistogramOpToLLVM.cpp
15 |     MakeRangeOpToLLVM.cpp
16 |     MemoryOpToLLVM.cpp
17 |     PrintOpToLLVM.cpp
18 |     ReduceOpToLLVM.cpp
19 |     ScanOpToLLVM.cpp
20 |     SPMDOpToLLVM.cpp
21 |     TypeConverter.cpp
22 |     Utility.cpp
23 |     ViewOpToLLVM.cpp
24 | 
25 |     DEPENDS
26 |     TritonGPUConversionPassIncGen
27 | 
28 |     LINK_LIBS PUBLIC
29 |     MLIRIR
30 |     MLIRPass
31 |     MLIRGPUDialect
32 |     MLIRGPUToNVVMTransforms
33 |     MLIRGPUToROCDLTransforms
34 |     MLIRGPUTransforms
35 |     TritonAnalysis
36 |     TritonIR
37 |     TritonGPUIR
38 |     TritonGPUTransforms
39 |     TritonNvidiaGPUTransforms
40 | )
41 | 


--------------------------------------------------------------------------------
/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMA.cpp:
--------------------------------------------------------------------------------
 1 | #include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 3 | 
 4 | using namespace mlir;
 5 | using namespace mlir::triton;
 6 | using namespace ::mlir::triton::gpu;
 7 | 
 8 | namespace {
 9 | // FMAVectorMultiplier that folds a[i] * b[i] into `c` via LLVM::FMulAddOp.
10 | class GenericFMAVectorMultiplier : public FMAVectorMultiplier {
11 |   OpBuilder &builder;
12 |   Location loc;
13 | 
14 | public:
15 |   GenericFMAVectorMultiplier(OpBuilder &builder, Location loc)
16 |       : builder(builder), loc(loc) {}
17 | 
18 |   Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b,
19 |                         Value c) override {
20 |     assert(a.size() == b.size() && "operand vectors must match in length");
21 |     Value accum = c;
22 |     for (auto [aElem, bElem] : llvm::zip(a, b))
23 |       accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum);
24 |     return accum;
25 |   }
26 | };
27 | 
28 | } // namespace
29 | 
30 | // Converts `op` by handing a GenericFMAVectorMultiplier to the shared helper.
31 | LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor,
32 |                             const LLVMTypeConverter *typeConverter,
33 |                             ConversionPatternRewriter &rewriter) {
34 |   auto loc = op.getLoc();
35 |   GenericFMAVectorMultiplier multiplier(rewriter, loc);
36 |   return parametricConvertFMADot(op, adaptor, typeConverter, rewriter,
37 |                                  multiplier);
38 | }
39 | 


--------------------------------------------------------------------------------
/lib/Conversion/TritonInstrumentToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonInstrumentToLLVM
 2 |     InstrumentationToLLVM.cpp
 3 | 
 4 |     # Order this library after TableGen so the generated TritonInstrument
 5 |     # .inc headers exist before its sources are compiled.
 6 |     DEPENDS
 7 |     TritonInstrumentTableGen
 8 | 
 9 |     LINK_LIBS PUBLIC
10 |     MLIRIR
11 |     MLIRPass
12 |     TritonIR
13 |     TritonGPUIR
14 |     TritonInstrumentIR
15 | )
16 | 


--------------------------------------------------------------------------------
/lib/Conversion/TritonToTritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonToTritonGPU
 2 |     RelayoutTritonGPU.cpp
 3 |     TritonGPUConversion.cpp
 4 |     TritonToTritonGPUPass.cpp
 5 | 
 6 |     DEPENDS
 7 |     TritonConversionPassIncGen
 8 | 
 9 |     LINK_LIBS PUBLIC
10 |     MLIRIR
11 |     MLIRPass
12 |     MLIRTransforms
13 |     TritonIR
14 |     ProtonIR
15 |     TritonGPUIR
16 | )
17 | 


--------------------------------------------------------------------------------
/lib/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Triton)
2 | add_subdirectory(TritonGPU)
3 | add_subdirectory(TritonNvidiaGPU)
4 | add_subdirectory(TritonInstrument)
5 | 


--------------------------------------------------------------------------------
/lib/Dialect/Triton/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/lib/Dialect/Triton/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(LLVM_TARGET_DEFINITIONS Canonicalize.td)
 2 | mlir_tablegen(TritonCanonicalize.inc -gen-rewriters)
 3 | add_public_tablegen_target(TritonCanonicalizeIncGen)
 4 | 
 5 | add_triton_library(TritonIR
 6 |   Dialect.cpp
 7 |   DiscardableAttributes.cpp
 8 |   Ops.cpp
 9 |   Traits.cpp
10 |   Types.cpp
11 |   OpInterfaces.cpp
12 |   Utility.cpp
13 | 
14 |   DEPENDS
15 |   TritonTableGen
16 |   TritonCanonicalizeIncGen
17 | 
18 |   LINK_LIBS PUBLIC
19 |   MLIRIR
20 |   MLIRArithDialect
21 |   MLIRMathDialect
22 |   MLIRSCFDialect
23 | )
24 | 


--------------------------------------------------------------------------------
/lib/Dialect/Triton/IR/Canonicalize.td:
--------------------------------------------------------------------------------
 1 | #ifndef TT_PATTERNS
 2 | #define TT_PATTERNS
 3 | 
 4 | include "mlir/IR/PatternBase.td"
 5 | include "triton/Dialect/Triton/IR/TritonOps.td"
 6 | 
 7 | // broadcast(splat(x)) -> splat(x)
 8 | def BroadcastSplatPattern :
 9 |     Pat<(TT_BroadcastOp (TT_SplatOp $x)),
10 |         (TT_SplatOp $x)>;
11 | 
12 | // broadcast(broadcast(x)) -> broadcast(x)
13 | def BroadcastBroadcastPattern :
14 |     Pat<(TT_BroadcastOp (TT_BroadcastOp $x)),
15 |         (TT_BroadcastOp $x)>;
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/lib/Dialect/Triton/IR/DiscardableAttributes.cpp:
--------------------------------------------------------------------------------
 1 | #include "mlir/Support/LLVM.h"
 2 | #include "triton/Dialect/Triton/IR/Dialect.h"
 3 | 
 4 | namespace mlir::triton {
 5 | 
 6 | // Collects the discardable attributes of `op` whose names are in `allowList`.
 7 | // Names with no such attribute on `op` are skipped; order follows `allowList`.
 8 | SmallVector<NamedAttribute>
 9 | filterDiscardableAttrs(Operation *op, ArrayRef<StringRef> allowList) {
10 |   SmallVector<NamedAttribute> kept;
11 |   for (StringRef name : allowList)
12 |     if (Attribute value = op->getDiscardableAttr(name))
13 |       kept.emplace_back(name, value);
14 |   return kept;
15 | }
16 | 
17 | } // namespace mlir::triton
18 | 


--------------------------------------------------------------------------------
/lib/Dialect/Triton/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(LLVM_TARGET_DEFINITIONS Combine.td)
 2 | mlir_tablegen(TritonCombine.inc -gen-rewriters)
 3 | add_public_tablegen_target(TritonCombineIncGen)
 4 | 
 5 | add_triton_library(TritonTransforms
 6 |   Combine.cpp
 7 |   LoopAwareCSE.cpp
 8 |   LoopInvariantCodeMotion.cpp
 9 |   LoopPeeling.cpp
10 |   LoopUnroll.cpp
11 |   ReorderBroadcast.cpp
12 |   RewriteTensorPointer.cpp
13 |   RewriteTensorDescriptorToPointer.cpp
14 |   ArithTypeConversion.cpp
15 |   FunctionTypeConversion.cpp
16 | 
17 |   DEPENDS
18 |   TritonTransformsIncGen
19 |   TritonCombineIncGen
20 | 
21 |   LINK_LIBS PUBLIC
22 |   MLIRPass
23 |   MLIRTransformUtils
24 |   MLIRTransforms
25 |   MLIRSCFToControlFlow
26 |   TritonIR
27 | )
28 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonGPUIR
 2 |   Dialect.cpp
 3 |   LinearLayoutConversions.cpp
 4 |   LayoutUtility.cpp
 5 |   Ops.cpp
 6 |   Types.cpp
 7 | 
 8 |   DEPENDS
 9 |   TritonGPUTableGen
10 |   TritonGPUAttrDefsIncGen
11 |   TritonGPUTypeInterfacesIncGen
12 | 
13 |   LINK_LIBS PUBLIC
14 |   MLIRGPUDialect
15 |   TritonIR
16 |   TritonTools
17 | )
18 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/IR/LayoutUtility.cpp:
--------------------------------------------------------------------------------
 1 | #include <triton/Dialect/TritonGPU/IR/LayoutUtility.h>
 2 | 
 3 | #include <llvm/ADT/SmallVector.h>
 4 | #include <triton/Dialect/Triton/IR/Utility.h>
 5 | 
 6 | namespace mlir::triton::gpu {
 7 | 
 8 | // Builds a CTALayoutAttr from `layout` with its fields permuted by `order`.
 9 | CTALayoutAttr permuteCTALayout(MLIRContext *ctx, CTALayoutAttr layout,
10 |                                ArrayRef<int> order) {
11 |   assert(order.size() == layout.getRank() && "order and layout rank mismatch");
12 | 
13 |   auto invOrder = inversePermutation(order);
14 |   llvm::SmallVector<unsigned> invOrderUnsigned(invOrder.begin(),
15 |                                                invOrder.end());
16 |   return CTALayoutAttr::get(
17 |       ctx, applyPermutation(layout.getCTAsPerCGA(), order),
18 |       applyPermutation(layout.getCTASplitNum(), order),
19 |       applyPermutation(invOrderUnsigned, layout.getCTAOrder()));
20 | }
21 | 
22 | } // namespace mlir::triton::gpu
23 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/Transforms/Pipeliner/TestPipelineLowerLoop.cpp:
--------------------------------------------------------------------------------
 1 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 2 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 3 | #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 4 | #include "triton/Dialect/TritonGPU/Transforms/Schedule.h"
 5 | #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 6 | 
 7 | using namespace mlir;
 8 | namespace tt = mlir::triton;
 9 | namespace ttg = mlir::triton::gpu;
10 | 
11 | namespace mlir {
12 | namespace triton {
13 | namespace gpu {
14 | 
15 | #define GEN_PASS_DEF_TRITONGPUTESTPIPELINELOWERLOOP
16 | #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
17 | 
18 | struct TestPipelineLowerLoop
19 |     : public impl::TritonGPUTestPipelineLowerLoopBase<TestPipelineLowerLoop> {
20 |   using impl::TritonGPUTestPipelineLowerLoopBase<
21 |       TestPipelineLowerLoop>::TritonGPUTestPipelineLowerLoopBase;
22 | 
23 |   void runOnOperation() override {
24 |     ModuleOp m = getOperation();
25 | 
26 |     lowerLoops(m);
27 |   }
28 | };
29 | 
30 | } // namespace gpu
31 | } // namespace triton
32 | } // namespace mlir
33 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp:
--------------------------------------------------------------------------------
 1 | #include "PartitionBuilder.h"
 2 | #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 3 | #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 4 | 
 5 | using namespace mlir;
 6 | using namespace triton;
 7 | using namespace triton::gpu;
 8 | 
 9 | // Builds an arith::ConstantIntOp holding `value` with the given bit width.
10 | Value PartitionBuilder::intCst(int value, unsigned width) {
11 |   return create<arith::ConstantIntOp>(value, width);
12 | }
13 | 
14 | // Boolean constants are integer constants of width 1.
15 | Value PartitionBuilder::boolCst(bool value) {
16 |   return intCst(value, /*width=*/1);
17 | }
18 | 
19 | // Writes the stage/cluster pair onto `op` as i32 attributes; an empty
20 | // `stageCluster` leaves `op` untouched.
21 | void PartitionBuilder::assignStage(Operation *op, StageCluster stageCluster) {
22 |   if (!stageCluster)
23 |     return;
24 |   const auto &[stage, cluster] = *stageCluster;
25 |   op->setAttr(kLoopStageAttrName, getI32IntegerAttr(stage));
26 |   op->setAttr(kLoopClusterAttrName, getI32IntegerAttr(cluster));
27 | }
28 | 
29 | // Records the index of `partition` on `op` as an i32 attribute.
30 | void PartitionBuilder::assignPartition(Operation *op, Partition &partition) {
31 |   const auto index = partition.getIndex();
32 |   op->setAttr(kPartitionAttrName, getI32IntegerAttr(index));
33 | }
34 | 
35 | // Reads the stage and cluster attributes back from `op`; yields std::nullopt
36 | // unless both are present as IntegerAttr.
37 | StageCluster triton::gpu::getStageCluster(Operation *op) {
38 |   auto stage = op->getAttrOfType<IntegerAttr>(kLoopStageAttrName);
39 |   auto cluster = op->getAttrOfType<IntegerAttr>(kLoopClusterAttrName);
40 |   if (stage && cluster)
41 |     return std::make_pair(stage.getInt(), cluster.getInt());
42 |   return std::nullopt;
43 | }
44 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
 2 | #define TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
 3 | 
 4 | #include "mlir/IR/ImplicitLocOpBuilder.h"
 5 | #include <optional>
 6 | #include <utility>
 7 | 
 8 | namespace mlir::triton::gpu {
 9 | 
10 | class Partition;
11 | 
12 | // Optional (stage, cluster) pair; empty when nothing has been assigned.
13 | using StageCluster = std::optional<std::pair<int, int>>;
14 | 
15 | // ImplicitLocOpBuilder with helpers for constants and for tagging the ops it
16 | // creates with partition and stage/cluster information.
17 | struct PartitionBuilder : public ImplicitLocOpBuilder {
18 |   using ImplicitLocOpBuilder::ImplicitLocOpBuilder;
19 | 
20 |   Value intCst(int value, unsigned width = 32);
21 |   Value boolCst(bool value);
22 | 
23 |   void assignStage(Operation *op, StageCluster stageCluster);
24 |   void assignPartition(Operation *op, Partition &partition);
25 | 
26 |   // Creates an OpT from `args`, then tags it with `partition` and
27 |   // `stageCluster` before returning it.
28 |   template <typename OpT, typename... Args>
29 |   auto createInto(Partition &partition, StageCluster stageCluster,
30 |                   Args &&...args) {
31 |     auto op = create<OpT>(std::forward<Args>(args)...);
32 |     assignPartition(op, partition);
33 |     assignStage(op, stageCluster);
34 |     return op;
35 |   }
36 | };
37 | 
38 | // Get the stage and cluster for an operation, if it has one assigned.
39 | StageCluster getStageCluster(Operation *op);
40 | 
41 | } // namespace mlir::triton::gpu
42 | 
43 | #endif // TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
44 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonInstrument/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonInstrument/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonInstrumentIR
 2 |   Dialect.cpp
 3 |   Ops.cpp
 4 | 
 5 |   DEPENDS
 6 |     TritonInstrumentTableGen
 7 | 
 8 |   LINK_LIBS PUBLIC
 9 |     MLIRIR
10 |     TritonIR
11 |     TritonGPUIR
12 | )
13 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonInstrument/IR/Dialect.cpp:
--------------------------------------------------------------------------------
 1 | #include "triton/Dialect/Triton/IR/Dialect.h"
 2 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 3 | #include "triton/Dialect/TritonInstrument/IR/Dialect.h"
 4 | 
 5 | #include <numeric>
 6 | 
 7 | #include "mlir/IR/DialectImplementation.h"
 8 | #include "mlir/IR/OpImplementation.h"
 9 | #include "triton/Dialect/Triton/IR/Interfaces.h"
10 | #include "triton/Dialect/Triton/IR/Utility.h"
11 | #include "triton/Dialect/TritonInstrument/IR/Dialect.cpp.inc"
12 | 
13 | using namespace mlir;
14 | using namespace mlir::triton::gpu;
15 | using namespace mlir::triton::instrument;
16 | 
17 | void TritonInstrumentDialect::initialize() {
18 |   addOperations<
19 | #define GET_OP_LIST
20 | #include "triton/Dialect/TritonInstrument/IR/Ops.cpp.inc"
21 |       >();
22 | }
23 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonInstrument/IR/Ops.cpp:
--------------------------------------------------------------------------------
1 | #include "triton/Dialect/TritonInstrument/IR/Dialect.h"
2 | 
3 | #define GET_OP_CLASSES
4 | #include "triton/Dialect/TritonInstrument/IR/Ops.cpp.inc"
5 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonInstrument/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonInstrumentTransforms
 2 |   ConcurrencySanitizer.cpp
 3 | 
 4 |   # Generated pass declarations (Passes.h.inc) must exist before compiling.
 5 |   DEPENDS
 6 |   TritonInstrumentTransformsIncGen
 7 | 
 8 |   LINK_LIBS PUBLIC
 9 |   MLIRTransforms
10 |   MLIRTransformUtils
11 |   TritonIR
12 |   TritonGPUIR
13 |   TritonNvidiaGPUIR
14 |   TritonToTritonGPU
15 |   TritonInstrumentIR
16 | )
17 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonNvidiaGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonNvidiaGPUIR
 2 |   Dialect.cpp
 3 |   Ops.cpp
 4 | 
 5 |   DEPENDS
 6 |   TritonNvidiaGPUTableGen
 7 |   TritonNvidiaGPUAttrDefsIncGen
 8 |   TritonNvidiaGPUOpInterfacesIncGen
 9 | 
10 |   LINK_LIBS PUBLIC
11 |   TritonIR
12 |   TritonGPUIR
13 | )
14 | 


--------------------------------------------------------------------------------
/lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonNvidiaGPUTransforms
 2 |   FenceInsertion.cpp
 3 |   InterleaveTMem.cpp
 4 |   MMALowering.cpp
 5 |   OptimizeDescriptorEncoding.cpp
 6 |   OptimizeTMemLayouts.cpp
 7 |   PlanCTA.cpp
 8 |   PromoteLHSToTMem.cpp
 9 |   ProxFenceInsertion.cpp
10 |   RemoveTMEMTokens.cpp
11 |   TensorMemoryAllocation.cpp
12 |   TMALowering.cpp
13 |   TMAUtilities.cpp
14 | 
15 |   DEPENDS
16 |   TritonNvidiaGPUTransformsIncGen
17 | 
18 |   LINK_LIBS PUBLIC
19 |   TritonIR
20 |   TritonGPUIR
21 |   TritonGPUTransforms
22 |   TritonNvidiaGPUIR
23 |   MLIRTransformUtils
24 | )
25 | 


--------------------------------------------------------------------------------
/lib/Target/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(LLVMIR)
2 | 


--------------------------------------------------------------------------------
/lib/Target/LLVMIR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonLLVMIR
 2 |         LLVMDIScope.cpp
 3 |         LLVMIRBreakPhiStruct.cpp
 4 | 
 5 |         DEPENDS
 6 |         LLVMIRIncGen
 7 | 
 8 |         LINK_LIBS
 9 |         ${CMAKE_DL_LIBS}
10 |         PUBLIC
11 |         MLIRArithToLLVM
12 |         MLIRBuiltinToLLVMIRTranslation
13 |         MLIRIndexToLLVM
14 |         MLIRIR
15 |         MLIRLLVMDialect
16 |         MLIRLLVMToLLVMIRTranslation
17 |         MLIRNVVMToLLVMIRTranslation
18 |         MLIRROCDLToLLVMIRTranslation
19 |         MLIRSCFToControlFlow
20 |         MLIRSupport
21 |         MLIRTargetLLVMIRExport
22 |         TritonGPUToLLVM
23 |         )
24 | 
25 | set_source_files_properties(
26 |         LLVMIRTranslation.cpp
27 |         PROPERTIES
28 |         COMPILE_FLAGS "-D__BUILD_DIR__=\\\"${CMAKE_BINARY_DIR}\\\"")
29 | 


--------------------------------------------------------------------------------
/lib/Target/LLVMIR/LLVMPasses.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_TARGET_LLVMIR_LLVMPASSES_H
 2 | #define TRITON_TARGET_LLVMIR_LLVMPASSES_H
 3 | 
 4 | #include "llvm/IR/PassManager.h"
 5 | #include "llvm/Pass.h"
 6 | #include "llvm/Support/CodeGen.h"
 7 | 
 8 | namespace llvm {
 9 | 
10 | // Pass to pre-process LLVM IR before optimization and break up phi of struct.
11 | // Breaking up those phis into elementary types allows better optimizations
12 | // downstream.
13 | struct BreakStructPhiNodesPass : PassInfoMixin<BreakStructPhiNodesPass> {
14 |   // Runs the pass on a single function. Only declared here; the definition
15 |   // lives in a separate translation unit.
16 |   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
17 | 
18 |   // Fixed, human-readable name reported for this pass.
19 |   static StringRef name() { return "BreakStructPhiNodesPass"; }
20 | };
21 | 
22 | } // namespace llvm
23 | 
24 | #endif // TRITON_TARGET_LLVMIR_LLVMPASSES_H
25 | 


--------------------------------------------------------------------------------
/lib/Tools/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonTools
 2 |   GenericSwizzling.cpp
 3 |   LayoutUtils.cpp
 4 |   LinearLayout.cpp
 5 | 
 6 |   DEPENDS
 7 | 
 8 |   LINK_LIBS PUBLIC
 9 |   MLIRIR
10 |   MLIRLLVMDialect
11 |   f2reduce
12 | )
13 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=40.8.0", "cmake>=3.20,<4.0", "ninja>=1.11.1", "pybind11>=2.13.1"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [tool.mypy]
 6 | mypy_path = "$MYPY_CONFIG_FILE_DIR/python"
 7 | files = [
 8 |     "python/triton/knobs.py",
 9 |     "python/triton/runtime/build.py",
10 |     "python/triton/runtime/driver.py",
11 |     "python/triton/_utils.py",
12 |     "python/test/unit/test_knobs.py",
13 |     "python/test/unit/runtime/test_build.py",
14 |     "python/test/unit/runtime/test_compilation_listener.py",
15 | ]
16 | exclude = ["/build/"]
17 | follow_imports = "silent"
18 | 
19 | [tool.yapf]
20 | based_on_style = "pep8"
21 | column_limit = 120
22 | disable_split_list_with_comment = true
23 | each_dict_entry_on_separate_line=false
24 | split_before_named_assigns = false
25 | split_complex_comprehension = true
26 | 
27 | # We're incrementally switching from autopep8 to ruff.
28 | [tool.autopep8]
29 | aggressive = 1
30 | ignore = "E501,E701,E731,W690,W503"
31 | max_line_length = 88
32 | 
33 | [tool.ruff]
34 | line-length = 120
35 | 
36 | [tool.ruff.lint]
37 | ignore = ["E501", "E701", "E731", "E741"]
38 | 


--------------------------------------------------------------------------------
/python/build_helpers.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sysconfig
 3 | import sys
 4 | from pathlib import Path
 5 | 
 6 | 
 7 | def get_base_dir():
 8 |     """Return the absolute path of the parent of this file's directory."""
 9 |     return os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
10 | 
11 | 
12 | def _get_cmake_dir():
13 |     """Return the default CMake build directory as a ``Path``.
14 | 
15 |     The result is ``<base dir>/build/cmake.<platform>-<implementation>-<python version>``,
16 |     built from ``sysconfig`` and ``sys.implementation``. Nothing is created on disk.
17 |     """
18 |     plat_name = sysconfig.get_platform()
19 |     python_version = sysconfig.get_python_version()
20 |     dir_name = f"cmake.{plat_name}-{sys.implementation.name}-{python_version}"
21 |     return Path(get_base_dir()) / "build" / dir_name
22 | 
23 | 
24 | def get_cmake_dir():
25 |     """Return the CMake build directory as a ``Path``, creating it if missing.
26 | 
27 |     The ``TRITON_BUILD_DIR`` environment variable is used when it is set;
28 |     otherwise the default from ``_get_cmake_dir()`` applies. Missing parent
29 |     directories are created as well.
30 |     """
31 |     cmake_dir = os.getenv("TRITON_BUILD_DIR", default=_get_cmake_dir())
32 |     cmake_dir = Path(cmake_dir)
33 |     cmake_dir.mkdir(parents=True, exist_ok=True)
34 |     return cmake_dir
35 | 


--------------------------------------------------------------------------------
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools>=40.8.0
2 | wheel
3 | cmake>=3.20,<4.0
4 | ninja>=1.11.1
5 | pybind11>=2.13.1
6 | lit
7 | 


--------------------------------------------------------------------------------
/python/test-requirements.txt:
--------------------------------------------------------------------------------
 1 | autopep8
 2 | isort
 3 | numpy
 4 | pytest
 5 | pytest-forked
 6 | pytest-xdist
 7 | scipy>=1.7.1
 8 | llnl-hatchet
 9 | expecttest
10 | 


--------------------------------------------------------------------------------
/python/test/kernel_comparison/kernels.yml:
--------------------------------------------------------------------------------
 1 | name_and_extension:
 2 |   - name: _kernel_0d1d2d3de4de5de6c7de8de9c10de11c
 3 |     extension: ptx
 4 |   - name: _kernel_0d1d2d3de4de5de6de7c8de9c10de11c
 5 |     extension: ptx
 6 |   - name: _kernel_0d1d2d345de6c789c1011c
 7 |     extension: ptx
 8 |   - name: _kernel_0d1d2d3456c789c1011c
 9 |     extension: ptx
10 |   - name: _kernel_0d1d2d3de4de5de6c7de8c9de10de11c
11 |     extension: ptx
12 |   - name: _kernel_0d1d2d34567c8c91011c
13 |     extension: ptx
14 |   - name: _kernel_0d1d2d3456c78c91011c
15 |     extension: ptx
16 |   - name: _kernel_0d1d2d3de4de5de6de7c8c9de10de11c
17 |     extension: ptx
18 |   - name: _kernel_0d1d2d34567c89c1011c
19 |     extension: ptx
20 |   - name: _kernel_0d1d2d345de6de7c89c1011c
21 |     extension: ptx
22 |   - name: _kernel_0d1d2d345de6de7c8c9de1011c
23 |     extension: ptx
24 |   - name: kernel_0d1d2de
25 |     extension: ptx
26 |   - name: _kernel_0d1d2d345de6c78c9de1011c
27 |     extension: ptx
28 |   - name: _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11de12de13de14de15c16de17de18de19c20de21de22de23c2425de26de
29 |     extension: ptx
30 |   - name: _fwd_kernel_0d1d2d34d5d6de7de8de9c10de11de12de13c14de15de16de17c18de19de20de21c2223de24de
31 |     extension: ptx
32 |   - name: _bwd_preprocess_0d1d2d
33 |     extension: ptx
34 | 


--------------------------------------------------------------------------------
/python/test/unit/cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/test/unit/cuda/__init__.py


--------------------------------------------------------------------------------
/python/test/unit/cuda/test_tensor_descriptor.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import triton
 4 | from triton._internal_testing import requires_tma
 5 | from triton.tools.tensor_descriptor import TensorDescriptor
 6 | 
 7 | 
 8 | @requires_tma
 9 | def test_specialization_after_host_tensordesc():
10 | 
11 |     @triton.jit
12 |     def kernel(a, b):
13 |         pass
14 | 
15 |     device = "cuda"
16 |     A = torch.randn(1024, device=device)
17 |     desc = TensorDescriptor.from_tensor(A, [128])
18 |     h = kernel.warmup(desc, 16, grid=(1, ))
19 |     assert ", %arg3: i32 {tt.divisibility = 16 : i32}" in h.asm["ttir"]
20 | 


--------------------------------------------------------------------------------
/python/test/unit/language/test_module.py:
--------------------------------------------------------------------------------
1 | import triton
2 | 
3 | 
4 | @triton.jit
5 | def function_with_name():
6 |     pass
7 | 


--------------------------------------------------------------------------------
/python/test/unit/language/test_reproducer.py:
--------------------------------------------------------------------------------
 1 | import triton
 2 | import re
 3 | 
 4 | 
 5 | def test_triton_reproducer_path(monkeypatch, tmp_path):
 6 |     """Check that TRITON_REPRODUCER_PATH yields an MLIR reproducer with a non-empty pass pipeline."""
 7 |     # If we get a cache hit there will be no reproducer generated
 8 |     monkeypatch.setenv("TRITON_ALWAYS_COMPILE", "1")
 9 | 
10 |     @triton.jit
11 |     def triton_():
12 |         return
13 | 
14 |     # We need an empty temp file for MLIR to write the reproducer to, and then
15 |     # the TRITON_REPRODUCER_PATH env var enables crash reproducer
16 |     # generation in MLIR.
17 |     repro_path = tmp_path / "repro.mlir"
18 |     repro_path.touch()
19 |     monkeypatch.setenv("TRITON_REPRODUCER_PATH", str(repro_path))
20 | 
21 |     # Run the kernel so MLIR will generate a crash reproducer. It doesn't really
22 |     # matter what the kernel does, just that the PassManager runs its passes.
23 |     triton_[(1, )]()
24 | 
25 |     repro = repro_path.read_text()
26 |     assert "mlir_reproducer" in repro, f"Expected MLIR reproducer in {repro_path}. Got:\n{repro}"
27 |     m = re.search(r"pipeline: \"(.*)\"", repro)
28 |     assert m, "Expected to match pass pipeline after \"pipeline:\" in MLIR reproducer"
29 |     pipeline_str = m.group(1)
30 |     assert pipeline_str, "Expected non-empty pass pipeline in MLIR reproducer"
31 | 


--------------------------------------------------------------------------------
/python/test/unit/runtime/test_jit.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import pytest
 3 | import torch
 4 | 
 5 | import triton
 6 | import triton.language as tl
 7 | 
 8 | 
 9 | def test_pre_call_hooks(device):
10 | 
11 |     @triton.jit
12 |     def add_kernel(
13 |         in_ptr0,
14 |         in_ptr1,
15 |         out_ptr,
16 |         n_elements,
17 |         BLOCK_SIZE: "tl.constexpr",
18 |     ):
19 |         pid = tl.program_id(axis=0)
20 |         block_start = pid * BLOCK_SIZE
21 |         offsets = block_start + tl.arange(0, BLOCK_SIZE)
22 |         mask = offsets < n_elements
23 |         x = tl.load(in_ptr0 + offsets, mask=mask)
24 |         y = tl.load(in_ptr1 + offsets, mask=mask)
25 |         output = x + y
26 |         tl.store(out_ptr + offsets, output, mask=mask)
27 | 
28 |     class MyTensor(torch.Tensor):
29 |         pass
30 | 
31 |     def my_hook(*args, **kwargs):
32 |         for arg in itertools.chain(args, kwargs.values()):
33 |             if isinstance(arg, MyTensor):
34 |                 raise Exception("MyTensor is not allowed")
35 | 
36 |     add_kernel.add_pre_run_hook(my_hook)
37 | 
38 |     x = torch.randn(4, device=device)
39 |     y = MyTensor(x)
40 |     out = torch.zeros_like(x)
41 |     with pytest.raises(Exception):
42 |         add_kernel[(4, )](x, y, out, 4, 4)
43 | 


--------------------------------------------------------------------------------
/python/test/unit/test_filecheck.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import triton
 3 | 
 4 | from triton._filecheck import run_filecheck_test
 5 | 
 6 | 
 7 | @triton.jit
 8 | def anchor(v):
 9 |     pass
10 | 
11 | 
12 | # Smoke test to make sure filecheck is working correctly.
13 | def test_filecheck_positive():
14 | 
15 |     @triton.jit
16 |     def test_kernel():
17 |         # CHECK-LABEL: test_kernel
18 |         scalar = 42
19 |         # CHECK: %c42_i32 = arith.constant 42 : i32
20 |         # CHECK-NEXT: call @{{.*}}anchor{{.*}}(%c42_i32) : (i32) -> ()
21 |         anchor(scalar)
22 | 
23 |     run_filecheck_test(test_kernel)
24 | 
25 | 
26 | def test_filecheck_negative():
27 | 
28 |     @triton.jit
29 |     def test_kernel():
30 |         # CHECK-LABEL: test_kernel
31 |         scalar = 11
32 |         # CHECK: %c42_i32
33 |         anchor(scalar)
34 | 
35 |     with pytest.raises(ValueError, match="expected string not found in input\n # CHECK: %c42_i32"):
36 |         run_filecheck_test(test_kernel)
37 | 


--------------------------------------------------------------------------------
/python/test/unit/tools/test_disasm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import triton
 4 | import pytest
 5 | import triton.language as tl
 6 | 
 7 | 
 8 | def test_disam_cubin():
 9 |     if not triton.runtime.driver.active.get_current_target().backend == "cuda":
10 |         pytest.skip("Test requires CUDA.")
11 | 
12 |     @triton.jit
13 |     def kernel(X, i: tl.constexpr):
14 |         tl.store(X, i)
15 | 
16 |     x = torch.empty(1, dtype=torch.int32, device='cuda')
17 |     h = kernel[(1, )](x, i=12)
18 |     assert x[0] == 12
19 |     sass = h.asm["sass"]
20 |     # check that the sass has a store instruction.
21 |     assert "STG.E" in sass
22 | 


--------------------------------------------------------------------------------
/python/triton/compiler/__init__.py:
--------------------------------------------------------------------------------
1 | from .compiler import CompiledKernel, ASTSource, IRSource, compile, make_backend, LazyDict, get_cache_key
2 | from .errors import CompilationError
3 | 
4 | __all__ = [
5 |     "compile", "make_backend", "ASTSource", "IRSource", "CompiledKernel", "CompilationError", "LazyDict",
6 |     "get_cache_key"
7 | ]
8 | 


--------------------------------------------------------------------------------
/python/triton/compiler/make_launcher.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton/compiler/make_launcher.py


--------------------------------------------------------------------------------
/python/triton/errors.py:
--------------------------------------------------------------------------------
1 | """Base class for all errors raised by Triton"""
2 | 
3 | 
4 | class TritonError(Exception):
5 |     ...
6 | 


--------------------------------------------------------------------------------
/python/triton/experimental/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton/experimental/__init__.py


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/__init__.py:
--------------------------------------------------------------------------------
1 | from . import nvidia
2 | from ._runtime import jit
3 | 
4 | __all__ = ["jit", "nvidia"]
5 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/_compiler.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton/experimental/gluon/_compiler.py


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/language/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._core import *  # NOQA: F403
 2 | from ._core import __all__ as __core_all
 3 | from ._layouts import *  # NOQA: F403
 4 | from ._layouts import __all__ as __layouts_all
 5 | from ._math import *  # NOQA: F403
 6 | from ._math import __all__ as __math_all
 7 | from ._standard import *  # NOQA: F403
 8 | from ._standard import __all__ as __standard_all
 9 | 
10 | from . import nvidia
11 | 
12 | __all__ = [
13 |     *__core_all,
14 |     *__layouts_all,
15 |     *__math_all,
16 |     *__standard_all,
17 |     "nvidia",
18 | ]
19 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/language/_math.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | import triton.language.math as tl_math
 3 | from ._core import builtin
 4 | 
 5 | __all__ = [
 6 |     "umulhi", "exp", "exp2", "fma", "log", "log2", "cos", "rsqrt", "sin", "sqrt", "sqrt_rn", "abs", "fdiv", "div_rn",
 7 |     "erf", "floor", "ceil"
 8 | ]
 9 | 
10 | for name in __all__:
11 |     fn = getattr(tl_math, name)
12 |     globals()[name] = builtin(fn)
13 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/language/nvidia/__init__.py:
--------------------------------------------------------------------------------
1 | from . import blackwell
2 | from . import hopper
3 | 
4 | __all__ = ["blackwell", "hopper"]
5 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/language/nvidia/ampere/__init__.py:
--------------------------------------------------------------------------------
1 | from . import async_copy, mbarrier
2 | 
3 | __all__ = ["async_copy", "mbarrier"]
4 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/nvidia/__init__.py:
--------------------------------------------------------------------------------
1 | from . import hopper
2 | from . import blackwell
3 | 
4 | __all__ = ["hopper", "blackwell"]
5 | 


--------------------------------------------------------------------------------
/python/triton/experimental/gluon/nvidia/blackwell.py:
--------------------------------------------------------------------------------
1 | from .hopper import TensorDescriptor
2 | 
3 | __all__ = ["TensorDescriptor"]
4 | 


--------------------------------------------------------------------------------
/python/triton/language/extra/__init__.py:
--------------------------------------------------------------------------------
 1 | import pkgutil
 2 | from importlib.util import module_from_spec
 3 | from sys import modules
 4 | 
 5 | _backends = []
 6 | for module_finder, module_name, is_pkg in pkgutil.iter_modules(
 7 |         __path__,
 8 |         prefix=__name__ + ".",
 9 | ):
10 |     # skip .py files (like libdevice.py)
11 |     if not is_pkg:
12 |         continue
13 | 
14 |     # import backends (like cuda and hip) that are included during setup.py
15 |     spec = module_finder.find_spec(module_name)
16 |     if spec is None or spec.loader is None:
17 |         continue
18 |     module = module_from_spec(spec)
19 |     spec.loader.exec_module(module)
20 | 
21 |     _backends.append(module_name)
22 |     modules[module_name] = module
23 | 
24 | __all__ = _backends
25 | 
26 | del _backends
27 | 


--------------------------------------------------------------------------------
/python/triton/runtime/__init__.py:
--------------------------------------------------------------------------------
 1 | from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics)
 2 | from .cache import RedisRemoteCacheBackend, RemoteCacheBackend
 3 | from .driver import driver
 4 | from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret
 5 | from .errors import OutOfResources, InterpreterError
 6 | 
 7 | __all__ = [
 8 |     "autotune",
 9 |     "Autotuner",
10 |     "Config",
11 |     "driver",
12 |     "Heuristics",
13 |     "heuristics",
14 |     "InterpreterError",
15 |     "JITFunction",
16 |     "KernelInterface",
17 |     "MockTensor",
18 |     "OutOfResources",
19 |     "RedisRemoteCacheBackend",
20 |     "reinterpret",
21 |     "RemoteCacheBackend",
22 |     "TensorWrapper",
23 | ]
24 | 


--------------------------------------------------------------------------------
/python/triton/runtime/_allocation.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Protocol
 2 | 
 3 | 
 4 | class Buffer(Protocol):
 5 | 
 6 |     def data_ptr(self) -> int:
 7 |         ...
 8 | 
 9 | 
10 | class Allocator(Protocol):
11 | 
12 |     def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
13 |         ...
14 | 
15 | 
16 | class NullAllocator:
17 | 
18 |     def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
19 |         raise RuntimeError("Kernel requires a runtime memory allocation, but no allocator was set. " +
20 |                            "Use triton.set_allocator to specify an allocator.")
21 | 
22 | 
23 | _allocator: Allocator = NullAllocator()
24 | 
25 | 
26 | def set_allocator(allocator: Allocator):
27 |     """
28 |     The allocator function is called during kernel launch for kernels that
29 |     require additional global memory workspace.
30 |     """
31 |     global _allocator
32 |     _allocator = allocator
33 | 


--------------------------------------------------------------------------------
/python/triton/runtime/errors.py:
--------------------------------------------------------------------------------
 1 | from ..errors import TritonError
 2 | from typing import Optional
 3 | 
 4 | 
 5 | class InterpreterError(TritonError):
 6 | 
 7 |     def __init__(self, error_message: Optional[str] = None):
 8 |         self.error_message = error_message
 9 | 
10 |     def __str__(self) -> str:
11 |         return self.error_message or ""
12 | 
13 | 
14 | class OutOfResources(TritonError):
15 | 
16 |     def __init__(self, required, limit, name):
17 |         self.required = required
18 |         self.limit = limit
19 |         self.name = name
20 | 
21 |     def __str__(self) -> str:
22 |         return f"out of resource: {self.name}, Required: {self.required}, Hardware limit: {self.limit}. Reducing block sizes or `num_stages` may help."
23 | 
24 |     def __reduce__(self):
25 |         # this is necessary to make CompilationError picklable
26 |         return (type(self), (self.required, self.limit, self.name))
27 | 
28 | 
29 | class PTXASError(TritonError):
30 | 
31 |     def __init__(self, error_message: Optional[str] = None):
32 |         self.error_message = error_message
33 | 
34 |     def __str__(self) -> str:
35 |         error_message = self.error_message or ""
36 |         return f"PTXAS error: {error_message}"
37 | 


--------------------------------------------------------------------------------
/python/triton/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton/tools/__init__.py


--------------------------------------------------------------------------------
/python/triton_kernels/.gitignore:
--------------------------------------------------------------------------------
1 | triton_bench.egg-info/
2 | triton_kernels.egg-info/
3 | 


--------------------------------------------------------------------------------
/python/triton_kernels/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "triton_kernels"
 3 | version = "1.0.0"
 4 | dependencies = ["torch", "numpy", "pytest"]
 5 | 
 6 | [build-system]
 7 | requires = ["setuptools>=64.0"]
 8 | build-backend = "setuptools.build_meta"
 9 | 
10 | [tool.setuptools.packages.find]
11 | include = ["triton_kernels*"]
12 | 


--------------------------------------------------------------------------------
/python/triton_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton_kernels/tests/__init__.py


--------------------------------------------------------------------------------
/python/triton_kernels/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | def pytest_addoption(parser):
 5 |     parser.addoption("--device", action="store", default="cuda")
 6 | 
 7 | 
 8 | @pytest.fixture
 9 | def device(request):
10 |     return request.config.getoption("--device")
11 | 


--------------------------------------------------------------------------------
/python/triton_kernels/tests/test_compaction.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from triton_kernels.compaction import compaction, compaction_torch
 4 | 
 5 | 
 6 | @pytest.mark.parametrize("n_tokens, n_cols, k, p", [
 7 |     (8192, 64, 4, 0.5),
 8 |     (8192, 64, 4, 1.0),
 9 |     (131, 128, 16, 0.6),
10 |     (496, 128, 16, 0.),
11 | ])
12 | def test_compaction(n_tokens, n_cols, k, p, device):
13 |     yi = torch.rand((n_tokens, n_cols), device=device).argsort(dim=-1)
14 |     yi = yi[:, :k].to(torch.int32)
15 |     yv = torch.randn((n_tokens, k), dtype=torch.bfloat16, device=device)
16 |     # "drop" indices from yi with probability `p`
17 |     mask = torch.zeros((n_tokens, n_cols), dtype=torch.int32, device=device)
18 |     keep = (torch.rand(yi.shape, device=device) < p)
19 |     if keep.any():
20 |         rows = torch.arange(yi.size(0), device=device).unsqueeze(1).expand_as(yi)
21 |         mask[rows[keep], yi[keep]] = 1
22 |     chunks = mask.view(*mask.shape[:-1], -1, 32)
23 |     weights = (1 << torch.arange(32, dtype=torch.int32, device=device))
24 |     bitmask = (chunks.int() * weights).sum(dim=-1)
25 |     yv_ref, yi_ref = compaction_torch(yv, yi, bitmask)
26 |     yv_tri, yi_tri = compaction(yv, yi, bitmask)
27 |     assert torch.all(yi_ref == yi_tri)
28 |     assert torch.all(yv_ref == yv_tri)
29 | 


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton_kernels/triton_kernels/__init__.py


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/compaction_details/_masked_compaction.py:
--------------------------------------------------------------------------------
 1 | import triton
 2 | import triton.language as tl
 3 | 
 4 | 
 5 | @triton.jit
 6 | def _masked_compaction(Yv, Yi, BitMask, stride_bm, stride_bn, RetYv, RetYi, sentinel, K: tl.constexpr):
 7 |     pid_m = tl.program_id(0)
 8 |     yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
 9 |     yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
10 |     div = yi // 32
11 |     rem = yi % 32
12 |     active_bits = (tl.load(BitMask + pid_m * stride_bm + div * stride_bn) >> rem) & 1
13 |     exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
14 |     rev_arange = tl.where(active_bits, 0, K - 1 - tl.arange(0, K))
15 |     write_indx = exc_cumsum + rev_arange
16 |     yv = tl.where(active_bits, yv, sentinel)
17 |     yi = tl.where(active_bits, yi, sentinel)
18 |     tl.store(RetYv + pid_m * K + write_indx, yv)
19 |     tl.store(RetYi + pid_m * K + write_indx, yi)
20 | 


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_amd.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | from triton_kernels.target_info import get_cdna_version
 4 | 
 5 | 
 6 | def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microscaling_ctx):
 7 |     lhs_width = lhs_dtype.itemsize
 8 |     rhs_width = rhs_dtype.itemsize if rhs_dtype != torch.uint8 else 0.5
 9 | 
10 |     # block_n:
11 |     n_cu = torch.cuda.get_device_properties(0).multi_processor_count
12 |     if n is not None:
13 |         if n <= 128 and (n & (n - 1)) == 0:
14 |             block_n = n
15 |         else:
16 |             block_n = max(32, min(256, triton.next_power_of_2(grid_m * n * num_xcds // n_cu)))
17 |     elif block_m > 64:
18 |         block_n = 256
19 |     else:
20 |         block_n = 128
21 | 
22 |     if get_cdna_version() == 4 and block_m == 128:
23 |         block_n = 512
24 | 
25 |     # block_k needs to match the cacheline size (128B)
26 |     block_k = int(128 // min(lhs_width, rhs_width))
27 | 
28 |     # TODO: block_k = 128 seems to work better for now.
29 |     #       perhaps due to increased number of k loops to pipeline
30 |     if microscaling_ctx.weight_scale is not None and get_cdna_version() != 4:
31 |         block_k = 128
32 |     return block_n, block_k
33 | 


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/numerics.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from dataclasses import dataclass
 3 | 
 4 | MAX_FINITE_FLOAT8E5 = 57344.0
 5 | MAX_FINITE_FLOAT8E4NV = 448.0
 6 | MAX_FINITE_FLOAT8E4B8 = 240.0
 7 | 
 8 | 
 9 | @dataclass(frozen=True)
10 | class BaseFlexData:
11 |     dtype: torch.dtype | None = None
12 | 
13 |     def view(self, x: torch.Tensor):
14 |         if self.dtype is None:
15 |             return x
16 |         return x.view(self.dtype)
17 | 
18 |     def reinterpret(self, x):
19 |         if self.dtype is None or x.dtype.itemsize > 1:
20 |             return x
21 |         return x.view(self.dtype)
22 | 
23 | 
24 | @dataclass(frozen=True)
25 | class InFlexData(BaseFlexData):
26 |     scale: torch.Tensor | None = None
27 | 
28 |     @property
29 |     def is_per_batch(self):
30 |         return False if self.scale is None else len(self.scale) > 1
31 | 
32 | 
33 | @dataclass(frozen=True)
34 | class OutFlexData(BaseFlexData):
35 |     expected_scale: torch.Tensor | None = None
36 |     actual_scale: torch.Tensor | None = None
37 |     checksum_scale: torch.Tensor | None = None
38 | 
39 |     def __iter__(self):
40 |         yield self.expected_scale
41 |         yield self.actual_scale
42 |         yield self.checksum_scale
43 | 


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/numerics_details/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton_kernels/triton_kernels/numerics_details/__init__.py


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/proton_opts.py:
--------------------------------------------------------------------------------
 1 | # proton options
 2 | 
 3 | import os
 4 | 
 5 | _launch_metadata_allow_sync = None
 6 | 
 7 | 
 8 | def launch_metadata_allow_sync():
 9 |     """Return the module-level ``_launch_metadata_allow_sync`` flag.
10 | 
11 |     If the flag is still ``None`` it is initialized first: ``False`` when the
12 |     ``PROTON_LAUNCH_METADATA_NOSYNC`` environment variable equals ``"1"``,
13 |     ``True`` otherwise. Later calls reuse the stored value.
14 |     """
15 |     global _launch_metadata_allow_sync
16 |     if _launch_metadata_allow_sync is None:
17 |         _launch_metadata_allow_sync = not (os.getenv("PROTON_LAUNCH_METADATA_NOSYNC") == "1")
18 |     return _launch_metadata_allow_sync
19 | 
20 | 
21 | def set_launch_metadata_allow_sync(allow_sync: bool):
22 |     """Set the flag explicitly; once set, the environment variable is no longer consulted."""
23 |     global _launch_metadata_allow_sync
24 |     _launch_metadata_allow_sync = allow_sync
25 | 


--------------------------------------------------------------------------------
/python/triton_kernels/triton_kernels/topk_details/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/python/triton_kernels/triton_kernels/topk_details/__init__.py


--------------------------------------------------------------------------------
/python/tutorials/README.rst:
--------------------------------------------------------------------------------
 1 | Tutorials
 2 | =========
 3 | 
 4 | Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.
 5 | 
 6 | To install the dependencies for the tutorials:
 7 | 
 8 | .. code-block:: bash
 9 | 
10 |     cd triton
11 |     pip install -e './python[tutorials]'
12 | 


--------------------------------------------------------------------------------
/test/Analysis/amd/test-alignment.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -test-print-amd-alignment -split-input-file -verify-diagnostics=only-expected -o /dev/null
 2 | 
 3 | #mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
 4 | 
 5 | tt.func public @kernel(%arg0: tensor<256x64xf16, #mma> {tt.contiguity=256 : i32, tt.divisibility=6: i32, tt.constancy=1: i32}) attributes {noinline = false} {
 6 |   // expected-remark @below {{contiguity = [128, 32], divisibility = [6, 6], constancy = [1, 1], constant_value = <none>}}
 7 |   %0 = amdgpu.extract_slice %arg0 [128, 32] : tensor<256x64xf16, #mma> to tensor<128x32xf16, #mma>
 8 |   tt.return
 9 | }
10 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Build the helper libraries (test passes) used by the lit tests.
 2 | add_subdirectory(lib)
 3 | 
 4 | # Normalize the boolean so it can be substituted into lit.site.cfg.py.in.
 5 | llvm_canonicalize_cmake_booleans(
 6 |   MLIR_ENABLE_BINDINGS_PYTHON
 7 | )
 8 | 
 9 | # Generate lit.site.cfg.py in the build tree from the template.
10 | # MAIN_CONFIG names the main lit configuration in the source tree. CMake
11 | # variable names are case-sensitive, so it must be spelled
12 | # CMAKE_CURRENT_SOURCE_DIR exactly; a misspelling expands to an empty string.
13 | configure_lit_site_cfg(
14 |   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
15 |   ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
16 |   MAIN_CONFIG
17 |   ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
18 | )
19 | 
20 | # Tools that must be built before the lit tests can run.
21 | set(TRITON_TEST_DEPENDS
22 |   triton-opt
23 |   triton-tensor-layout
24 |   triton-llvm-opt
25 | )
26 | 
27 | # Point lit at the FileCheck binary located relative to the LLVM library dir.
28 | set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck")
29 | set(LIT_ARGS "-Dfilecheck=${FILECHECK_PATH}")
30 | 
31 | # Aggregate target that runs the suite configured in the build directory.
32 | add_lit_testsuite(check-triton-lit-tests "Running the triton regression tests"
33 |   ${CMAKE_CURRENT_BINARY_DIR}
34 |   ARGS ${LIT_ARGS}
35 |   DEPENDS ${TRITON_TEST_DEPENDS}
36 |   )
37 | 
38 | set_target_properties(check-triton-lit-tests PROPERTIES FOLDER "Tests")
39 | 
40 | # Additional lit targets derived from the source test directory.
41 | add_lit_testsuites(TRITON-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS})
42 | 


--------------------------------------------------------------------------------
/test/Conversion/allocate_shared_memory.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s --allocate-shared-memory | FileCheck %s
 2 | 
 3 | #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
 4 | 
 5 | // CHECK-LABEL: module
 6 | // CHECK-SAME: ttg.shared = 131072 : i32
 7 | module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 8 | 
 9 | // CHECK-LABEL: @gather_op
10 | // TODO(jeff): Optimize the lowering to reduce shared memory usage.
11 | tt.func @gather_op(%arg0: tensor<1024x256xi32, #blocked>, %arg1: tensor<128x256xf32, #blocked>) {
12 |   // CHECK-NEXT: allocation.offset = 0 : i32
13 |   %0 = tt.gather %arg1[%arg0] {axis = 0 : i32} : (tensor<128x256xf32, #blocked>, tensor<1024x256xi32, #blocked>) -> tensor<1024x256xf32, #blocked>
14 |   tt.return
15 | }
16 | 
17 | }
18 | 


--------------------------------------------------------------------------------
/test/Conversion/amd/builtin_func_to_llvm.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=True" --convert-builtin-func-to-llvm="ftz=True" | FileCheck %s --check-prefix=LLVM_FTZ
 2 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=False" --convert-builtin-func-to-llvm="ftz=False" | FileCheck %s --check-prefix=LLVM_NO_FTZ
 3 | 
 4 | #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 5 | module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
 6 |   tt.func public @test_fast_expf(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
 7 |     // LLVM_FTZ: llvm.amdgcn.exp2.f32
 8 |     // LLVM_NO_FTZ: llvm.exp2.f32
 9 |     %0 = tt.extern_elementwise %arg0 {libname = "libdevice", libpath = "", pure = true, symbol = "__triton_hip_fast_expf"} : (tensor<64xf32, #blocked>) -> tensor<64xf32, #blocked>
10 |     tt.return
11 |   }
12 | }
13 | 


--------------------------------------------------------------------------------
/test/Conversion/amd/minmax.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s --check-prefix=GFX942
 2 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck %s --check-prefix=GFX950
 3 | 
 4 | module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
 5 | 
 6 | // GFX942: llvm.func @min_max
 7 | // GFX942-COUNT-2: llvm.fcmp
 8 | // GFX942: llvm.or
 9 | // GFX942: llvm.intr.minnum
10 | // GFX942-COUNT-2: llvm.fcmp
11 | // GFX942: llvm.or
12 | // GFX942: llvm.intr.maxnum
13 | 
14 | // GFX950: llvm.func @min_max
15 | // GFX950-NEXT: llvm.intr.minimum
16 | // GFX950-NEXT: llvm.intr.maximum
17 |   tt.func public @min_max(%arg0: f32, %arg1: f32) {
18 |     %0 = arith.minimumf %arg0, %arg1 : f32
19 |     %1 = arith.maximumf %arg0, %arg1 : f32
20 |     tt.return
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/test/Conversion/divide-by-0.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s --allocate-shared-memory --convert-triton-gpu-to-llvm --cse | FileCheck %s
 2 | 
 3 | // CHECK-LABEL: dont_divide_0
 4 | // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
 5 | // CHECK-NOT: llvm.urem %{{.*}}, %[[C0]]
 6 | #blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 7 | #mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 8]}>
 8 | module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
 9 |   tt.func public @dont_divide_0() attributes {noinline = false} {
10 |     %zero = arith.constant dense<0.000000e+00> : tensor<16x1xf32, #mma>
11 |     %cvt = ttg.convert_layout %zero : tensor<16x1xf32, #mma> -> tensor<16x1xf32, #blocked>
12 |     tt.return
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/test/Conversion/tritongpu_to_llvm_debug.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm --debug| FileCheck %s
 2 | 
 3 | // CHECK-LABEL: convert_identity
 4 | #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 5 | #smem = #ttg.shared_memory
 6 | module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
 7 |   tt.func public @convert_identity(%arg0: tensor<128x128xf16, #blocked>) attributes {noinline = false} {
 8 |     %1 = ttg.convert_layout %arg0 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked>
 9 |     tt.return
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/test/Hopper/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(WarpSpecialization)  # descend into the WarpSpecialization test directory
2 | 


--------------------------------------------------------------------------------
/test/LLVMIR/break-phi-struct.ll:
--------------------------------------------------------------------------------
 1 | ; RUN: triton-llvm-opt -break-struct-phi-nodes %s | FileCheck %s
 2 | 
 3 | ; CHECK-LABEL: struct
 4 | define {i32, i32} @struct(i1 %c) {
 5 | ; CHECK: br i1 %{{.*}}, label [[TRUE:%.*]], label [[FALSE:%.*]]
 6 |   br i1 %c, label %true, label %false
 7 | 
 8 | true:
 9 |   %s.1 = insertvalue {i32, i32} undef, i32 20, 0
10 |   %s.2 = insertvalue {i32, i32} %s.1, i32 200, 1
11 | 
12 | ; CHECK-DAG: [[E0:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0
13 | ; CHECK-DAG: [[E1:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1
14 | ; CHECK: br
15 |   br label %exit
16 | 
17 | false:
18 |   %s.3 = insertvalue {i32, i32} undef, i32 30, 0
19 |   %s.4 = insertvalue {i32, i32} %s.3, i32 300, 1
20 | ; CHECK-DAG: [[E2:%.*]] = extractvalue { i32, i32 } %{{.*}}, 0
21 | ; CHECK-DAG: [[E3:%.*]] = extractvalue { i32, i32 } %{{.*}}, 1
22 | ; CHECK: br
23 |   br label %exit
24 | 
25 | exit:
26 | ; CHECK-DAG: [[PHI0:%.*]] = phi i32 [ [[E0]], [[TRUE]] ], [ [[E2]], [[FALSE]] ]
27 | ; CHECK-DAG: [[PHI1:%.*]] = phi i32 [ [[E1]], [[TRUE]] ], [ [[E3]], [[FALSE]] ]
28 | ; CHECK: [[S0:%.*]] = insertvalue { i32, i32 } undef, i32 [[PHI0]], 0
29 | ; CHECK: [[S1:%.*]] = insertvalue { i32, i32 } [[S0]], i32 [[PHI1]], 1
30 | ; CHECK: ret { i32, i32 } [[S1]]
31 |   %r = phi {i32, i32} [ %s.2, %true], [ %s.4, %false ]
32 |   ret {i32, i32} %r
33 | }
34 | 


--------------------------------------------------------------------------------
/test/Proton/ops.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt --split-input-file %s -cse -canonicalize | FileCheck %s
 2 | 
 3 | module {
 4 |   // CHECK-LABEL: proton_record
 5 |   tt.func @proton_record() {
 6 |     // CHECK: proton.record() {isStart = true, regionId = 1 : i32}
 7 |     // CHECK-NEXT: proton.record() {isStart = false, regionId = 1 : i32}
 8 |     // CHECK-NEXT: tt.return
 9 |     proton.record() {isStart = true, regionId = 1 : i32}
10 |     proton.record() {isStart = false, regionId = 1 : i32}
11 |     tt.return
12 |   }
13 | } // end module
14 | 
15 | // -----
16 | 


--------------------------------------------------------------------------------
/test/Triton/reproducer.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt --verify-diagnostics --dump-pass-pipeline --run-reproducer %s 2>&1 | FileCheck %s
 2 | 
 3 | module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
 4 |   tt.func public @triton__() attributes {noinline = false} {
 5 |     tt.return
 6 |   }
 7 | }
 8 | 
 9 | {-#
10 |   external_resources: {
11 |     mlir_reproducer: {
12 |       pipeline: "builtin.module(any(convert-scf-to-cf,convert-index-to-llvm{index-bitwidth=0},convert-triton-gpu-to-llvm{compute-capability=90},convert-nv-gpu-to-llvm,convert-arith-to-llvm{index-bitwidth=0},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,symbol-dce,enable-line-info))",
13 |       disable_threading: false,
14 |       verify_each: false
15 |     }
16 |   }
17 | #-}
18 | 
19 | // CHECK: Pass Manager with
20 | // CHECK: convert-triton-gpu-to-llvm
21 | 


--------------------------------------------------------------------------------
/test/Triton/verify-make-range.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt --split-input-file %s --verify-diagnostics
 2 | 
 3 | tt.func public @i64_tensor() {
 4 |     // expected-error @+1 {{i32 elements}}
 5 |     %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16xi64>
 6 |     tt.return
 7 | }
 8 | 
 9 | // -----
10 | tt.func public @i32_scalar() {
11 |     // expected-error @+1 {{invalid kind of type}}
12 |     %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : i32
13 |     tt.return
14 | }
15 | 
16 | // -----
17 | tt.func public @_2d_tensor() {
18 |     // expected-error @+1 {{must be a 1D tensor}}
19 |     %a = tt.make_range { start = 0 : i32, end = 16 : i32 } : tensor<16x1xi32>
20 |     tt.return
21 | }
22 | 
23 | // -----
24 | tt.func public @bad_start_end() {
25 |     // expected-error @+1 {{start must be less than end}}
26 |     %a = tt.make_range { start = 0 : i32, end = -16 : i32 } : tensor<16xi32>
27 |     tt.return
28 | }
29 | 
30 | // -----
31 | tt.func public @bad_num_elems() {
32 |     // expected-error @+1 {{number of elements}}
33 |     %a = tt.make_range { start = 0 : i32, end = 32 : i32 } : tensor<16xi32>
34 |     tt.return
35 | }
36 | 
37 | // -----
38 | 
39 | tt.func @same_start_end() {
40 |   // expected-error @+1 {{'tt.make_range' op start must be less than end}}
41 |   %0 = tt.make_range{end = 1 : i32, start = 1 : i32} : tensor<0xi32>
42 |   tt.return
43 | }
44 | 


--------------------------------------------------------------------------------
/test/TritonGPU/amd/amd-conditional-barrier.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
 2 | 
 3 | module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
 4 |   tt.func @conditional_barrier() {
 5 |     // CHECK-LABEL: llvm.func @conditional_barrier
 6 | 
 7 |     // CHECK:   %[[CMP0:.+]] = llvm.icmp "ne" %3, %1 : i32
 8 |     // CHECK:   %[[CMP1:.+]] = llvm.icmp "eq" %3, %1 : i32
 9 |     // CHECK:   llvm.cond_br %[[CMP0]], ^bb1, ^bb2
10 |     // CHECK: ^bb1:
11 |     // CHECK:   rocdl.s.barrier
12 |     // CHECK:   llvm.br ^bb2
13 |     // CHECK: ^bb2:
14 |     // CHECK:   llvm.add
15 |     // CHECK:   llvm.cond_br %[[CMP1]], ^bb3, ^bb4
16 |     // CHECK: ^bb3:
17 |     // CHECK:   rocdl.s.barrier
18 |     // CHECK:   llvm.br ^bb4
19 |     // CHECK: ^bb4:
20 |     // CHECK:   llvm.return
21 | 
22 |     %c256_i32 = arith.constant 256 : i32
23 |     %c0_i32 = arith.constant 0 : i32
24 |     %0 = rocdl.workitem.id.x : i32
25 |     %1 = arith.divsi %0, %c256_i32 : i32
26 |     %2 = arith.cmpi ne, %1, %c0_i32 : i32
27 |     %3 = arith.cmpi eq, %1, %c0_i32 : i32
28 |     amdgpu.cond_barrier %2
29 |     %4 = arith.addi %0, %c256_i32 : i32
30 |     amdgpu.cond_barrier %3
31 |     tt.return
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/test/TritonGPU/global_scratch_to_llvm.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -allow-unregistered-dialect --tritongpu-global-scratch-memory-allocation --convert-triton-gpu-to-llvm | FileCheck %s
 2 | 
 3 | module attributes {"ttg.num-warps" = 4 : i32} {
 4 |   // CHECK-LABEL: @global_scratch_alloc_warpgroup(%arg0: !llvm.ptr<1>)
 5 |   tt.func @global_scratch_alloc_warpgroup() {
 6 |     // CHECK-NEXT: ttg.warp_specialize(%arg0)
 7 |     ttg.warp_specialize()
 8 |     default {
 9 |       ttg.warp_yield
10 |     }
11 |     // CHECK: partition0(%arg1: !llvm.ptr<1>)
12 |     partition0() num_warps(1) {
13 |       // CHECK-COUNT-2: llvm.getelementptr %arg1
14 |       %0 = ttg.global_scratch_alloc {alignment = 8 : i32, nbytes = 100 : i32, ttg.global_scratch_memory_offset = 0 : i32} : !tt.ptr<i8>
15 |       %1 = ttg.global_scratch_alloc {alignment = 8 : i32, nbytes = 100 : i32, ttg.global_scratch_memory_offset = 0 : i32} : !tt.ptr<i8>
16 |       "use"(%0, %1) : (!tt.ptr<i8>, !tt.ptr<i8>) -> ()
17 |       ttg.warp_return
18 |     } : () -> ()
19 |     tt.return
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/test/TritonGPU/inline.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -inline | FileCheck %s
 2 | 
 3 | #smem = #ttg.shared_memory
 4 | #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
 5 | 
 6 | // CHECK-LABEL: @inline_in_warp_specialize
 7 | tt.func public @inline_in_warp_specialize(%arg0: !ttg.memdesc<1xi32, #shared, #smem, mutable>) {
 8 |   ttg.warp_specialize(%arg0)
 9 |   default {
10 |     ttg.warp_yield
11 |   }
12 |   // CHECK: partition0
13 |   partition0(%arg1: !ttg.memdesc<1xi32, #shared, #smem, mutable>) num_warps(4) {
14 |     // CHECK-NEXT: %cst = arith.constant dense<1> : tensor<1xi32>
15 |     // CHECK-NEXT: local_store %cst, %arg1
16 |     tt.call @store_1(%arg1) : (!ttg.memdesc<1xi32, #shared, #smem, mutable>) -> ()
17 |     // CHECK-NEXT: warp_return
18 |     ttg.warp_return
19 |   } : (!ttg.memdesc<1xi32, #shared, #smem, mutable>) -> ()
20 |   tt.return
21 | }
22 | 
23 | tt.func private @store_1(%arg0: !ttg.memdesc<1xi32, #shared, #smem, mutable>) attributes {noinline = false} {
24 |   %cst = arith.constant dense<1> : tensor<1xi32>
25 |   ttg.local_store %cst, %arg0 : tensor<1xi32> -> !ttg.memdesc<1xi32, #shared, #smem, mutable>
26 |   tt.return
27 | }
28 | 


--------------------------------------------------------------------------------
/test/TritonGPU/tf32x3-matmul.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -tritongpu-F32DotTC -canonicalize  | FileCheck %s --check-prefixes=CHECK
 2 | 
 3 | // CHECK:     %[[DOT1:.*]] = tt.dot %[[LHS_LOW:.*]], %[[RHS_HIGH:.*]], %cst, inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
 4 | // CHECK:     %[[DOT2:.*]] = tt.dot %[[LHS_HIGH:.*]], %[[RHS_LOW:.*]], %[[DOT1]], inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
 5 | // CHECK:     %[[CMP:.*]] = arith.cmpf uno, %[[DOT2]], %[[DOT2]] : tensor<16x16xf32>
 6 | // CHECK:     %[[MASKED:.*]] = arith.select %[[CMP]], %cst, %[[DOT2]] : tensor<16x16xi1>, tensor<16x16xf32>
 7 | // CHECK:     %[[RESULT:.*]] = tt.dot %[[LHS_HIGH]], %[[RHS_HIGH]], %[[MASKED]], inputPrecision = tf32 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
 8 | 
 9 | module {
10 |   tt.func @dot_test(%arg0: tensor<16x16xf32>, %arg1: tensor<16x16xf32>, %arg2: tensor<16x16xf32>) -> tensor<16x16xf32> {
11 |     %4 = tt.dot %arg0, %arg1, %arg2, inputPrecision = tf32x3 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
12 |     tt.return %4 : tensor<16x16xf32>
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/test/TritonNvidiaGPU/canonicalize.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -canonicalize | FileCheck %s
 2 | 
 3 | #linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0], [0, 0]], block = []}>
 4 | #tmem_scales = #ttng.tensor_memory_scales_encoding<>
 5 | #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
 6 | module attributes {"ttg.num-warps" = 8 : i32, "ttg.num-ctas" = 1 : i32, "ttg.target" = "cuda:80"} {
 7 | 
 8 | // CHECK-LABEL: @test_dce_tmem_alloc
 9 | tt.func @test_dce_tmem_alloc(%arg: tensor<128x4xi8, #linear>) {
10 |   // CHECK-NOT: ttng.tmem_alloc
11 |   %a = ttng.tmem_alloc %arg : (tensor<128x4xi8, #linear>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>
12 |   // CHECK-NEXT: tt.return
13 |   tt.return
14 | }
15 | 
16 | // CHECK-LABEL: @reinterpret_fold
17 | tt.func @reinterpret_fold(%arg0: !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>) -> !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory> {
18 |   %0 = ttg.memdesc_reinterpret %arg0 : !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory> -> !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>
19 |   // CHECK-NEXT: return %arg0
20 |   tt.return %0 : !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>
21 | }
22 | 
23 | }  // end module
24 | 


--------------------------------------------------------------------------------
/test/TritonNvidiaGPU/inline.mlir:
--------------------------------------------------------------------------------
 1 | // RUN: triton-opt %s -inline | FileCheck %s
 2 | 
 3 | #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 4 | #smem = #ttg.shared_memory
 5 | 
 6 | module attributes {"ttg.num-warps" = 4 : i32} {
 7 | 
 8 | // CHECK-LABEL: @inline_ttng_ops
 9 | tt.func public @inline_ttng_ops() {
10 |   // CHECK-NEXT: ttg.local_alloc
11 |   // CHECK-NEXT: ttng.init_barrier
12 |   tt.call @function_with_ttng_ops() : () -> ()
13 |   tt.return
14 | }
15 | 
16 | tt.func private @function_with_ttng_ops() {
17 |   %0 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared, #smem, mutable>
18 |   ttng.init_barrier %0, 1 : !ttg.memdesc<1xi64, #shared, #smem, mutable>
19 |   tt.return
20 | }
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/test/lib/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_mlir_library(TritonTestAnalysis  # library bundling the analysis test passes
 2 |   TestAlias.cpp
 3 |   TestAxisInfo.cpp
 4 |   TestAllocation.cpp
 5 |   TestMembar.cpp
 6 |   # Linked PUBLIC, so consumers of TritonTestAnalysis get these as well.
 7 |   LINK_LIBS PUBLIC
 8 |   MLIRPass
 9 |   TritonAnalysis
10 | )
11 | 


--------------------------------------------------------------------------------
/test/lib/Analysis/TestAxisInfo.cpp:
--------------------------------------------------------------------------------
1 | #include "test/include/Analysis/TestAxisInfo.h"
2 | 
3 | namespace mlir::test {
4 | 
5 | // Registers TestAxisInfoPass with MLIR's pass registry.
6 | void registerTestAlignmentPass() { PassRegistration<TestAxisInfoPass>(); }
7 | 
8 | } // namespace mlir::test
9 | 


--------------------------------------------------------------------------------
/test/lib/Analysis/TestMembar.cpp:
--------------------------------------------------------------------------------
 1 | #include "../third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Utility.h"
 2 | #include "mlir/Pass/Pass.h"
 3 | #include "mlir/Transforms/DialectConversion.h"
 4 | #include "triton/Analysis/Allocation.h"
 5 | #include "triton/Analysis/Membar.h"
 6 | 
 7 | using namespace mlir;
 8 | 
 9 | namespace {
10 | 
11 | // Test-only pass that runs the membar analysis over a whole module.
12 | struct TestMembarPass
13 |     : public PassWrapper<TestMembarPass, OperationPass<ModuleOp>> {
14 |   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass);
15 | 
16 |   StringRef getArgument() const final { return "test-print-membar"; }
17 |   StringRef getDescription() const final {
18 |     return "print the result of the membar pass";
19 |   }
20 | 
21 |   void runOnOperation() override {
22 |     // The pass is anchored on ModuleOp, so getOperation() needs no cast.
23 |     ModuleOp moduleOp = getOperation();
24 |     // The membar analysis is constructed from the module's allocation.
25 |     ModuleAllocation allocation(moduleOp);
26 |     ModuleMembarAnalysis membarPass(&allocation,
27 |                                     mlir::triton::NVIDIA::canSkipBarSync);
28 |     membarPass.run();
29 |   }
30 | };
31 | 
32 | } // namespace
33 | 
34 | namespace mlir::test {
35 | 
36 | // Registers TestMembarPass with MLIR's pass registry.
37 | void registerTestMembarPass() { PassRegistration<TestMembarPass>(); }
38 | 
39 | } // namespace mlir::test
40 | 


--------------------------------------------------------------------------------
/test/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Analysis)         # builds the TritonTestAnalysis library
2 | add_subdirectory(Dialect)          # builds the TritonTestDialect library
3 | add_subdirectory(Instrumentation)  # presumably instrumentation test helpers; confirm
4 | 


--------------------------------------------------------------------------------
/test/lib/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_mlir_library(TritonTestDialect  # library holding the dialect-level test passes
2 |   TestLoopPeeling.cpp
3 |   # Linked PUBLIC, so consumers of TritonTestDialect get these as well.
4 |   LINK_LIBS PUBLIC
5 |   MLIRPass
6 |   TritonTransforms
7 | )
8 | 


--------------------------------------------------------------------------------
/test/lib/Dialect/TestLoopPeeling.cpp:
--------------------------------------------------------------------------------
 1 | #include "mlir/Pass/Pass.h"
 2 | #include "triton/Dialect/Triton/Transforms/LoopPeeling.h"
 3 | 
 4 | using namespace mlir;
 5 | 
 6 | namespace {
 7 | 
 8 | // Returns true when `forOp` carries the `__test_peel_epilogue` attribute.
 9 | bool getPeelEpilogue(scf::ForOp forOp) {
10 |   return forOp->hasAttr("__test_peel_epilogue");
11 | }
12 | 
13 | // Test-only pass: calls peelLoopEpilogue on every scf.for tagged for peeling.
14 | struct TestLoopPeelingPass
15 |     : public PassWrapper<TestLoopPeelingPass, OperationPass<ModuleOp>> {
16 |   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestLoopPeelingPass);
17 | 
18 |   StringRef getArgument() const final { return "triton-test-loop-peeling"; }
19 |   StringRef getDescription() const final {
20 |     return "test the loop peeling pass";
21 |   }
22 | 
23 |   void runOnOperation() override {
24 |     // Only loops opted in via the test attribute are peeled.
25 |     getOperation().walk([&](scf::ForOp forOp) {
26 |       if (getPeelEpilogue(forOp))
27 |         mlir::triton::peelLoopEpilogue(forOp);
28 |     });
29 |   }
30 | };
31 | 
32 | } // namespace
33 | 
34 | namespace mlir::test {
35 | 
36 | // Registers TestLoopPeelingPass with MLIR's pass registry.
37 | void registerTestLoopPeelingPass() { PassRegistration<TestLoopPeelingPass>(); }
38 | 
39 | } // namespace mlir::test
40 | 


--------------------------------------------------------------------------------
/test/lit.site.cfg.py.in:
--------------------------------------------------------------------------------
 1 | @LIT_SITE_CFG_IN_HEADER@
 2 | 
 3 | import sys
 4 | 
 5 | config.triton_obj_root = "@triton_BINARY_DIR@"
 6 | config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 7 | config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 8 | config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
 9 | config.llvm_lib_dir = "@LLVM_LIBS_DIR@"
10 | config.llvm_shlib_dir = "@CMAKE_LIBRARY_OUTPUT_DIRECTORY@"
11 | config.llvm_shlib_ext = "@CMAKE_SHARED_LIBRARY_SUFFIX@"
12 | config.llvm_exe_ext = "@EXEEXT@"
13 | config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
14 | config.mlir_binary_dir = "@MLIR_BINARY_DIR@"
15 | config.python_executable = "@Python3_EXECUTABLE@"
16 | config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
17 | 
18 | 
19 | import lit.llvm
20 | lit.llvm.initialize(lit_config, config)
21 | 
22 | # Let the main config do the real work
23 | lit_config.load_config(config, "@triton_SOURCE_DIR@/test/lit.cfg.py")
24 | 


--------------------------------------------------------------------------------
/third_party/amd/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)  # headers from the source tree
 2 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)  # headers from the build tree (presumably generated; confirm)
 3 | add_subdirectory(include)
 4 | add_subdirectory(lib)
 5 | if(TRITON_BUILD_PYTHON_MODULE)  # the Python plugin is only built on request
 6 |   add_triton_plugin(TritonAMD ${CMAKE_CURRENT_SOURCE_DIR}/python/triton_amd.cc LINK_LIBS TritonAMDGPUToLLVM TritonAMDGPUTransforms TritonAMDGPUDialectToLLVM)
 7 |   target_link_libraries(TritonAMD PRIVATE Python3::Module pybind11::headers)
 8 | endif()
 9 | if(TRITON_BUILD_UT)  # unit tests are only built on request
10 |   add_subdirectory(unittest)
11 | endif()
12 | add_subdirectory(test)  # always added, unlike unittest above
13 | 


--------------------------------------------------------------------------------
/third_party/amd/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/amd/backend/__init__.py


--------------------------------------------------------------------------------
/third_party/amd/backend/include/hip/amd_detail/grid_launch_GGL.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | #pragma once
23 | 
24 | #if GENERIC_GRID_LAUNCH == 1
25 | #include "macro_based_grid_launch.hpp"
26 | #endif  // GENERIC_GRID_LAUNCH


--------------------------------------------------------------------------------
/third_party/amd/backend/include/hip/hip_texture_types.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | */
22 | 
23 | 
24 | #ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
25 | #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
26 | 
27 | #include <hip/texture_types.h>
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/third_party/amd/backend/include/hip/hip_version.h:
--------------------------------------------------------------------------------
 1 | // Auto-generated by cmake
 2 | 
 3 | #ifndef HIP_VERSION_H
 4 | #define HIP_VERSION_H
 5 | 
 6 | #define HIP_VERSION_MAJOR 6
 7 | #define HIP_VERSION_MINOR 2
 8 | #define HIP_VERSION_PATCH 41134
 9 | #define HIP_VERSION_GITHASH "65d174c3e"
10 | #define HIP_VERSION_BUILD_ID 0
11 | #define HIP_VERSION_BUILD_NAME ""
12 | #define HIP_VERSION    (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
13 | 
14 | #define __HIP_HAS_GET_PCH 1
15 | 
16 | #endif
17 | 
18 | 


--------------------------------------------------------------------------------
/third_party/amd/backend/lib/asanrtl.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/amd/backend/lib/asanrtl.bc


--------------------------------------------------------------------------------
/third_party/amd/backend/lib/ockl.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/amd/backend/lib/ockl.bc


--------------------------------------------------------------------------------
/third_party/amd/backend/lib/ocml.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/amd/backend/lib/ocml.bc


--------------------------------------------------------------------------------
/third_party/amd/include/Analysis/AMDGPUAllocation.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
 2 | #define TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
 3 | 
 4 | #include "mlir/IR/BuiltinTypes.h"
 5 | #include "mlir/IR/Operation.h"
 6 | 
 7 | namespace mlir::triton::AMD {
 8 | // Attribute name; presumably requests padded scratch shared memory (confirm).
 9 | constexpr char AttrSharedMemPadded[] = "amdgpu.use_padded_scratch_shmem";
10 | // Scratch size, in bytes per the name, for converting srcTy to dstTy.
11 | unsigned getConvertLayoutScratchInBytes(RankedTensorType srcTy,
12 |                                         RankedTensorType dstTy,
13 |                                         bool usePadding);
14 | // Per-op scratch-size callback handed to ModuleAllocation by the AMD pass.
15 | unsigned AMDAllocationAnalysisScratchSizeFn(Operation *op);
16 | 
17 | } // namespace mlir::triton::AMD
18 | 
19 | #endif // TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
20 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Analysis/AxisInfoExt.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONAMD_ANALYSIS_AXIS_INFO_EXT_H
 2 | #define TRITONAMD_ANALYSIS_AXIS_INFO_EXT_H
 3 | 
 4 | #include "include/triton/Analysis/AxisInfo.h"
 5 | 
 6 | namespace mlir::triton::AMD {
 7 | // Hook that appends the AMD-specific visitors to an AxisInfo visitor list.
 8 | struct AxisInfoExt {
 9 |   static void addVisitors(mlir::triton::AxisInfoVisitorList &visitors);
10 | };
11 | // Axis-info analysis that passes AxisInfoExt::addVisitors to the base class.
12 | class ModuleAxisInfoAnalysis : public mlir::triton::ModuleAxisInfoAnalysis {
13 | public:
14 |   explicit ModuleAxisInfoAnalysis(ModuleOp moduleOp)
15 |       : mlir::triton::ModuleAxisInfoAnalysis(moduleOp,
16 |                                              AxisInfoExt::addVisitors) {}
17 | };
18 | } // namespace mlir::triton::AMD
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/third_party/amd/include/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Dialect)  # TritonAMDGPU dialect TableGen targets
2 | add_subdirectory(TritonAMDGPUToLLVM)  # conversion pass declarations
3 | add_subdirectory(TritonAMDGPUTransforms)  # transform pass declarations
4 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonAMDGPU)  # the only dialect added from this directory
2 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Dialect/TritonAMDGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)  # TableGen targets for the dialect's ops, enums, and attrs
2 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Dialect/TritonAMDGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})  # NOTE(review): presumably read by add_mlir_doc; confirm
 2 | # Dialect/op declarations and definitions come from TritonAMDGPUOps.td.
 3 | set(LLVM_TARGET_DEFINITIONS TritonAMDGPUOps.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=amdgpu)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=amdgpu)
 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
 7 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 9 | add_mlir_doc(TritonAMDGPUDialect TritonAMDGPUDialect dialects/ -gen-dialect-doc)
10 | add_mlir_doc(TritonAMDGPUOps TritonAMDGPUOps dialects/ -gen-op-doc)
11 | add_public_tablegen_target(TritonAMDGPUTableGen)
12 | # Enum and attribute declarations/definitions come from TritonAMDGPUAttrDefs.td.
13 | set(LLVM_TARGET_DEFINITIONS TritonAMDGPUAttrDefs.td)
14 | mlir_tablegen(TritonAMDGPUEnums.h.inc -gen-enum-decls)
15 | mlir_tablegen(TritonAMDGPUEnums.cpp.inc -gen-enum-defs)
16 | mlir_tablegen(TritonAMDGPUAttrDefs.h.inc -gen-attrdef-decls)
17 | mlir_tablegen(TritonAMDGPUAttrDefs.cpp.inc -gen-attrdef-defs)
18 | add_public_tablegen_target(TritonAMDGPUAttrDefsIncGen)
19 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Dialect/TritonAMDGPU/Utility/CommonUtils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_
 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_
 3 | 
 4 | #include "mlir/Dialect/SCF/IR/SCF.h"
 5 | #include "triton/Dialect/Triton/IR/Dialect.h"
 6 | 
 7 | namespace mlir::triton::AMD {
 8 | SmallVector<scf::ForOp> getLeafForOps(triton::FuncOp funcOp);
 9 | } // namespace mlir::triton::AMD
10 | 
11 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_DIALECT_TRITONAMDGPU_UTILITY_COMMONUTILS_H_
12 | 


--------------------------------------------------------------------------------
/third_party/amd/include/TritonAMDGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # TableGen input for the pass declarations
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonAMDGPUToLLVM)
3 | add_public_tablegen_target(TritonAMDGPUConversionPassIncGen)
4 | 


--------------------------------------------------------------------------------
/third_party/amd/include/TritonAMDGPUToLLVM/PatternTritonAMDGPUToLLVM.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_
 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_
 3 | 
 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 5 | 
 6 | namespace mlir::triton::AMD {
 7 | // Pattern registration entry points for TritonAMDGPU dialect operations.
 8 | void populateExtractSliceOpToLLVMPatterns(
 9 |     mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
10 |     mlir::PatternBenefit benefit);
11 | // Per its name this one targets TritonGPU (TTG); it takes no type converter.
12 | void populateInThreadTransposeOpToTTGPatterns(mlir::RewritePatternSet &patterns,
13 |                                               mlir::PatternBenefit benefit);
14 | void populateConcatOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter,
15 |                                     mlir::RewritePatternSet &patterns,
16 |                                     mlir::PatternBenefit benefit);
17 | 
18 | } // namespace mlir::triton::AMD
19 | 
20 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_
21 | 


--------------------------------------------------------------------------------
/third_party/amd/include/TritonAMDGPUToLLVM/TargetUtils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_
 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_
 3 | 
 4 | #include "llvm/ADT/StringRef.h"
 5 | 
 6 | namespace mlir::triton::AMD {
 7 | 
 8 | // A list of ISA families we care about.
 9 | enum class ISAFamily {
10 |   Unknown,
11 |   CDNA1,
12 |   CDNA2,
13 |   CDNA3,
14 |   CDNA4,
15 |   RDNA1,
16 |   RDNA2,
17 |   RDNA3,
18 | };
19 | 
20 | // Deduces the corresponding ISA family for the given target gfx |arch|.
21 | ISAFamily deduceISAFamily(llvm::StringRef arch);
22 | 
23 | // Returns true if the given architecture supports the V_DOT instruction.
24 | bool supportsVDot(llvm::StringRef arch);
25 | 
26 | bool isCDNA(ISAFamily isaFamily);
27 | 
28 | bool isRDNA(ISAFamily isaFamily);
29 | 
30 | // Here is a partial definition of DppCtrl enums. For the complete definition,
31 | // please check:
32 | // https://github.com/llvm/llvm-project/blob/8c75290/llvm/lib/Target/AMDGPU/SIDefines.h#L939
33 | enum class DppCtrl : uint32_t {
34 |   QUAD_PERM_FIRST = 0,
35 |   ROW_SHL0 = 0x100,
36 |   ROW_SHR0 = 0x110,
37 |   BCAST15 = 0x142,
38 |   BCAST31 = 0x143
39 | };
40 | 
41 | } // namespace mlir::triton::AMD
42 | 
43 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_TARGETUTILS_H_
44 | 


--------------------------------------------------------------------------------
/third_party/amd/include/TritonAMDGPUTransforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # TableGen input for the pass declarations
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonAMDGPU)
3 | add_public_tablegen_target(TritonAMDGPUTransformsIncGen)  # target other libraries DEPEND on
4 | 


--------------------------------------------------------------------------------
/third_party/amd/include/TritonAMDGPUTransforms/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_PASSES_H_
 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_PASSES_H_
 3 | 
 4 | #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 5 | #include "mlir/Pass/Pass.h"
 6 | #include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 7 | 
 8 | namespace mlir {
 9 | 
10 | // Generate the pass class declarations.
11 | #define GEN_PASS_DECL
12 | #include "TritonAMDGPUTransforms/Passes.h.inc"
13 | 
14 | /// Generate the code for registering passes.
15 | #define GEN_PASS_REGISTRATION
16 | #include "TritonAMDGPUTransforms/Passes.h.inc"
17 | 
18 | } // namespace mlir
19 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_PASSES_H_
20 | 


--------------------------------------------------------------------------------
/third_party/amd/include/Utils/Utility.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
 2 | #define TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
 3 | 
 4 | #include "llvm/ADT/ArrayRef.h"
 5 | #include <cassert>
 6 | #include <vector>
 7 | namespace mlir::LLVM::AMD {
 8 | // Applies `op` to each pair (lhs[i], rhs[i]) after casting both to unsigned.
 9 | template <typename T, typename U, typename BinaryOp>
10 | std::vector<unsigned>
11 | multiDimElementwise(const llvm::ArrayRef<T> &lhs,
12 |                     const llvm::ArrayRef<U> &rhs, BinaryOp op) {
13 |   assert(lhs.size() == rhs.size() && "Input dimensions must match");
14 |   // Pre-size the result so each element can be assigned in place.
15 |   std::vector<unsigned> result(lhs.size());
16 |   for (size_t i = 0, n = lhs.size(); i < n; ++i) {
17 |     unsigned a = static_cast<unsigned>(lhs[i]);
18 |     unsigned b = static_cast<unsigned>(rhs[i]);
19 |     result[i] = op(a, b);
20 |   }
21 |   return result;
22 | } // `llvm::` is spelled out because this header only includes LLVM's ArrayRef.
23 | #endif // TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
24 | 


--------------------------------------------------------------------------------
/third_party/amd/language/hip/__init__.py:
--------------------------------------------------------------------------------
1 | from . import libdevice
2 | 
3 | from .utils import memrealtime
4 | 
5 | __all__ = ["libdevice", "memrealtime"]
6 | 


--------------------------------------------------------------------------------
/third_party/amd/language/hip/utils.py:
--------------------------------------------------------------------------------
 1 | from triton.language import core
 2 | 
 3 | 
 4 | @core.extern
 5 | def memrealtime(_semantic=None):
 6 |     """
 7 |     Returns a 64-bit real time-counter value
 8 |     """
 9 |     target_arch = _semantic.builder.options.arch
10 |     if 'gfx11' in target_arch or 'gfx12' in target_arch:
11 |         return core.inline_asm_elementwise(
12 |             """
13 |             s_sendmsg_rtn_b64 $0, sendmsg(MSG_RTN_GET_REALTIME)
14 |             s_waitcnt lgkmcnt(0)
15 |             """,
16 |             "=r",
17 |             [],
18 |             dtype=core.int64,
19 |             is_pure=False,
20 |             pack=1,
21 |             _semantic=_semantic,
22 |         )
23 |     else:
24 |         return core.inline_asm_elementwise(
25 |             """
26 |             s_memrealtime $0
27 |             s_waitcnt vmcnt(0)
28 |             """,
29 |             "=r",
30 |             [],
31 |             dtype=core.int64,
32 |             is_pure=False,
33 |             pack=1,
34 |             _semantic=_semantic,
35 |         )
36 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Analysis/AxisInfoExt.cpp:
--------------------------------------------------------------------------------
 1 | #include "third_party/amd/include/Analysis/AxisInfoExt.h"
 2 | #include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 3 | 
 4 | namespace mlir::triton::AMD {
 5 | 
 6 | namespace {
 7 | // Treats OpTy like a cast: its AxisInfo is that of the first operand.
 8 | template <typename OpTy> class CastOpAxisInfoVisitor : public AxisInfoVisitor {
 9 | public:
10 |   using AxisInfoVisitor::AxisInfoVisitor;
11 |   AxisInfo
12 |   getAxisInfo(Operation *op,
13 |               ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) final {
14 |     return operands[0]->getValue();
15 |   }
16 |   // Applies only to operations of type OpTy.
17 |   bool match(Operation *op) final { return isa<OpTy>(op); }
18 | };
19 | } // namespace
20 | 
21 | // Registers the AMD visitors; amdgpu::ExtractSliceOp uses the cast visitor.
22 | void AxisInfoExt::addVisitors(mlir::triton::AxisInfoVisitorList &visitors) {
23 |   visitors.append<CastOpAxisInfoVisitor<amdgpu::ExtractSliceOp>>();
24 | }
25 | } // namespace mlir::triton::AMD
26 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAMDAnalysis
 2 |   RangeAnalysis.cpp
 3 |   AxisInfoExt.cpp
 4 |   AMDGPUAllocation.cpp
 5 | 
 6 |   DEPENDS
 7 |   TritonTableGen
 8 |   # AxisInfoExt.cpp includes the TritonAMDGPU dialect header, so that dialect's
 9 |   # generated sources must be built before this library compiles.
10 |   TritonAMDGPUTableGen
11 |   TritonAMDGPUAttrDefsIncGen
12 | 
13 |   LINK_LIBS PUBLIC
14 |   MLIRAnalysis
15 |   MLIRLLVMDialect
16 |   TritonIR
17 |   TritonGPUIR
18 | )
19 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Analysis)  # library TritonAMDAnalysis
2 | add_subdirectory(Dialect)  # libraries TritonAMDGPUIR and TritonAMDUtils
3 | add_subdirectory(TritonAMDGPUToLLVM)  # library TritonAMDGPUToLLVM
4 | add_subdirectory(TritonAMDGPUDialectToLLVM)  # library TritonAMDGPUDialectToLLVM
5 | add_subdirectory(TritonAMDGPUTransforms)  # library TritonAMDGPUTransforms
6 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonAMDGPU)  # the dialect's IR and Utility libraries
2 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Dialect/TritonAMDGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)  # library TritonAMDGPUIR
2 | add_subdirectory(Utility)  # library TritonAMDUtils
3 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Dialect/TritonAMDGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAMDGPUIR
 2 |   Dialect.cpp
 3 |   # TableGen outputs that must be generated before this library builds.
 4 |   DEPENDS
 5 |   TritonAMDGPUTableGen
 6 |   TritonAMDGPUAttrDefsIncGen
 7 | 
 8 |   LINK_LIBS PUBLIC
 9 |   MLIRLLVMDialect
10 |   TritonIR
11 |   TritonGPUIR
12 | )
13 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Dialect/TritonAMDGPU/Utility/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_triton_library(TritonAMDUtils
2 |   CommonUtils.cpp
3 |   # Linked PUBLIC, so targets linking TritonAMDUtils inherit these libraries.
4 |   LINK_LIBS PUBLIC
5 |   MLIRLLVMDialect
6 |   TritonIR
7 |   TritonGPUIR
8 | )
9 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/Dialect/TritonAMDGPU/Utility/CommonUtils.cpp:
--------------------------------------------------------------------------------
 1 | #include "third_party/amd/include/Dialect/TritonAMDGPU/Utility/CommonUtils.h"
 2 | 
 3 | namespace mlir::triton::AMD {
 4 | // Returns the scf.for ops in `funcOp` that contain no nested scf.for, in walk
 5 | // order.
 6 | SmallVector<scf::ForOp> getLeafForOps(triton::FuncOp funcOp) {
 7 |   SmallVector<scf::ForOp> leafOps;
 8 |   funcOp->walk([&](scf::ForOp forOp) {
 9 |     // The inner walk is interrupted as soon as a nested loop is found.
10 |     auto nested = forOp.getBody()->walk(
11 |         [](scf::ForOp) { return WalkResult::interrupt(); });
12 |     if (!nested.wasInterrupted())
13 |       leafOps.push_back(forOp);
14 |   });
15 |   return leafOps;
16 | }
17 | } // namespace mlir::triton::AMD
18 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUDialectToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAMDGPUDialectToLLVM
 2 |     TritonAMDGPUToLLVMPatterns.cpp
 3 |     ExtractSliceOpToLLVM.cpp
 4 |     InThreadTransposeOpToTTG.cpp
 5 |     ConcatOpToLLVM.cpp
 6 |     Utility.cpp
 7 |     # NOTE(review): DEPENDS only orders the build; no LINK_LIBS are listed here.
 8 |     DEPENDS
 9 |     TritonAMDGPUIR
10 | )
11 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUDialectToLLVM/InThreadTransposeOpToTTG.cpp:
--------------------------------------------------------------------------------
 1 | #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 2 | #include "triton/Conversion/MLIRTypes.h"
 3 | 
 4 | using namespace mlir;
 5 | using namespace mlir::triton;
 6 | namespace ttg = mlir::triton::gpu;
 7 | 
 8 | namespace {
 9 | // Rewrites amdgpu::InThreadTransposeOp into a ttg::ConvertLayoutOp.
10 | struct InThreadTransposeOpConversion
11 |     : public OpConversionPattern<triton::amdgpu::InThreadTransposeOp> {
12 | public:
13 |   explicit InThreadTransposeOpConversion(MLIRContext *ctx,
14 |                                          PatternBenefit benefit)
15 |       : OpConversionPattern(ctx, benefit) {}
16 |   // Uses the op's own result type and source; the adaptor is not consulted.
17 |   LogicalResult
18 |   matchAndRewrite(triton::amdgpu::InThreadTransposeOp op, OpAdaptor adaptor,
19 |                   ConversionPatternRewriter &rewriter) const override {
20 |     rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(op, op.getType(),
21 |                                                       op.getSrc());
22 |     return success();
23 |   }
24 | };
25 | 
26 | } // namespace
27 | 
28 | namespace mlir::triton::AMD {
29 | // Adds InThreadTransposeOpConversion to `patterns` with the given benefit.
30 | void populateInThreadTransposeOpToTTGPatterns(RewritePatternSet &patterns,
31 |                                               PatternBenefit benefit) {
32 |   patterns.add<InThreadTransposeOpConversion>(patterns.getContext(), benefit);
33 | }
34 | 
35 | } // namespace mlir::triton::AMD
36 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUDialectToLLVM/TritonAMDGPUToLLVMPatterns.cpp:
--------------------------------------------------------------------------------
 1 | #include "third_party/amd/include/TritonAMDGPUToLLVM/PatternTritonAMDGPUToLLVM.h"
 2 | #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 3 | 
 4 | namespace mlir::triton::AMD {
 5 | void populateTritonAMDGPUToLLVMPatterns(LLVMTypeConverter &typeConverter,
 6 |                                         RewritePatternSet &patterns,
 7 |                                         PatternBenefit benefit) {
 8 |   populateExtractSliceOpToLLVMPatterns(typeConverter, patterns, benefit);
 9 |   populateInThreadTransposeOpToTTGPatterns(patterns, benefit);
10 |   populateConcatOpToLLVMPatterns(typeConverter, patterns, benefit);
11 | }
12 | } // namespace mlir::triton::AMD
13 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUDialectToLLVM/Utility.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUDIALECTTOLLVM_UTILITY_H_
 2 | #define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUDIALECTTOLLVM_UTILITY_H_
 3 | 
 4 | #include "triton/Tools/LinearLayout.h"
 5 | 
 6 | namespace mlir::LLVM::AMD {
 7 | 
 8 | // Determine the order in which CTA tiles are laid out across the tensor.
 9 | // That is, create vector of dimensions from fastest to slowest varying.
10 | SmallVector<unsigned> getCTATileOrder(MLIRContext *ctx,
11 |                                       const mlir::triton::LinearLayout &layout);
12 | 
13 | } // namespace mlir::LLVM::AMD
14 | #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUDIALECTTOLLVM_UTILITY_H_
15 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUToLLVM/AllocateSharedMemory.cpp:
--------------------------------------------------------------------------------
 1 | #include "Analysis/AMDGPUAllocation.h"
 2 | #include "TritonAMDGPUToLLVM/Passes.h"
 3 | #include "triton/Analysis/Allocation.h"
 4 | #include "triton/Analysis/Utility.h"
 5 | #include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
 6 | 
 7 | using namespace mlir;
 8 | using namespace mlir::triton;
 9 | using namespace mlir::triton::AMD;
10 | 
11 | namespace mlir::triton {
12 | #define GEN_PASS_DEF_ALLOCATEAMDGPUSHAREDMEMORY
13 | #include "TritonAMDGPUToLLVM/Passes.h.inc"
14 | } // namespace mlir::triton
15 | 
16 | namespace {
17 | // Runs ModuleAllocation with the AMD scratch-size callback on the module.
18 | struct AllocateAMDGPUSharedMemory
19 |     : public mlir::triton::impl::AllocateAMDGPUSharedMemoryBase<
20 |           AllocateAMDGPUSharedMemory> {
21 |   void runOnOperation() override {
22 |     ModuleOp mod = getOperation();
23 |     ModuleAllocation allocation(mod, AMDAllocationAnalysisScratchSizeFn);
24 |     // Per its name, records the allocation sizes/offsets as attributes.
25 |     mlir::triton::gpu::attachAllocationSizeAndOffsetAttr(mod, allocation);
26 |   }
27 | };
28 | 
29 | } // namespace
30 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAMDGPUToLLVM
 2 |     AsyncUtility.cpp
 3 |     AtomicRMWOpsEmitter.cpp
 4 |     AllocateSharedMemory.cpp
 5 |     BufferOpsEmitter.cpp
 6 |     ConvertLayoutOpToLLVM/SharedToDotOperandHelper.cpp
 7 |     ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp
 8 |     ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp
 9 |     ConvertLayoutOpToLLVM.cpp
10 |     MemoryOpToLLVM.cpp
11 |     DotOpToLLVM/FMA.cpp
12 |     DotOpToLLVM/MFMA.cpp
13 |     DotOpToLLVM/WMMA.cpp
14 |     DotOpToLLVM.cpp
15 |     ElementwiseOpToLLVM.cpp
16 |     LoadStoreOpToLLVM.cpp
17 |     GCNAsmFormat.cpp
18 |     TritonGPUToLLVM.cpp
19 |     BuiltinFuncToLLVM.cpp
20 |     Utility.cpp
21 |     TargetInfo.cpp
22 |     TargetUtils.cpp
23 |     OptimizeLDSUsage.cpp
24 |     OptimizeLDSUtility.cpp
25 |     SPMDOpToLLVM.cpp
26 |     SchedInstructions.cpp
27 |     UpcastMXFPToLLVM.cpp
28 |     MembarUtility.cpp
29 |     ScalarizePackedFOps.cpp
30 |     # Targets that must be built before these sources compile.
31 |     DEPENDS
32 |     TritonAMDGPUConversionPassIncGen
33 |     LLVMIRIncGen
34 |     # Linked PUBLIC, so targets linking this library inherit them.
35 |     LINK_LIBS PUBLIC
36 |     TritonGPUToLLVM
37 |     TritonAMDGPUIR
38 |     TritonProtonToLLVM
39 |     LLVMCore
40 |     LLVMPasses
41 |     LLVMSupport
42 | )
43 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUToLLVM/MembarUtility.cpp:
--------------------------------------------------------------------------------
 1 | #include "TritonAMDGPUToLLVM/MembarUtility.h"
 2 | #include "AsyncUtility.h"
 3 | #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 4 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 5 | 
 6 | namespace mlir::triton::AMD {
 7 | namespace {
 8 | // Returns true if one op is an async load and the other is a LocalLoad that
 9 | // is synced via AsyncWait.
10 | bool filterAsyncLocalLoadsDependencies(Operation *op1, Operation *op2) {
11 |   auto isAsyncLoad = [](Operation *op) {
12 |     return llvm::isa<triton::gpu::AsyncCopyGlobalToLocalOp,
13 |                      triton::amdgpu::BufferLoadToLocalOp>(op);
14 |   };
15 |   auto isLocalLoadWithAsyncWaitToken = [](Operation *op) {
16 |     auto localLoad = llvm::dyn_cast<triton::gpu::LocalLoadOp>(op);
17 |     return localLoad && isSyncedViaAsyncWait(localLoad);
18 |   };
19 | 
20 |   // Early return if neither or both operands are an AsyncLoad
21 |   if (isAsyncLoad(op1) == isAsyncLoad(op2))
22 |     return false;
23 | 
24 |   return isLocalLoadWithAsyncWaitToken(op1) ||
25 |          isLocalLoadWithAsyncWaitToken(op2);
26 | }
27 | } // namespace
28 | 
29 | bool membarFilter(Operation *op1, Operation *op2) {
30 |   return filterAsyncLocalLoadsDependencies(op1, op2);
31 | }
32 | } // namespace mlir::triton::AMD
33 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_
 2 | #define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_
 3 | 
 4 | #include "mlir/IR/Types.h"
 5 | #include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 6 | #include "triton/Dialect/Triton/IR/Dialect.h"
 7 | #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 8 | 
 9 | // The following functions are used to collect and set side-channel information
10 | // during the LLVM conversion/lowering to facilitate instruction scheduling
11 | // controls.
12 | namespace mlir::triton {
13 | triton::DotOp getSingleDotOpIfExists(scf::ForOp forOp);
14 | } // namespace mlir::triton
15 | 
16 | #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_SCHEDINSTRUCTIONS_H_
17 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonAMDGPUTransforms
 2 |   AccelerateAMDMatmul.cpp
 3 |   BlockPingpong.cpp
 4 |   CanonicalizePointers.cpp
 5 |   CoalesceAsyncCopy.cpp
 6 |   ConvertToBufferOps.cpp
 7 |   OptimizeEpilogue.cpp
 8 |   HoistLayoutConversions.cpp
 9 |   ReorderInstructions.cpp
10 |   StreamPipeline.cpp
11 |   MfmaGroup.cpp
12 |   WmmaGroup.cpp
13 |   InThreadTranspose.cpp
14 |   FoldTrueCmpIOp.cpp
15 |   UpdateAsyncWaitCount.cpp
16 |   Utility.cpp
17 |   # NOTE(review): libraries are listed under DEPENDS; no LINK_LIBS are given.
18 |   DEPENDS
19 |   TritonAMDGPUIR
20 |   TritonAMDGPUTransformsIncGen
21 |   TritonGPUIR
22 |   TritonAMDUtils
23 |   TritonAMDAnalysis
24 | )
25 | # Expose the AMD include trees (source-tree and build-tree) to dependents.
26 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
27 | target_include_directories(TritonAMDGPUTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include)
28 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUTransforms/FoldTrueCmpIOp.cpp:
--------------------------------------------------------------------------------
 1 | #include "TritonAMDGPUTransforms/Passes.h"
 2 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 3 | #include "third_party/amd/include/Analysis/RangeAnalysis.h"
 4 | #include "triton/Analysis/Utility.h"
 5 | 
 6 | using namespace mlir::triton;
 7 | 
 8 | namespace mlir {
 9 | 
10 | #define GEN_PASS_DEF_TRITONAMDFOLDTRUECMPI
11 | #include "TritonAMDGPUTransforms/Passes.h.inc"
12 | 
13 | // Runs the integer range analysis, then greedily applies the fold patterns.
14 | struct TritonAMDFoldTrueCmpIOpPass
15 |     : impl::TritonAMDFoldTrueCmpIBase<TritonAMDFoldTrueCmpIOpPass> {
16 |   void runOnOperation() override {
17 |     ModuleOp mod = getOperation();
18 |     DenseMap<Value, SetVector<Operation *>> assumptions =
19 |         AMD::TritonIntegerRangeAnalysis::collectAssumptions(mod);
20 |     std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
21 |     auto *rangeAnalysis =
22 |         solver->load<AMD::TritonIntegerRangeAnalysis>(assumptions);
23 |     AMD::initializeFuncOps(mod, rangeAnalysis);
24 |     if (failed(solver->initializeAndRun(mod)))
25 |       return signalPassFailure();
26 |     // The greedy driver's LogicalResult is deliberately ignored below.
27 |     RewritePatternSet patterns(&getContext());
28 |     AMD::populateFoldTrueCmpIOpPatterns(patterns, solver.get());
29 |     (void)applyPatternsGreedily(mod, std::move(patterns));
30 |   }
31 | };
32 | 
33 | } // namespace mlir
34 | 


--------------------------------------------------------------------------------
/third_party/amd/lib/TritonAMDGPUTransforms/Utility.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTRANSFORMS_UTILITY_H_
 2 | #define TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTRANSFORMS_UTILITY_H_
 3 | 
 4 | #include "mlir/IR/Operation.h"
 5 | #include "mlir/IR/Value.h"
 6 | 
 7 | using namespace mlir;
 8 | 
 9 | // DFS the def chain of 'defValue' starting from 'consumerOp' and return the
10 | // minimum found when accumulating countFunc(op) for all non-control-flow ops
11 | // between 'defValue' and the consumer. This function traverses through for-loop
12 | // iterations and to the outside of the loop to find all its producers.
13 | //    countFunc(Operation*) should return the value to accumulate for the
14 | //    operation.
15 | // Returns 0 if there is an error traversing the def chain.
16 | int deduceMinCountOnDefChain(Value defValue, Operation *consumerOp,
17 |                              llvm::function_ref<int(Operation *)> countFunc);
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/third_party/amd/python/test/address_sanitizer_helper.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | 
 5 | size = 4096  # element count of each 1-D tensor
 6 | x = torch.rand(size, device='cuda')
 7 | y = torch.rand(size, device='cuda')
 8 | output = torch.empty_like(x)
 9 | n_elements = output.numel()  # same as size
10 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
11 | 
12 | 
13 | @triton.jit
14 | def add_kernel(
15 |     x_ptr,
16 |     y_ptr,
17 |     output_ptr,
18 |     n_elements,  # unused: no bounds mask is applied below
19 |     BLOCK_SIZE: tl.constexpr,
20 | ):
21 |     pid = tl.program_id(axis=0)
22 |     block_start = pid * BLOCK_SIZE
23 |     # Deliberate +1 shift (and no mask) so accesses run out of bounds for ASAN
24 |     offsets = block_start + tl.arange(0, BLOCK_SIZE) + 1
25 |     x = tl.load(x_ptr + offsets)
26 |     y = tl.load(y_ptr + offsets)
27 |     output = x + y
28 |     tl.store(output_ptr + offsets, output)
29 | 
30 | 
31 | pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)  # launch; keep the returned handle
32 | amdgcn = pgm.asm['amdgcn']  # 'amdgcn' entry of the handle's asm mapping
33 | print(amdgcn)
34 | 


--------------------------------------------------------------------------------
/third_party/amd/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(lib)  # test-only libraries (e.g. lib/Analysis)
2 | 


--------------------------------------------------------------------------------
/third_party/amd/test/lib/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_mlir_library(TritonAMDGPUTestAnalysis
 2 |   TestAMDRangeAnalysis.cpp
 3 |   TestAMDGPUMembar.cpp
 4 |   TestAxisInfo.cpp
 5 |   # TableGen targets that must be built before these test sources compile.
 6 |   DEPENDS
 7 |   TritonTableGen
 8 |   TritonGPUTableGen
 9 |   TritonGPUAttrDefsIncGen
10 |   TritonGPUTypeInterfacesIncGen
11 | 
12 |   LINK_LIBS PUBLIC
13 |   MLIRPass
14 |   ${triton_libs}
15 | )
16 | 


--------------------------------------------------------------------------------
/third_party/amd/test/lib/Analysis/TestAMDGPUMembar.cpp:
--------------------------------------------------------------------------------
 1 | #include "TritonAMDGPUToLLVM/MembarUtility.h"
 2 | #include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
 3 | #include "mlir/Pass/Pass.h"
 4 | #include "mlir/Transforms/DialectConversion.h"
 5 | #include "triton/Analysis/Allocation.h"
 6 | #include "triton/Analysis/Membar.h"
 7 | 
 8 | using namespace mlir;
 9 | 
10 | namespace {
11 | 
12 | struct TestAMDGPUMembarPass
13 |     : public PassWrapper<TestAMDGPUMembarPass, OperationPass<ModuleOp>> {
14 |   // Module-level test pass that drives the membar analysis with the AMD filter.
15 |   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAMDGPUMembarPass);
16 |   // Command-line argument and help text under which the pass is exposed.
17 |   StringRef getArgument() const final { return "test-tritonamdgpu-membar"; }
18 |   StringRef getDescription() const final {
19 |     return "print the result of the membar analysis as run in the amdgpu "
20 |            "backend";
21 |   }
22 |   // Annotates local loads, builds the module allocation, then runs the analysis.
23 |   void runOnOperation() override {
24 |     ModuleOp moduleOp = getOperation();
25 |     triton::AMD::annotateLocalLoadsSyncedViaAsyncWait(moduleOp);
26 |     // Run the membar analysis over the whole module; nothing is printed here.
27 |     ModuleAllocation allocation(moduleOp);
28 |     ModuleMembarAnalysis membarPass(&allocation, triton::AMD::membarFilter);
29 |     membarPass.run();
30 |   }
31 | };
32 | 
33 | } // namespace
34 | 
35 | namespace mlir::test {
36 | void registerTestAMDGPUMembarPass() { // registers TestAMDGPUMembarPass
37 |   PassRegistration<TestAMDGPUMembarPass>();
38 | }
39 | } // namespace mlir::test
40 | 


--------------------------------------------------------------------------------
/third_party/amd/test/lib/Analysis/TestAxisInfo.cpp:
--------------------------------------------------------------------------------
 1 | #include "test/include/Analysis/TestAxisInfo.h"
 2 | #include "third_party/amd/include/Analysis/AxisInfoExt.h"
 3 | 
 4 | namespace {
 5 | 
 6 | struct AMDTestAxisInfoPass : public mlir::test::TestAxisInfoPass {
 7 |   // Variant of the axis-info test pass backed by AMD::ModuleAxisInfoAnalysis.
 8 |   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AMDTestAxisInfoPass);
 9 |   // Command-line argument that selects this pass.
10 |   StringRef getArgument() const final { return "test-print-amd-alignment"; }
11 |   // Hook overridden to hand the base pass the AMD-specific analysis.
12 | protected:
13 |   ModuleAxisInfoAnalysis getAnalysis(ModuleOp moduleOp) const final {
14 |     return AMD::ModuleAxisInfoAnalysis(moduleOp);
15 |   }
16 | };
17 | } // namespace
18 | 
19 | namespace mlir::test { // registration entry point for AMDTestAxisInfoPass
20 | void registerAMDTestAlignmentPass() { PassRegistration<AMDTestAxisInfoPass>(); }
21 | } // namespace mlir::test
22 | 


--------------------------------------------------------------------------------
/third_party/amd/test/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Analysis)
2 | 


--------------------------------------------------------------------------------
/third_party/amd/tools/hip/compile.h:
--------------------------------------------------------------------------------
 1 | // SPDX-License-Identifier: MIT
 2 | // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <hip/hip_runtime.h>
 7 | #include <inttypes.h>
 8 | #include <stdint.h>
 9 | #include <stdio.h>
10 | 
11 | void unload_{kernel_name}(void); // brace-delimited names are template placeholders
12 | void load_{kernel_name}(void);
13 | hipError_t{_placeholder} {kernel_name}(hipStream_t stream, {signature}); // kernel entry point; the stream comes first
14 | 


--------------------------------------------------------------------------------
/third_party/amd/unittest/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Conversion)
2 | 


--------------------------------------------------------------------------------
/third_party/amd/unittest/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_ut(  # helper presumably provided by cmake/AddTritonUnitTest.cmake
 2 |   NAME TestOptimizeLDS
 3 |   SRCS OptimizeLDSTest.cpp
 4 |   LIBS  # libraries passed to the helper for this test
 5 |     TritonAnalysis
 6 |     TritonIR
 7 |     TritonGPUIR
 8 |     TritonAMDGPUToLLVM
 9 |     MLIRUBToLLVM
10 |     TritonAMDUtils
11 |     TritonAMDAnalysis
12 |     TritonAMDGPUTransforms
13 |     TritonAMDGPUDialectToLLVM
14 | )
15 | 


--------------------------------------------------------------------------------
/third_party/f2reduce/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_triton_library(f2reduce  # library built from the single vendored source file
2 |   f2reduce.cpp
3 | )
4 | 


--------------------------------------------------------------------------------
/third_party/f2reduce/LICENCE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2023 Adam P. Goucher, Hatsya Limited
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/third_party/f2reduce/VERSION:
--------------------------------------------------------------------------------
1 | Cloned from https://gitlab.com/hatsya/open-source/f2reduce at revision
2 | 949b91d022c001bbce19157f806013d37f05fbf5.
3 | 


--------------------------------------------------------------------------------
/third_party/f2reduce/f2reduce.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <stdint.h>
 3 | 
 4 | // OpenAI change: Switched from `extern "C"` to `namespace f2reduce`.
 5 | namespace f2reduce {
 6 | 
 7 | /**
 8 |  * Converts a matrix over F_2 into row-reduced echelon form.
 9 |  *
10 |  * The matrix should be in row-major format. The stride parameter specifies
11 |  * the offset (in 64-bit words, *not* bytes!) between successive rows of the
12 |  * matrix, and should obey the inequality:
13 |  *
14 |  *     64 |stride| >= cols
15 |  *
16 |  * i.e. that the rows occupy disjoint regions of memory. For best performance
17 |  * the stride should be divisible by 16 words (128 bytes).
18 |  *
19 |  * We adopt 'little-endian' semantics: the element in row i and column j+64*k
20 |  * of the matrix (zero-indexed) is given by (matrix[i * stride + k] >> j) & 1.
21 |  *
22 |  * The matrix is overwritten in place with its row-reduced echelon form.
23 |  */
24 | void inplace_rref_strided(uint64_t *matrix, uint64_t rows, uint64_t cols, uint64_t stride);
25 | // Stride suggested for a matrix with `cols` columns (presumably in 64-bit words, like `stride` above; see f2reduce.cpp).
26 | uint64_t get_recommended_stride(uint64_t cols);
27 | 
28 | }  // namespace f2reduce
29 | 


--------------------------------------------------------------------------------
/third_party/nvidia/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Expose this backend's source-tree and build-tree include directories to the
 2 | # targets declared in the subdirectories below.
 3 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 4 | include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 5 | add_subdirectory(include)
 6 | add_subdirectory(lib)
 7 | 
 8 | # The TritonNVIDIA plugin is only built when the Python module is requested.
 9 | if(TRITON_BUILD_PYTHON_MODULE)
10 |   add_triton_plugin(TritonNVIDIA
11 |     ${CMAKE_CURRENT_SOURCE_DIR}/triton_nvidia.cc
12 |     LINK_LIBS
13 |       TritonNVIDIAGPUToLLVM
14 |       NVGPUToLLVM
15 |   )
16 |   target_link_libraries(TritonNVIDIA PRIVATE Python3::Module pybind11::headers)
17 | endif()
18 | 
19 | # Unit tests are optional.
20 | if(TRITON_BUILD_UT)
21 |   add_subdirectory(unittest)
22 | endif()
23 | 
24 | add_subdirectory(hopper)
25 | 


--------------------------------------------------------------------------------
/third_party/nvidia/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/nvidia/backend/__init__.py


--------------------------------------------------------------------------------
/third_party/nvidia/backend/lib/libdevice.10.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-lang/triton/299b3bb9cc214d5ac0f685088aa13790e94b22df/third_party/nvidia/backend/lib/libdevice.10.bc


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(include)
2 | add_subdirectory(lib)
3 | 


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/include/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Transforms)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/include/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # tablegen input for the command below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name NVHopperTransforms)  # emits the pass declarations into Passes.h.inc
3 | add_public_tablegen_target(NVHopperTransformsIncGen)  # target the NVHopperTransforms library lists under DEPENDS
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/include/Transforms/Passes.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef DIALECT_NV_TRANSFORMS_PASSES_H_
 3 | #define DIALECT_NV_TRANSFORMS_PASSES_H_
 4 | 
 5 | #include "mlir/Pass/Pass.h"
 6 | 
 7 | namespace mlir {
 8 | // The same generated file is included twice; the macro defined before each include selects the section.
 9 | // Generate the pass class declarations.
10 | #define GEN_PASS_DECL
11 | #include "nvidia/hopper/include/Transforms/Passes.h.inc"
12 | 
13 | /// Generate the code for registering passes.
14 | #define GEN_PASS_REGISTRATION
15 | #include "nvidia/hopper/include/Transforms/Passes.h.inc"
16 | 
17 | } // namespace mlir
18 | #endif // DIALECT_NV_TRANSFORMS_PASSES_H_
19 | 


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Transforms)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(NVHopperTransforms  # warp-specialization sources follow
 2 |   WarpSpecialization.cpp
 3 |   WarpSpecialization/CodePartitionUtility.cpp
 4 |   WarpSpecialization/TaskIdPropagation.cpp
 5 |   WarpSpecialization/Utility.cpp
 6 |   WarpSpecialization/WSBuffer.cpp
 7 |   WarpSpecialization/WSCodePartition.cpp
 8 |   WarpSpecialization/WSDataPartition.cpp
 9 |   WarpSpecialization/WSLowerMem.cpp
10 |   WarpSpecialization/WSLowerToken.cpp
11 |   WarpSpecialization/WSSpecialize.cpp
12 |   WarpSpecialization/WSTaskIdPropagate.cpp
13 |   WarpSpecialization/WSTaskPartition.cpp
14 |   # Tablegen target that produces Passes.h.inc for this library.
15 |   DEPENDS
16 |   NVHopperTransformsIncGen
17 |   # Libraries linked publicly.
18 |   LINK_LIBS PUBLIC
19 |   TritonIR
20 |   TritonGPUIR
21 |   MLIRTransformUtils
22 | )
23 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Dialect)
2 | add_subdirectory(TritonNVIDIAGPUToLLVM)
3 | add_subdirectory(NVGPUToLLVM)
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(NVGPU)
2 | add_subdirectory(NVWS)
3 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/NVGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/NVGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})  # NOTE(review): presumably read by add_mlir_doc for its output location - confirm
 2 | # Dialect, op, enum and LLVM-IR-conversion files generated from NVGPUOps.td.
 3 | set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
 6 | mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
 7 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 8 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 9 | mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
10 | mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
11 | add_mlir_doc(NVGPUDialect NVGPUDialect dialects/ -gen-dialect-doc)
12 | add_mlir_doc(NVGPUOps NVGPUOps dialects/ -gen-op-doc)
13 | add_public_tablegen_target(NVGPUTableGen)
14 | # Attribute definitions come from a second input file and get their own target.
15 | set(LLVM_TARGET_DEFINITIONS NVGPUAttrDefs.td)
16 | mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
17 | mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
18 | add_public_tablegen_target(NVGPUAttrDefsIncGen)
19 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/NVWS/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/NVWS/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})  # NOTE(review): presumably read by add_mlir_doc for its output location - confirm
 2 | # Dialect, op and type files generated from NVWSOps.td.
 3 | set(LLVM_TARGET_DEFINITIONS NVWSOps.td)
 4 | mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvws)
 5 | mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvws)
 6 | mlir_tablegen(Ops.h.inc -gen-op-decls)
 7 | mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 8 | mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=nvws)
 9 | mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=nvws)
10 | add_mlir_doc(NVWSDialect NVWSDialect dialects/ -gen-dialect-doc)
11 | add_mlir_doc(NVWSOps NVWSOps dialects/ -gen-op-doc)
12 | add_public_tablegen_target(NVWSTableGen)
13 | # Attribute and enum definitions come from a second input file and get their own target.
14 | set(LLVM_TARGET_DEFINITIONS NVWSAttrDefs.td)
15 | mlir_tablegen(NVWSAttrDefs.h.inc -gen-attrdef-decls)
16 | mlir_tablegen(NVWSAttrDefs.cpp.inc -gen-attrdef-defs)
17 | mlir_tablegen(NVWSAttrEnums.h.inc -gen-enum-decls)
18 | mlir_tablegen(NVWSAttrEnums.cpp.inc -gen-enum-defs)
19 | add_public_tablegen_target(NVWSAttrDefsIncGen)
20 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/Dialect/NVWS/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # tablegen input for the command below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name NVWSTransforms)  # emits the pass declarations into Passes.h.inc
3 | add_public_tablegen_target(NVWSTransformsIncGen)  # target the NVWSTransforms library lists under DEPENDS
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/NVGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # tablegen input for the command below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name NVGPUToLLVM)  # emits the pass declarations into Passes.h.inc
3 | add_public_tablegen_target(NVGPUConversionPassIncGen)  # target the NVGPUToLLVM library lists under DEPENDS
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H
 2 | #define TRITON_CONVERSION_NVGPU_TO_LLVM_PASS_H
 3 | 
 4 | #include <string>
 5 | #include <utility>
 6 | #include <vector>
 7 | 
 8 | #include "mlir/IR/Operation.h"
 9 | #include "mlir/IR/PatternMatch.h"
10 | #include "mlir/IR/Value.h"
11 | #include "mlir/Support/LogicalResult.h"
12 | 
13 | namespace mlir {
14 | // Forward declarations only; this header does not use them further.
15 | class ModuleOp;
16 | template <typename T> class OperationPass;
17 | 
18 | namespace triton {
19 | 
20 | namespace nvgpu {
21 | // Constraint strings, and (value, constraint-string) pairs for operands.
22 | using Constraints = std::vector<std::string>;
23 | using OperandsAndConstraints = std::vector<std::pair<Value, std::string>>;
24 | // Rewrites `op` using the PTX assembly in `ptxAsm` (inferred from the name; see NVGPUToLLVMPass.cpp). Both constraint lists default to empty.
25 | LogicalResult
26 | rewriteAsPtxAsm(mlir::Operation *op, mlir::PatternRewriter &rewriter,
27 |                 std::string ptxAsm,
28 |                 const OperandsAndConstraints &operandsAndConstraints = {},
29 |                 const Constraints &outputConstraints = {});
30 | 
31 | } // namespace nvgpu
32 | 
33 | } // namespace triton
34 | 
35 | } // namespace mlir
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/NVGPUToLLVM/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef NVGPU_CONVERSION_PASSES_H
 2 | #define NVGPU_CONVERSION_PASSES_H
 3 | 
 4 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 5 | #include "mlir/Pass/Pass.h"
 6 | #include "nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h"
 7 | 
 8 | namespace mlir {
 9 | namespace triton {
10 | // Pass declarations from the tablegen-generated Passes.h.inc.
11 | #define GEN_PASS_DECL
12 | #include "nvidia/include/NVGPUToLLVM/Passes.h.inc"
13 | // Pass registration code from the same generated file.
14 | #define GEN_PASS_REGISTRATION
15 | #include "nvidia/include/NVGPUToLLVM/Passes.h.inc"
16 | 
17 | } // namespace triton
18 | } // namespace mlir
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/NVGPUToLLVM/Passes.td:
--------------------------------------------------------------------------------
 1 | #ifndef NVGPU_CONVERSION_PASSES
 2 | #define NVGPU_CONVERSION_PASSES
 3 | 
 4 | include "mlir/Pass/PassBase.td"
 5 | 
 6 | def ConvertNVGPUToLLVM : Pass<"convert-nv-gpu-to-llvm", "mlir::ModuleOp"> {
 7 |     let summary = "Convert NVGPU to LLVM";
 8 |     let description = [{
 9 | 
10 |     }];
11 |     // Dialects this pass declares as dependencies.
12 |     let dependentDialects = ["mlir::arith::ArithDialect",
13 |                              "mlir::LLVM::LLVMDialect",
14 |                              "mlir::NVVM::NVVMDialect",
15 |                              "mlir::triton::nvgpu::NVGPUDialect"];
16 | }
17 | 
18 | #endif // NVGPU_CONVERSION_PASSES
19 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | set(LLVM_TARGET_DEFINITIONS Passes.td)  # tablegen input for the command below
2 | mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonNVIDIAGPUToLLVM)  # emits the pass declarations into Passes.h.inc
3 | add_public_tablegen_target(TritonNVIDIAGPUConversionPassIncGen)  # target the TritonNVIDIAGPUToLLVM library lists under DEPENDS
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H
 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_PASSES_H
 3 | 
 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 5 | #include "mlir/Pass/Pass.h"
 6 | #include "mlir/Transforms/DialectConversion.h"
 7 | 
 8 | #include <memory>
 9 | 
10 | namespace mlir {
11 | // Forward declarations used by the factory signatures below.
12 | class ModuleOp;
13 | template <typename T> class OperationPass;
14 | 
15 | namespace triton {
16 | 
17 | #define GEN_PASS_DECL
18 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc"
19 | // Factory overloads: no arguments, a compute capability, or a compute capability plus PTX version.
20 | std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonGPUToLLVMPass();
21 | std::unique_ptr<OperationPass<ModuleOp>>
22 | createConvertTritonGPUToLLVMPass(int32_t computeCapability);
23 | std::unique_ptr<OperationPass<ModuleOp>>
24 | createConvertTritonGPUToLLVMPass(int32_t computeCapability, int32_t ptxVersion);
25 | // Pass registration code from the same generated file.
26 | #define GEN_PASS_REGISTRATION
27 | #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc"
28 | 
29 | } // namespace triton
30 | 
31 | } // namespace mlir
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Utility.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H
 2 | #define TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H
 3 | 
 4 | #include "mlir/IR/Operation.h"
 5 | 
 6 | namespace mlir::triton::NVIDIA {
 7 | 
 8 | /// Decide whether the barrier between `before` and `after` may be omitted.
 9 | /// A true result means no synchronization is needed even when both operations
10 | /// access the same shared memory.
11 | bool canSkipBarSync(Operation *before, Operation *after);
12 | 
13 | } // namespace mlir::triton::NVIDIA
14 | 
15 | #endif // TRITONGPU_CONVERSION_TRITONNVIDIAGPUTOLLVM_UTILITY_H
16 | 


--------------------------------------------------------------------------------
/third_party/nvidia/language/cuda/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import libdevice
 2 | 
 3 | from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80)
 4 | from .gdc import (gdc_launch_dependents, gdc_wait)
 5 | 
 6 | __all__ = [
 7 |     "libdevice",
 8 |     "globaltimer",
 9 |     "num_threads",
10 |     "num_warps",
11 |     "smid",
12 |     "convert_custom_float8_sm70",
13 |     "convert_custom_float8_sm80",
14 |     "gdc_launch_dependents",
15 |     "gdc_wait",
16 | ]
17 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Dialect)
2 | add_subdirectory(TritonNVIDIAGPUToLLVM)
3 | add_subdirectory(NVGPUToLLVM)
4 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(NVGPU)
2 | add_subdirectory(NVWS)
3 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/NVGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/NVGPU/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(NVGPUIR  # NVGPU dialect library
 2 |   Dialect.cpp
 3 |   # Tablegen targets that generate the dialect, op and attribute files.
 4 |   DEPENDS
 5 |   NVGPUTableGen
 6 |   NVGPUAttrDefsIncGen
 7 |   # Linked publicly.
 8 |   LINK_LIBS PUBLIC
 9 |   MLIRLLVMDialect
10 | )
11 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/NVWS/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(IR)
2 | add_subdirectory(Transforms)
3 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/NVWS/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(NVWSIR  # NVWS dialect library
 2 |   Dialect.cpp
 3 |   Ops.cpp
 4 |   # Tablegen targets that generate the dialect, op, type and attribute files.
 5 |   DEPENDS
 6 |   NVWSTableGen
 7 |   NVWSAttrDefsIncGen
 8 |   # Linked publicly.
 9 |   LINK_LIBS PUBLIC
10 |   TritonIR
11 |   TritonGPUIR
12 | )
13 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/Dialect/NVWS/Transforms/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(NVWSTransforms  # NVWS lowering passes
 2 |   LowerAref.cpp
 3 |   LowerWarpGroup.cpp
 4 |   # Tablegen target that produces Passes.h.inc for this library.
 5 |   DEPENDS
 6 |   NVWSTransformsIncGen
 7 |   # Linked publicly; NVWSIR is the dialect library these passes operate on.
 8 |   LINK_LIBS PUBLIC
 9 |   TritonIR
10 |   TritonGPUIR
11 |   TritonNvidiaGPUIR
12 |   NVWSIR
13 |   MLIRTransformUtils
14 | )
15 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/NVGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_triton_library(NVGPUToLLVM
2 |     NVGPUToLLVMPass.cpp
3 | 
4 |     DEPENDS
5 |     NVGPUConversionPassIncGen
6 |     NVGPUIR
7 | )
8 | 


--------------------------------------------------------------------------------
/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_triton_library(TritonNVIDIAGPUToLLVM  # NVIDIA lowering sources follow
 2 |     ConvertLayoutOpToLLVM.cpp
 3 |     ConvertWarpSpecializeToLLVM.cpp
 4 |     MemoryOpToLLVM.cpp
 5 |     DotOpToLLVM/MMAv2.cpp
 6 |     DotOpToLLVM/MMAv5.cpp
 7 |     DotOpToLLVM/WGMMA.cpp
 8 |     DotOpToLLVM.cpp
 9 |     ElementwiseOpToLLVM.cpp
10 |     LoadStoreOpToLLVM.cpp
11 |     BarrierOpToLLVM.cpp
12 |     TritonGPUToLLVM.cpp
13 |     TMAToLLVM.cpp
14 |     SPMDOpToLLVM.cpp
15 |     TensorMemoryToLLVM.cpp
16 |     TensorPtrOpsToLLVM.cpp
17 |     ClusterOpsToLLVM.cpp
18 |     PTXAsmFormat.cpp
19 |     Utility.cpp
20 |     Fp4ToFpOpToLLVM.cpp
21 |     TargetInfo.cpp
22 |     # Tablegen targets for the pass declarations and the NVGPU attribute definitions.
23 |     DEPENDS
24 |     TritonNVIDIAGPUConversionPassIncGen
25 |     NVGPUAttrDefsIncGen
26 |     # Libraries linked publicly.
27 |     LINK_LIBS PUBLIC
28 |     TritonGPUToLLVM
29 |     TritonProtonToLLVM
30 |     TritonInstrumentToLLVM
31 |     MLIRReconcileUnrealizedCasts
32 | )
33 | 


--------------------------------------------------------------------------------
/third_party/nvidia/tools/cuda/compile.h:
--------------------------------------------------------------------------------
 1 | #ifndef TT_KERNEL_INCLUDES
 2 | #define TT_KERNEL_INCLUDES
 3 | 
 4 | #include <cuda.h>
 5 | #include <inttypes.h>
 6 | #include <stdint.h>
 7 | #include <stdio.h>
 8 | // The guard covers only the includes above; the declarations below sit outside it.
 9 | #endif
10 | // Per-kernel declarations; brace-delimited names are template placeholders.
11 | void unload_{kernel_name}(void);
12 | void load_{kernel_name}(void);
13 | // tt-linker: {kernel_name}:{full_signature}:{algo_info}
14 | CUresult{_placeholder} {kernel_name}(CUstream stream, {signature});
15 | 


--------------------------------------------------------------------------------
/third_party/nvidia/unittest/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Conversion)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/unittest/Conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(TritonGPUToLLVM)
2 | 


--------------------------------------------------------------------------------
/third_party/nvidia/unittest/Conversion/TritonGPUToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_triton_ut(
2 |   NAME TestPtxAsmFormat
3 |   SRCS PTXAsmFormatTest.cpp
4 |   LIBS
5 |     TritonGPUToLLVM
6 |     TritonNVIDIAGPUToLLVM
7 |     NVGPUIR MLIRUBToLLVM
8 | )
9 | 


--------------------------------------------------------------------------------
/third_party/proton/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | proton.egg-info
3 | proton/_C/libproton.so
4 | 
5 | *.hatchet
6 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Library built from Proton.cpp through the add_proton_library helper.
2 | add_proton_library(Proton
3 |   Proton.cpp
4 | )
5 | 
6 | # Continue with the CMakeLists.txt under lib/.
7 | add_subdirectory(lib)
8 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Context/Python.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_CONTEXT_PYTHON_H_
 2 | #define PROTON_CONTEXT_PYTHON_H_
 3 | 
 4 | #include "Context.h"
 5 | 
 6 | namespace proton {
 7 | 
 8 | /// Unwind the Python stack and early return a list of contexts.
 9 | class PythonContextSource : public ContextSource {
10 | public:
11 |   PythonContextSource() = default;
12 |   // Overrides the depth query declared by ContextSource.
13 |   size_t getDepth() override;
14 |   // Private override that supplies the contexts for the base class.
15 | private:
16 |   std::vector<Context> getContextsImpl() override;
17 | };
18 | 
19 | } // namespace proton
20 | 
21 | #endif // PROTON_CONTEXT_PYTHON_H_
22 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Data/TraceData.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_DATA_TRACE_DATA_H_
 2 | #define PROTON_DATA_TRACE_DATA_H_
 3 | 
 4 | #include "Data.h"
 5 | 
 6 | namespace proton {
 7 | // Data subclass that overrides the op/metric recording, clear, scope enter/exit and dump hooks.
 8 | class TraceData : public Data {
 9 | public:
10 |   using Data::Data;
11 |   virtual ~TraceData() = default;
12 |   // Recording interface overridden from Data.
13 |   size_t addOp(size_t scopeId, const std::string &name) override;
14 | 
15 |   void addMetric(size_t scopeId, std::shared_ptr<Metric> metric) override;
16 | 
17 |   void
18 |   addMetrics(size_t scopeId,
19 |              const std::map<std::string, MetricValueType> &metrics) override;
20 | 
21 |   void clear() override;
22 | 
23 | protected:
24 |   // ScopeInterface
25 |   void enterScope(const Scope &scope) override final;
26 | 
27 |   void exitScope(const Scope &scope) override final;
28 | 
29 | private:
30 |   void doDump(std::ostream &os, OutputFormat outputFormat) const override;
31 | };
32 | 
33 | } // namespace proton
34 | 
35 | #endif // PROTON_DATA_TRACE_DATA_H_
36 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Driver/GPU/CudaApi.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_DRIVER_GPU_CUDA_H_
 2 | #define PROTON_DRIVER_GPU_CUDA_H_
 3 | 
 4 | #include "Driver/Device.h"
 5 | #include "cuda.h"
 6 | 
 7 | namespace proton {
 8 | 
 9 | namespace cuda {
10 | // Wrappers returning CUresult. NOTE(review): CheckSuccess presumably selects whether a failing result is checked inside the wrapper - confirm in the .cpp.
11 | template <bool CheckSuccess> CUresult init(int flags);
12 | 
13 | template <bool CheckSuccess> CUresult ctxSynchronize();
14 | 
15 | template <bool CheckSuccess> CUresult ctxGetCurrent(CUcontext *pctx);
16 | 
17 | template <bool CheckSuccess>
18 | CUresult deviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
19 | 
20 | template <bool CheckSuccess> CUresult deviceGet(CUdevice *device, int ordinal);
21 | // Returns the Device for the given index.
22 | Device getDevice(uint64_t index);
23 | 
24 | } // namespace cuda
25 | 
26 | } // namespace proton
27 | 
28 | #endif // PROTON_DRIVER_GPU_CUDA_H_
29 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Driver/GPU/HipApi.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_DRIVER_GPU_HIP_H_
 2 | #define PROTON_DRIVER_GPU_HIP_H_
 3 | 
 4 | #include "Driver/Device.h"
 5 | #include "hip/hip_runtime_api.h"
 6 | 
 7 | namespace proton {
 8 | 
 9 | namespace hip {
10 | // Wrappers returning hipError_t. NOTE(review): CheckSuccess presumably selects whether a failing result is checked inside the wrapper - confirm in the .cpp.
11 | template <bool CheckSuccess> hipError_t deviceSynchronize();
12 | 
13 | template <bool CheckSuccess>
14 | hipError_t deviceGetAttribute(int *value, hipDeviceAttribute_t attribute,
15 |                               int deviceId);
16 | 
17 | template <bool CheckSuccess> hipError_t getDeviceCount(int *count);
18 | 
19 | template <bool CheckSuccess>
20 | hipError_t getDeviceProperties(hipDeviceProp_t *prop, int deviceId);
21 | // Returns the Device for the given index.
22 | Device getDevice(uint64_t index);
23 | // NOTE(review): a `const std::string` returned by value cannot be moved from; changing it would also require updating the definition.
24 | const std::string getHipArchName(uint64_t index);
25 | // Kernel-name lookups; the lifetime of the returned pointer is not visible from this header.
26 | const char *getKernelNameRef(const hipFunction_t f);
27 | const char *getKernelNameRefByPtr(const void *hostFunction, hipStream_t stream);
28 | 
29 | } // namespace hip
30 | 
31 | } // namespace proton
32 | 
33 | #endif // PROTON_DRIVER_GPU_HIP_H_
34 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Driver/GPU/HsaApi.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_DRIVER_GPU_HSA_H_
 2 | #define PROTON_DRIVER_GPU_HSA_H_
 3 | 
 4 | #include "Driver/Device.h"
 5 | #include "hsa/hsa_ext_amd.h"
 6 | 
 7 | namespace proton {
 8 | 
 9 | namespace hsa {
10 | 
11 | template <bool CheckSuccess>
12 | hsa_status_t agentGetInfo(hsa_agent_t agent, hsa_agent_info_t attribute,
13 |                           void *value);
14 | 
15 | hsa_status_t iterateAgents(hsa_status_t (*callback)(hsa_agent_t agent,
16 |                                                     void *data),
17 |                            void *data);
18 | 
19 | } // namespace hsa
20 | 
21 | } // namespace proton
22 | 
23 | #endif // PROTON_DRIVER_GPU_HSA_H_
24 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_PROFILER_CUPTI_PROFILER_H_
 2 | #define PROTON_PROFILER_CUPTI_PROFILER_H_
 3 | 
 4 | #include "Profiler/GPUProfiler.h"
 5 | 
 6 | namespace proton {
 7 | 
 8 | class CuptiProfiler : public GPUProfiler<CuptiProfiler> {
 9 | public:
10 |   CuptiProfiler();
11 |   virtual ~CuptiProfiler();
12 | 
13 | private:
14 |   struct CuptiProfilerPimpl;
15 | };
16 | 
17 | } // namespace proton
18 | 
19 | #endif // PROTON_PROFILER_CUPTI_PROFILER_H_
20 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_PROFILER_ROCTRACER_PROFILER_H_
 2 | #define PROTON_PROFILER_ROCTRACER_PROFILER_H_
 3 | 
 4 | #include "Profiler/GPUProfiler.h"
 5 | 
 6 | namespace proton {
 7 | 
 8 | class RoctracerProfiler : public GPUProfiler<RoctracerProfiler> {
 9 | public:
10 |   RoctracerProfiler();
11 |   virtual ~RoctracerProfiler();
12 | 
13 | private:
14 |   struct RoctracerProfilerPimpl;
15 | };
16 | 
17 | } // namespace proton
18 | 
19 | #endif // PROTON_PROFILER_ROCTRACER_PROFILER_H_
20 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Proton.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_H_
 2 | #define PROTON_H_
 3 | 
 4 | #include "Context/Context.h"
 5 | #include "Data/Data.h"
 6 | #include "Data/Metric.h"
 7 | #include "Session/Session.h"
 8 | 
 9 | #endif // PROTON_H_
10 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Atomic.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_ATOMIC_H_
 2 | #define PROTON_UTILITY_ATOMIC_H_
 3 | 
 4 | #include <atomic>
 5 | #include <mutex>
 6 | 
 7 | namespace proton {
 8 | 
 9 | template <typename T> T atomicMax(std::atomic<T> &target, T value) {
10 |   T current = target.load();
11 |   while (current < value && !target.compare_exchange_weak(current, value))
12 |     ;
13 |   return current;
14 | }
15 | 
/// Lowers `target` to `value` if `value` is smaller.
/// Returns the value last observed in `target` before any update made here.
template <typename T> T atomicMin(std::atomic<T> &target, T value) {
  T observed = target.load();
  for (;;) {
    // Nothing to do once the stored value is already at most `value`.
    if (!(observed > value))
      break;
    // On failure `observed` is refreshed with the current content; retry.
    if (target.compare_exchange_weak(observed, value))
      break;
  }
  return observed;
}
22 | 
/// Runs `function` under `lock` only if `enterCondition` holds both before
/// and after the lock has been acquired.
template <typename Condition, typename Function>
void doubleCheckedLock(Condition enterCondition, std::mutex &lock,
                       Function function) {
  // First check without the lock so the common "nothing to do" case is cheap.
  if (enterCondition()) {
    std::lock_guard<std::mutex> guard(lock);
    // Re-check: the condition may have changed while waiting for the lock.
    if (enterCondition())
      function();
  }
}
36 | 
37 | } // namespace proton
38 | 
39 | #endif // PROTON_UTILITY_ATOMIC_H_
40 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Errors.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_ERRORS_H_
 2 | #define PROTON_UTILITY_ERRORS_H_
 3 | 
 4 | #include <stdexcept>
 5 | 
 6 | namespace proton {
 7 | 
 8 | class NotImplemented : public std::logic_error {
 9 | public:
10 |   NotImplemented() : std::logic_error("Not yet implemented") {};
11 | };
12 | 
13 | } // namespace proton
14 | 
15 | #endif // PROTON_UTILITY_ERRORS_H_
16 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Set.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_SET_H_
 2 | #define PROTON_UTILITY_SET_H_
 3 | 
 4 | #include <set>
 5 | #include <shared_mutex>
 6 | 
 7 | namespace proton {
 8 | 
 9 | /// A simple thread safe set with read/write lock.
10 | template <typename Key, typename Container = std::set<Key>>
11 | class ThreadSafeSet {
12 | public:
13 |   ThreadSafeSet() = default;
14 | 
15 |   void insert(const Key &key) {
16 |     std::unique_lock<std::shared_mutex> lock(mutex);
17 |     set.insert(key);
18 |   }
19 | 
20 |   bool contain(const Key &key) {
21 |     std::shared_lock<std::shared_mutex> lock(mutex);
22 |     auto it = set.find(key);
23 |     if (it == set.end())
24 |       return false;
25 |     return true;
26 |   }
27 | 
28 |   bool erase(const Key &key) {
29 |     std::unique_lock<std::shared_mutex> lock(mutex);
30 |     return set.erase(key) > 0;
31 |   }
32 | 
33 |   void clear() {
34 |     std::unique_lock<std::shared_mutex> lock(mutex);
35 |     set.clear();
36 |   }
37 | 
38 | private:
39 |   Container set;
40 |   std::shared_mutex mutex;
41 | };
42 | 
43 | } // namespace proton
44 | 
#endif // PROTON_UTILITY_SET_H_
46 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Singleton.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_SINGLETON_H_
 2 | #define PROTON_UTILITY_SINGLETON_H_
 3 | 
 4 | namespace proton {
 5 | 
 6 | template <typename T> class Singleton {
 7 | public:
 8 |   Singleton(const Singleton &) = delete;
 9 |   Singleton &operator=(const Singleton &) = delete;
10 | 
11 |   static T &instance() {
12 |     static T _;
13 |     return _;
14 |   }
15 | 
16 | protected:
17 |   Singleton() = default;
18 | };
19 | 
20 | } // namespace proton
21 | 
22 | #endif // PROTON_UTILITY_SINGLETON_H_
23 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/String.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_STRING_H_
 2 | #define PROTON_UTILITY_STRING_H_
 3 | 
#include <cctype>
#include <string>
 5 | 
 6 | namespace proton {
 7 | 
 8 | inline std::string toLower(const std::string &str) {
 9 |   std::string lower;
10 |   for (auto c : str) {
11 |     lower += tolower(c);
12 |   }
13 |   return lower;
14 | }
15 | 
/// Returns a copy of `str` with every non-overlapping occurrence of `src`
/// replaced by `dst`, scanning left to right. Text inserted by a replacement
/// is never rescanned. An empty `src` leaves the string unchanged.
inline std::string replace(const std::string &str, const std::string &src,
                           const std::string &dst) {
  // An empty pattern matches at every position, so the loop below would
  // keep inserting `dst` forever. There is nothing to substitute.
  if (src.empty())
    return str;

  std::string replaced = str;
  for (size_t pos = replaced.find(src); pos != std::string::npos;
       pos = replaced.find(src, pos + dst.length())) {
    replaced.replace(pos, src.length(), dst);
  }
  return replaced;
}
27 | 
/// Returns true if `str` ends with `sub` (an empty `sub` always matches).
inline bool endWith(const std::string &str, const std::string &sub) {
  const size_t strLen = str.length();
  const size_t subLen = sub.length();
  // A suffix can never be longer than the string it terminates.
  return subLen <= strLen && str.compare(strLen - subLen, subLen, sub) == 0;
}
34 | 
/// Returns `str` without leading and trailing whitespace, as classified by
/// std::isspace. Interior whitespace is preserved.
inline std::string trim(const std::string &str) {
  // std::isspace requires a value representable as unsigned char (or EOF);
  // passing a negative plain `char` is undefined, so convert first.
  const auto isSpace = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };

  size_t start = 0;
  size_t end = str.length();
  while (start < end && isSpace(str[start])) {
    start++;
  }
  while (end > start && isSpace(str[end - 1])) {
    end--;
  }
  return str.substr(start, end - start);
}
46 | 
47 | } // namespace proton
48 | 
49 | #endif // PROTON_UTILITY_STRING_H_
50 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/include/Utility/Traits.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_UTILITY_TRAITS_H_
 2 | #define PROTON_UTILITY_TRAITS_H_
 3 | 
 4 | #include <type_traits>
 5 | #include <variant>
 6 | 
 7 | namespace proton {
 8 | template <class T, class... Ts>
 9 | struct is_one_of : std::disjunction<std::is_same<T, Ts>...> {};
10 | } // namespace proton
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt of each Proton runtime component directory.
add_subdirectory(Context)
add_subdirectory(Data)
add_subdirectory(Driver)
add_subdirectory(Profiler)
add_subdirectory(Session)
6 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources of the ProtonContext library (listed explicitly, space-indented).
add_proton_library(ProtonContext
  Context.cpp
  Python.cpp
  Shadow.cpp
)
6 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Context/Context.cpp:
--------------------------------------------------------------------------------
 1 | #include "Context/Context.h"
 2 | 
 3 | namespace proton {
 4 | 
 5 | /*static*/ thread_local std::optional<Context> ContextSource::state =
 6 |     std::nullopt;
 7 | 
 8 | std::atomic<size_t> Scope::scopeIdCounter{1};
 9 | 
10 | /*static*/ thread_local std::map<ThreadLocalOpInterface *, bool>
11 |     ThreadLocalOpInterface::opInProgress;
12 | 
13 | } // namespace proton
14 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources of the ProtonData library (listed explicitly, space-indented).
add_proton_library(ProtonData
  Data.cpp
  TraceData.cpp
  TreeData.cpp
)
6 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/Data.cpp:
--------------------------------------------------------------------------------
 1 | #include "Data/Data.h"
 2 | #include "Utility/String.h"
 3 | 
 4 | #include <fstream>
 5 | #include <iostream>
 6 | #include <stdexcept>
 7 | 
 8 | #include <shared_mutex>
 9 | 
10 | namespace proton {
11 | 
12 | void Data::dump(OutputFormat outputFormat) {
13 |   std::shared_lock<std::shared_mutex> lock(mutex);
14 | 
15 |   std::unique_ptr<std::ostream> out;
16 |   if (path.empty() || path == "-") {
17 |     out.reset(new std::ostream(std::cout.rdbuf())); // Redirecting to cout
18 |   } else {
19 |     out.reset(new std::ofstream(
20 |         path + "." +
21 |         outputFormatToString(outputFormat))); // Opening a file for output
22 |   }
23 |   doDump(*out, outputFormat);
24 | }
25 | 
26 | OutputFormat parseOutputFormat(const std::string &outputFormat) {
27 |   if (toLower(outputFormat) == "hatchet") {
28 |     return OutputFormat::Hatchet;
29 |   }
30 |   throw std::runtime_error("Unknown output format: " + outputFormat);
31 | }
32 | 
33 | const std::string outputFormatToString(OutputFormat outputFormat) {
34 |   if (outputFormat == OutputFormat::Hatchet) {
35 |     return "hatchet";
36 |   }
37 |   throw std::runtime_error("Unknown output format: " +
38 |                            std::to_string(static_cast<int>(outputFormat)));
39 | }
40 | 
41 | } // namespace proton
42 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Data/TraceData.cpp:
--------------------------------------------------------------------------------
 1 | #include "Data/TraceData.h"
 2 | #include "Utility/Errors.h"
 3 | 
 4 | #include <stdexcept>
 5 | 
 6 | namespace proton {
 7 | 
 8 | void TraceData::enterScope(const Scope &scope) { throw NotImplemented(); }
 9 | 
10 | void TraceData::exitScope(const Scope &scope) { throw NotImplemented(); }
11 | 
12 | size_t TraceData::addOp(size_t scopeId, const std::string &name) {
13 |   throw NotImplemented();
14 | }
15 | 
16 | void TraceData::addMetric(size_t scopeId, std::shared_ptr<Metric> metric) {
17 |   throw NotImplemented();
18 | }
19 | 
20 | void TraceData::addMetrics(
21 |     size_t scopeId, const std::map<std::string, MetricValueType> &metrics) {
22 |   throw NotImplemented();
23 | }
24 | 
25 | void TraceData::clear() { throw NotImplemented(); }
26 | 
27 | void TraceData::doDump(std::ostream &os, OutputFormat outputFormat) const {
28 |   throw NotImplemented();
29 | }
30 | 
31 | } // namespace proton
32 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Driver/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources of the ProtonDriver library (listed explicitly, space-indented).
add_proton_library(ProtonDriver
  Device.cpp
  GPU/CudaApi.cpp
  GPU/CuptiApi.cpp
  GPU/HipApi.cpp
  GPU/HsaApi.cpp
  GPU/RoctracerApi.cpp
)
9 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Driver/Device.cpp:
--------------------------------------------------------------------------------
 1 | #include "Driver/Device.h"
 2 | #include "Driver/GPU/CudaApi.h"
 3 | #include "Driver/GPU/HipApi.h"
 4 | 
 5 | #include "Utility/Errors.h"
 6 | 
 7 | namespace proton {
 8 | 
 9 | Device getDevice(DeviceType type, uint64_t index) {
10 |   if (type == DeviceType::CUDA) {
11 |     return cuda::getDevice(index);
12 |   }
13 |   if (type == DeviceType::HIP) {
14 |     return hip::getDevice(index);
15 |   }
16 |   throw std::runtime_error("DeviceType not supported");
17 | }
18 | 
19 | const std::string getDeviceTypeString(DeviceType type) {
20 |   if (type == DeviceType::CUDA) {
21 |     return DeviceTraits<DeviceType::CUDA>::name;
22 |   } else if (type == DeviceType::HIP) {
23 |     return DeviceTraits<DeviceType::HIP>::name;
24 |   }
25 |   throw std::runtime_error("DeviceType not supported");
26 | }
27 | 
28 | } // namespace proton
29 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Driver/GPU/HsaApi.cpp:
--------------------------------------------------------------------------------
 1 | #include "Driver/GPU/HsaApi.h"
 2 | #include "Driver/Dispatch.h"
 3 | 
 4 | namespace proton {
 5 | 
 6 | namespace hsa {
 7 | 
 8 | struct ExternLibHsa : public ExternLibBase {
 9 |   using RetType = hsa_status_t;
10 |   static constexpr const char *name = "libhsa-runtime64.so";
11 |   static constexpr const char *defaultDir = "";
12 |   static constexpr RetType success = HSA_STATUS_SUCCESS;
13 |   static void *lib;
14 | };
15 | 
16 | void *ExternLibHsa::lib = nullptr;
17 | 
18 | DEFINE_DISPATCH(ExternLibHsa, agentGetInfo, hsa_agent_get_info, hsa_agent_t,
19 |                 hsa_agent_info_t, void *);
20 | 
21 | hsa_status_t iterateAgents(hsa_status_t (*callback)(hsa_agent_t agent,
22 |                                                     void *data),
23 |                            void *data) {
24 |   typedef hsa_status_t (*hsa_iterate_agents_t)(
25 |       hsa_status_t (*)(hsa_agent_t, void *), void *data);
26 |   static hsa_iterate_agents_t func = nullptr;
27 |   Dispatch<ExternLibHsa>::init(ExternLibHsa::name, &ExternLibHsa::lib);
28 |   if (func == nullptr)
29 |     func = reinterpret_cast<hsa_iterate_agents_t>(
30 |         dlsym(ExternLibHsa::lib, "hsa_iterate_agents"));
31 |   return (func ? func(callback, data) : HSA_STATUS_ERROR_FATAL);
32 | }
33 | 
34 | } // namespace hsa
35 | 
36 | } // namespace proton
37 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Profiler/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources of the ProtonProfiler library (listed explicitly, space-indented).
add_proton_library(ProtonProfiler
  Cupti/CuptiPCSampling.cpp
  Cupti/CuptiProfiler.cpp
  RocTracer/RoctracerProfiler.cpp
)
6 | 


--------------------------------------------------------------------------------
/third_party/proton/csrc/lib/Session/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources of the ProtonSession library.
add_proton_library(ProtonSession
  Session.cpp
)
4 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
# Make both the checked-in headers and the headers generated into the build
# tree visible to everything configured below this directory.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
add_subdirectory(include)
add_subdirectory(lib)
# The Python plugin is only built when the Python module is requested.
if(TRITON_BUILD_PYTHON_MODULE)
  add_triton_plugin(TritonProton ${CMAKE_CURRENT_SOURCE_DIR}/triton_proton.cc)
  target_link_libraries(TritonProton PRIVATE ProtonIR Python3::Module pybind11::headers)
endif()
9 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt in the Dialect header directory.
add_subdirectory(Dialect)
2 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt in the Proton header directory.
add_subdirectory(Proton)
2 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt in the IR header directory (tablegen rules).
add_subdirectory(IR)
2 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
# Point MLIR_BINARY_DIR at the top-level build directory before invoking the
# MLIR tablegen/doc helpers below.
set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

# Outputs generated from ProtonOps.td: dialect declarations/definitions,
# LLVM-IR conversions, op classes and enums. LLVM_TARGET_DEFINITIONS must be
# set before the mlir_tablegen() calls that consume it.
set(LLVM_TARGET_DEFINITIONS ProtonOps.td)
mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=proton)
mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=proton)
mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
mlir_tablegen(Ops.h.inc -gen-op-decls)
mlir_tablegen(Ops.cpp.inc -gen-op-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_mlir_doc(ProtonDialect ProtonDialect dialects/ -gen-dialect-doc)
add_mlir_doc(ProtonOps ProtonOps dialects/ -gen-op-doc)
add_public_tablegen_target(ProtonTableGen)

# Outputs generated from ProtonAttrDefs.td: attribute class declarations and
# definitions, exposed as a separate target.
set(LLVM_TARGET_DEFINITIONS ProtonAttrDefs.td)
mlir_tablegen(ProtonAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(ProtonAttrDefs.cpp.inc -gen-attrdef-defs)
add_public_tablegen_target(ProtonAttrDefsIncGen)
19 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_DIALECT_PROTON_IR_DIALECT_H_
 2 | #define TRITON_DIALECT_PROTON_IR_DIALECT_H_
 3 | 
 4 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 5 | #include "mlir/IR/BuiltinOps.h"
 6 | #include "mlir/IR/Dialect.h"
 7 | #include "mlir/IR/PatternMatch.h"
 8 | #include "proton/dialect/include/Dialect/Proton/IR/Dialect.h.inc"
 9 | #include "proton/dialect/include/Dialect/Proton/IR/OpsEnums.h.inc"
10 | 
11 | #define GET_ATTRDEF_CLASSES
12 | #include "proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.h.inc"
13 | 
14 | #define GET_OP_CLASSES
15 | #include "proton/dialect/include/Dialect/Proton/IR/Ops.h.inc"
16 | 
17 | namespace mlir {
18 | namespace triton {
19 | namespace proton {} // namespace proton
20 | } // namespace triton
21 | } // namespace mlir
22 | 
23 | #endif // TRITON_DIALECT_PROTON_IR_DIALECT_H_
24 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_ATTRDEFS
 2 | #define PROTON_ATTRDEFS
 3 | 
 4 | include "mlir/IR/AttrTypeBase.td"
 5 | include "ProtonDialect.td"
 6 | 
// Base class for attributes of the Proton dialect: an AttrDef bound to
// Proton_Dialect, with ::mlir::Attribute as the default C++ base class.
class Proton_Attr<string name, list<Trait> traits = [],
                     string baseCppClass = "::mlir::Attribute">
  : AttrDef<Proton_Dialect, name, traits, baseCppClass> {
}
11 | 
12 | #endif // PROTON_ATTRDEFS
13 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td:
--------------------------------------------------------------------------------
 1 | #ifndef PROTON_DIALECT
 2 | #define PROTON_DIALECT
 3 | 
 4 | include "mlir/IR/OpBase.td"
 5 | 
 6 | def Proton_Dialect : Dialect {
 7 |   let name = "proton";
 8 |   let cppNamespace = "::mlir::triton::proton";
 9 | 
10 |   let description = [{
11 |     Proton Dialect provides core ops for building third-party compiler-based
12 |     performance profiling and analysis tools.
13 |   }];
14 | 
15 |   let dependentDialects = [];
16 | }
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/include/TritonProtonToLLVM/PatternTritonProtonOpToLLVM.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H
 2 | #define TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H
 3 | 
 4 | #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 5 | 
namespace mlir::triton {
// Forward declaration; only a reference to it is needed by the signature below.
class TargetInfoBase;
namespace proton {
// Adds the RecordOp lowering pattern(s) to `patterns` with the given
// `benefit`. NOTE(review): presumably defined in RecordOpToLLVM.cpp -- the
// definition is not visible from this header.
void populateRecordOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                   RewritePatternSet &patterns,
                                   const TargetInfoBase &targetInfo,
                                   PatternBenefit benefit);
} // namespace proton
} // namespace mlir::triton
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the dialect implementation and the Proton-to-LLVM conversion.
add_subdirectory(Dialect)
add_subdirectory(TritonProtonToLLVM)
3 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt in the Proton implementation directory.
add_subdirectory(Proton)
2 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt:
--------------------------------------------------------------------------------
# Process the CMakeLists.txt in the IR implementation directory.
add_subdirectory(IR)
2 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt:
--------------------------------------------------------------------------------
# Proton dialect IR library.
add_triton_library(ProtonIR
  Dialect.cpp
  Ops.cpp

  # Generated .inc files must exist before these sources are compiled.
  DEPENDS
  ProtonTableGen
  ProtonAttrDefsIncGen

  LINK_LIBS PUBLIC
  MLIRLLVMDialect
  TritonIR
  TritonGPUIR
)
14 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp:
--------------------------------------------------------------------------------
 1 | #include "mlir/IR/DialectImplementation.h"
 2 | #include "mlir/IR/OpImplementation.h"
 3 | 
 4 | // clang-format off
 5 | #include "Dialect/Proton/IR/Dialect.h"
 6 | #include "Dialect/Proton/IR/Dialect.cpp.inc"
 7 | // clang-format on
 8 | 
 9 | using namespace mlir;
10 | using namespace mlir::triton::proton;
11 | 
// Registers the dialect's attributes and operations. Each generated .cpp.inc
// is included inside the template argument list with the matching GET_*_LIST
// macro defined, so the include supplies the list of classes to register.
void mlir::triton::proton::ProtonDialect::initialize() {
  addAttributes<
#define GET_ATTRDEF_LIST
#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc"
      >();

  addOperations<
#define GET_OP_LIST
#include "Dialect/Proton/IR/Ops.cpp.inc"
      >();
}

// Second inclusion of the same generated file, this time with
// GET_ATTRDEF_CLASSES defined, to pull in the attribute class definitions.
#define GET_ATTRDEF_CLASSES
#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc"
26 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp:
--------------------------------------------------------------------------------
 1 | #include "Dialect/Proton/IR/Dialect.h"
 2 | #include "mlir/IR/Builders.h"
 3 | #include "mlir/IR/BuiltinAttributes.h"
 4 | #include "mlir/IR/BuiltinTypes.h"
 5 | #include "mlir/IR/OperationSupport.h"
 6 | #include "mlir/Interfaces/FunctionImplementation.h"
 7 | #include "mlir/Interfaces/FunctionInterfaces.h"
 8 | #include "mlir/Support/LLVM.h"
 9 | #include "triton/Dialect/Triton/IR/Dialect.h"
10 | #include "triton/Dialect/Triton/IR/Types.h"
11 | #include "triton/Dialect/Triton/IR/Utility.h"
12 | 
13 | #define GET_OP_CLASSES
14 | #include "Dialect/Proton/IR/Ops.cpp.inc"
15 | #include "Dialect/Proton/IR/OpsEnums.cpp.inc"
16 | 
17 | namespace mlir {
18 | namespace triton {
19 | namespace proton {
20 | 
21 | // -- RecordOp --
22 | void RecordOp::getEffects(
23 |     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
24 |         &effects) {
25 |   effects.emplace_back(MemoryEffects::Write::get(),
26 |                        SideEffects::DefaultResource::get());
27 |   effects.emplace_back(MemoryEffects::Read::get(),
28 |                        SideEffects::DefaultResource::get());
29 | }
30 | 
31 | } // namespace proton
32 | } // namespace triton
33 | } // namespace mlir
34 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/lib/TritonProtonToLLVM/CMakeLists.txt:
--------------------------------------------------------------------------------
# Library containing the Proton-to-LLVM lowering; links the dialect IR publicly.
add_triton_library(TritonProtonToLLVM
  RecordOpToLLVM.cpp

  LINK_LIBS PUBLIC
  ProtonIR
)
7 | 


--------------------------------------------------------------------------------
/third_party/proton/dialect/triton_proton.cc:
--------------------------------------------------------------------------------
 1 | #include "Dialect/Proton/IR/Dialect.h"
 2 | #include "mlir/Pass/PassManager.h"
 3 | #include "passes.h"
 4 | #include <pybind11/pybind11.h>
 5 | #include <pybind11/stl.h>
 6 | #include <pybind11/stl_bind.h>
 7 | 
 8 | namespace py = pybind11;
 9 | 
10 | void init_triton_proton(py::module &&m) {
11 |   auto passes = m.def_submodule("passes");
12 | 
13 |   // load dialects
14 |   m.def("load_dialects", [](mlir::MLIRContext &context) {
15 |     mlir::DialectRegistry registry;
16 |     registry.insert<mlir::triton::proton::ProtonDialect>();
17 |     context.appendDialectRegistry(registry);
18 |     context.loadAllAvailableDialects();
19 |   });
20 | }
21 | 


--------------------------------------------------------------------------------
/third_party/proton/proton/__init__.py:
--------------------------------------------------------------------------------
 1 | # ruff: noqa
 2 | from .scope import scope, cpu_timed_scope, enter_scope, exit_scope
 3 | from .state import state, enter_state, exit_state
 4 | from .profile import (
 5 |     start,
 6 |     activate,
 7 |     deactivate,
 8 |     finalize,
 9 |     profile,
10 |     DEFAULT_PROFILE_NAME,
11 | )
12 | from . import context, specs
13 | 


--------------------------------------------------------------------------------
/third_party/proton/proton/context.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | from triton._C.libproton import proton as libproton
 3 | from .flags import get_profiling_on
 4 | 
 5 | 
 6 | def depth(session: Optional[int] = 0) -> Optional[int]:
 7 |     """
 8 |     Get the depth of the context.
 9 | 
10 |     Args:
11 |         session (int): The session ID of the profiling session. Defaults to 0.
12 | 
13 |     Returns:
14 |         depth (int or None): The depth of the context. If profiling is off, returns None.
15 |     """
16 |     if not get_profiling_on():
17 |         return None
18 |     return libproton.get_context_depth(session)
19 | 


--------------------------------------------------------------------------------
/third_party/proton/proton/flags.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file contains the global flags used in the proton package.
 3 | """
 4 | 
 5 | # Whether to enable profiling. Default is False.
 6 | profiling_on = False
 7 | # Whether the script is run from the command line. Default is False.
 8 | command_line = False
 9 | 
10 | 
11 | def set_profiling_on():
12 |     global profiling_on
13 |     profiling_on = True
14 | 
15 | 
16 | def set_profiling_off():
17 |     global profiling_on
18 |     profiling_on = False
19 | 
20 | 
21 | def get_profiling_on():
22 |     global profiling_on
23 |     return profiling_on
24 | 
25 | 
26 | def set_command_line():
27 |     global command_line
28 |     command_line = True
29 | 
30 | 
31 | def is_command_line():
32 |     global command_line
33 |     return command_line
34 | 


--------------------------------------------------------------------------------
/third_party/proton/proton/hook.py:
--------------------------------------------------------------------------------
 1 | from .state import enter_state, exit_state
 2 | from .scope import enter_scope, exit_scope
 3 | from triton import knobs
 4 | from triton.compiler import LazyDict
 5 | 
 6 | COMPUTE_METADATA_SCOPE_NAME = "__proton_launch_metadata"
 7 | 
 8 | 
 9 | class TritonHook:
10 |     flops_width = [8, 16, 32, 64]
11 |     metrics = [f"flops{width}" for width in flops_width] + ["bytes"] + ["flops"]
12 | 
13 |     @staticmethod
14 |     def enter(lazy_dict: LazyDict) -> None:
15 |         enter_state(COMPUTE_METADATA_SCOPE_NAME)
16 |         metadata = lazy_dict.get()
17 |         exit_state()
18 |         fn_metrics = {k: metadata[k] for k in TritonHook.metrics if k in metadata}
19 |         enter_scope(metadata["name"], triton_op=True, metrics=fn_metrics)
20 | 
21 |     @staticmethod
22 |     def exit(lazy_dict: LazyDict) -> None:
23 |         exit_scope(triton_op=True)
24 | 
25 | 
def register_triton_hook() -> None:
    """Install the TritonHook launch hooks unless an enter hook is already set."""
    if knobs.runtime.launch_enter_hook is not None:
        # An enter hook is already installed; leave both hooks untouched.
        return
    knobs.runtime.launch_enter_hook = TritonHook.enter
    knobs.runtime.launch_exit_hook = TritonHook.exit
30 | 
31 | 
def unregister_triton_hook() -> None:
    """Clear both launch hooks, but only if the enter hook is TritonHook's."""
    installed = knobs.runtime.launch_enter_hook
    if installed == TritonHook.enter:
        knobs.runtime.launch_enter_hook = None
        knobs.runtime.launch_exit_hook = None
36 | 


--------------------------------------------------------------------------------
/third_party/proton/proton/language.py:
--------------------------------------------------------------------------------
 1 | from triton.language import core as tl
 2 | from triton.language.core import builtin
 3 | import warnings
 4 | 
 5 | 
 6 | @builtin
 7 | def record(isStart: bool, regionId: int, _semantic=None):
 8 |     warnings.warn(
 9 |         "\nWarning the proton language module within Proton contains under development features that are not intended to be used outside of the core development team"
10 |     )
11 |     return tl.tensor(_semantic.builder.create_proton_record(isStart, regionId), tl.void)
12 | 


--------------------------------------------------------------------------------
/third_party/proton/test/examples/frame.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "children": [
 4 |       {
 5 |         "children": [
 6 |           {
 7 |             "children": [],
 8 |             "frame": {
 9 |               "name": "/home/user/projects/example.py/test.py:1@foo",
10 |               "type": "function"
11 |             },
12 |             "metrics": {
13 |               "count": 1,
14 |               "device_id": "0",
15 |               "device_type": "HIP",
16 |               "time (ns)": 204800
17 |             }
18 |           }
19 |         ],
20 |         "frame": {
21 |           "name": "test0"
22 |         },
23 |         "metrics": {}
24 |       },
25 |       {
26 |         "children": [],
27 |         "frame": {
28 |           "name": "test1"
29 |         },
30 |         "metrics": {
31 |           "count": 1,
32 |           "device_id": "0",
33 |           "device_type": "HIP",
34 |           "time (ns)": 204800
35 |         }
36 |       }
37 |     ],
38 |     "frame": {
39 |       "name": "ROOT",
40 |       "type": "function"
41 |     },
42 |     "metrics": {
43 |       "count": 0,
44 |       "time (ns)": 0
45 |     }
46 |   },
47 |   {
48 |     "HIP": {
49 |       "0": {
50 |         "arch": "gfx90a",
51 |         "bus_width": 4096,
52 |         "clock_rate": 1700000,
53 |         "memory_clock_rate": 1600000,
54 |         "num_sms": 104
55 |       }
56 |     }
57 |   }
58 | ]
59 | 


--------------------------------------------------------------------------------
/third_party/proton/test/helper.py:
--------------------------------------------------------------------------------
 1 | import triton.profiler as proton
 2 | 
 3 | import torch
 4 | import sys
 5 | 
 6 | from helper_kernels import custom_add
 7 | 
 8 | 
 9 | def main():
10 |     a = torch.zeros(1, device="cuda")
11 |     with proton.scope("test"):
12 |         custom_add[(1, )](a)
13 | 
14 | 
15 | def test_main():
16 |     main()
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     if sys.argv[1] == "test":
21 |         main()
22 | 


--------------------------------------------------------------------------------
/third_party/proton/test/helper_kernels.py:
--------------------------------------------------------------------------------
1 | import triton.language as tl
2 | import triton
3 | 
4 | 
5 | @triton.jit
6 | def custom_add(a_ptr):
7 |     # Despite the name, nothing is added: the constant 1.0 is stored at a_ptr.
8 |     tl.store(a_ptr, 1.0)
9 | 


--------------------------------------------------------------------------------
/third_party/proton/test/test_record.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pathlib
 3 | 
 4 | import triton
 5 | import triton.language as tl
 6 | import triton.profiler.language as pl
 7 | 
 8 | 
 9 | def test_proton_record(tmp_path: pathlib.Path):
10 | 
11 |     @triton.jit
12 |     def add_kernel(
13 |         x_ptr,
14 |         y_ptr,
15 |         output_ptr,
16 |         n_elements,
17 |         BLOCK_SIZE: tl.constexpr,
18 |     ):
19 |         pid = tl.program_id(axis=0)
20 |         block_start = pid * BLOCK_SIZE
21 |         offsets = block_start + tl.arange(0, BLOCK_SIZE)
22 |         mask = offsets < n_elements
23 |         x = tl.load(x_ptr + offsets, mask=mask)
24 |         pl.record(True, 0)
25 |         y = tl.load(y_ptr + offsets, mask=mask)
26 |         pl.record(False, 0)
27 |         output = x + y
28 |         tl.store(output_ptr + offsets, output, mask=mask)
29 | 
30 |     torch.manual_seed(0)
31 |     size = 2**12
32 |     x = torch.rand(size, device='cuda')
33 |     y = torch.rand(size, device='cuda')
34 |     output = torch.empty_like(x)
35 |     n_elements = output.numel()
36 |     grid = (1, 1, 1)
37 |     pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
38 |     ttir = pgm.asm['ttir']
39 |     assert "proton.record() {isStart = true, regionId = 0 : i32}" in ttir
40 |     assert "proton.record() {isStart = false, regionId = 0 : i32}" in ttir
41 | 


--------------------------------------------------------------------------------
/unittest/Analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Unit-test executable built from UtilityTest.cpp. add_triton_ut is not defined
 2 | # in this file (presumably cmake/AddTritonUnitTest.cmake -- confirm).
 3 | add_triton_ut(
 4 |   NAME TestTritonAnalysis
 5 |   SRCS
 6 |     UtilityTest.cpp
 7 |   LIBS
 8 |     TritonAnalysis
 9 |     TritonIR
10 |     TritonGPUIR
11 |     TritonGPUTransforms
12 |     TritonNvidiaGPUTransforms
13 | )
14 | 


--------------------------------------------------------------------------------
/unittest/Analysis/UtilityTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "triton/Dialect/Triton/IR/Utility.h"
 2 | 
 3 | #include "llvm/Support/Signals.h"
 4 | #include <gtest/gtest.h>
 5 | 
 6 | namespace mlir {
 7 | 
 8 | // Exercises triton::applyPermutation on a 3-element shape; the expectations
 9 | // below require element i of the result to equal shape[order[i]].
10 | TEST(Analysis, reorder) {
11 |   SmallVector<int> shape({10, 20, 30});
12 | 
13 |   // Full reversal of the three entries.
14 |   {
15 |     SmallVector<unsigned> reversedOrder({2, 1, 0});
16 |     auto permuted = triton::applyPermutation(shape, reversedOrder);
17 |     EXPECT_EQ(permuted[0], 30);
18 |     EXPECT_EQ(permuted[1], 20);
19 |     EXPECT_EQ(permuted[2], 10);
20 |   }
21 | 
22 |   // Swap of the first two entries; the last one stays in place.
23 |   {
24 |     SmallVector<unsigned> swappedOrder({1, 0, 2});
25 |     auto permuted = triton::applyPermutation(shape, swappedOrder);
26 |     EXPECT_EQ(permuted[0], 20);
27 |     EXPECT_EQ(permuted[1], 10);
28 |     EXPECT_EQ(permuted[2], 30);
29 |   }
30 | }
31 | 
32 | } // namespace mlir
33 | 
34 | // Test driver: registers LLVM's stack-trace-on-signal handler, initializes
35 | // GoogleTest from the command line, and returns the result of all tests.
36 | int main(int argc, char *argv[]) {
37 |   llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
38 |   testing::InitGoogleTest(&argc, argv);
39 |   return RUN_ALL_TESTS();
40 | }
41 | 


--------------------------------------------------------------------------------
/unittest/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Add every unit-test subdirectory, in the listed order.
2 | foreach(test_dir IN ITEMS Analysis Dialect Tools)
3 |   add_subdirectory("${test_dir}")
4 | endforeach()
5 | 


--------------------------------------------------------------------------------
/unittest/Dialect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # The only dialect subdirectory with unit tests added here is TritonGPU.
2 | add_subdirectory(TritonGPU)
3 | 


--------------------------------------------------------------------------------
/unittest/Dialect/TritonGPU/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Libraries shared by the Dialect, LinearLayoutConversions and DumpLayoutTest
 2 | # executables below.
 3 | set(triton_gpu_test_libs
 4 |   TritonGPUIR
 5 |   TritonGPUTransforms
 6 |   TritonNvidiaGPUTransforms
 7 | )
 8 | 
 9 | add_triton_ut(
10 |   NAME TestSwizzling
11 |   SRCS SwizzleTest.cpp
12 |   LIBS
13 |     TritonTools
14 |     LLVMSupport
15 |     MLIRSupport
16 | )
17 | 
18 | # Same libraries as the layout tests, with MLIRParser listed first.
19 | add_triton_ut(
20 |   NAME Dialect
21 |   SRCS DialectTest.cpp
22 |   LIBS
23 |     MLIRParser
24 |     ${triton_gpu_test_libs}
25 | )
26 | 
27 | add_triton_ut(
28 |   NAME LinearLayoutConversions
29 |   SRCS LinearLayoutConversionsTest.cpp
30 |   LIBS ${triton_gpu_test_libs}
31 | )
32 | 
33 | add_triton_ut(
34 |   NAME DumpLayoutTest
35 |   SRCS DumpLayoutTest.cpp
36 |   LIBS ${triton_gpu_test_libs}
37 | )
38 | 


--------------------------------------------------------------------------------
/unittest/Tools/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # One test executable built from both sources and linked against TritonTools.
2 | add_triton_ut(
3 |   NAME LinearLayout
4 |   SRCS LayoutUtilsTest.cpp LinearLayoutTest.cpp
5 |   LIBS TritonTools
6 | )
7 | 


--------------------------------------------------------------------------------
/unittest/googletest.cmake:
--------------------------------------------------------------------------------
 1 | # Makes GoogleTest available to the unit tests through FetchContent.
 2 | #
 3 | # GOOGLETEST_DIR may point at a local GoogleTest checkout; when it is set, that
 4 | # directory is used as the source instead of cloning the pinned tag.
 5 | include(FetchContent)
 6 | 
 7 | set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")
 8 | 
 9 | if(GOOGLETEST_DIR)
10 |   set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST "${GOOGLETEST_DIR}" CACHE STRING "GoogleTest source directory override")
11 | endif()
12 | 
13 | FetchContent_Declare(
14 |   googletest
15 |   GIT_REPOSITORY https://github.com/google/googletest.git
16 |   GIT_TAG v1.17.0
17 | )
18 | 
19 | FetchContent_GetProperties(googletest)
20 | 
21 | if(NOT googletest_POPULATED)
22 |   if(MSVC)
23 |     # Has to be in the cache before GoogleTest's own CMakeLists.txt is processed
24 |     # by FetchContent_MakeAvailable; setting it afterwards only takes effect on
25 |     # the next configure run.
26 |     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
27 |   endif()
28 |   FetchContent_MakeAvailable(googletest)
29 | endif()
30 | 


--------------------------------------------------------------------------------
/utils/nightly.pypirc:
--------------------------------------------------------------------------------
1 | [distutils]
2 | Index-servers =
3 |   Triton-Nightly
4 | 
5 | [Triton-Nightly]
6 | Repository = https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/upload/
7 | 


--------------------------------------------------------------------------------