├── triton_runner
    ├── bench
    │   ├── __init__.py
    │   ├── matmul
    │   │   └── __init__.py
    │   ├── launch_latency
    │   │   ├── __init__.py
    │   │   └── kernels.py
    │   └── utils.py
    ├── compiler
    │   └── __init__.py
    ├── driver
    │   ├── v3_5_0
    │   │   └── __init__.py
    │   └── __init__.py
    ├── gluon_runner
    │   └── __init__.py
    ├── language
    │   └── __init__.py
    ├── __init__.py
    ├── torch_utils.py
    ├── version_utils.py
    ├── color_print.py
    └── check_utils.py
├── doc
    ├── pdf
    │   └── Triton Runner-1029.pdf
    ├── benchmark.md
    └── solving_triton_issues
    │   ├── high_usage-7268
    │       ├── v3.2.0_cache
    │       │   ├── _bwd_kernel.cubin
    │       │   ├── __grp___bwd_kernel.json
    │       │   └── _bwd_kernel.json
    │       ├── v3.3.0_cache
    │       │   ├── _bwd_kernel.cubin
    │       │   ├── __grp___bwd_kernel.json
    │       │   └── _bwd_kernel.json
    │       ├── attn.py
    │       ├── test
    │       │   └── test.py
    │       └── fix
    │       │   └── fix.py
    │   ├── performance-7096
    │       ├── v3.1.0_cache
    │       │   ├── matmul_kernel.cubin
    │       │   ├── matmul_kernel.json
    │       │   └── __grp__matmul_kernel.json
    │       └── v3.4.0_cache
    │       │   ├── matmul_kernel.cubin
    │       │   ├── __grp__matmul_kernel.json
    │       │   └── matmul_kernel.json
    │   └── README.md
├── benchmark
    └── README.md
├── examples
    ├── runner
    │   ├── v3.2.0
    │   │   ├── cubin
    │   │   │   └── sm90
    │   │   │   │   ├── matmul_kernel.cubin
    │   │   │   │   └── matmul_kernel.json
    │   │   ├── llir
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm90
    │   │   │   │   └── matmul_kernel.json
    │   │   ├── ptx
    │   │   │   ├── sm75
    │   │   │   │   ├── matmul_kernel.json
    │   │   │   │   └── old_ptx
    │   │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm90
    │   │   │   │   └── matmul_kernel.json
    │   │   ├── ttgir
    │   │   │   └── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   └── README.md
    │   ├── v3.4.0
    │   │   ├── cubin
    │   │   │   ├── sm75
    │   │   │   │   ├── matmul_kernel.cubin
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │   │   ├── matmul_kernel.cubin
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm86
    │   │   │   │   ├── matmul_kernel.cubin
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm120
    │   │   │   │   ├── matmul_kernel_make_tensor_desciptor.cubin
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   └── sm90
    │   │   │   │   ├── matmul_kernel_make_tensor_desciptor.cubin
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   ├── llir
    │   │   │   ├── sm120
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   ├── sm90
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   └── ptx
    │   │   │   ├── sm90
    │   │   │       └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   ├── sm120
    │   │   │       └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   ├── sm75
    │   │   │       └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │       └── matmul_kernel.json
    │   │   │   └── sm86
    │   │   │       └── matmul_kernel.json
    │   ├── tlx
    │   │   ├── v3.4.0
    │   │   │   ├── cubin
    │   │   │   │   └── sm90
    │   │   │   │   │   ├── _attn_fwd_ws_pipelined_pingpong.cubin
    │   │   │   │   │   └── _attn_fwd_ws_pipelined_pingpong.json
    │   │   │   ├── ptx
    │   │   │   │   └── sm90
    │   │   │   │   │   └── _attn_fwd_ws_pipelined_pingpong.json
    │   │   │   ├── llir
    │   │   │   │   └── sm90
    │   │   │   │   │   └── _attn_fwd_ws_pipelined_pingpong.json
    │   │   │   └── ttgir
    │   │   │   │   └── sm90
    │   │   │   │       └── _attn_fwd_ws_pipelined_pingpong.json
    │   │   └── README.md
    │   ├── v3.5.x
    │   │   ├── cubin
    │   │   │   └── sm90
    │   │   │   │   ├── matmul_kernel_make_tensor_desciptor.cubin
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   ├── llir
    │   │   │   └── sm90
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   └── ptx
    │   │   │   └── sm90
    │   │   │       └── matmul_kernel_make_tensor_desciptor.json
    │   ├── v3.1.0
    │   │   ├── llir
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   ├── ptx
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   └── README.md
    │   ├── v3.3.x
    │   │   ├── ptx
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   │   └── sm90
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   ├── llir
    │   │   │   ├── sm75
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm80
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm86
    │   │   │   │   └── matmul_kernel.json
    │   │   │   ├── sm120
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   │   └── sm90
    │   │   │   │   └── matmul_kernel_make_tensor_desciptor.json
    │   │   └── README.md
    │   └── v3.0.0
    │   │   └── README.md
    ├── autotune
    │   ├── kda_gate_fwd_kernel_cache_sm75
    │   │   ├── 3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   └── W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   ├── kda_gate_fwd_kernel_cache_sm90
    │   │   ├── 4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── 7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA
    │   │   │   ├── kda_gate_fwd_kernel.cubin
    │   │   │   └── kda_gate_fwd_kernel.json
    │   │   ├── KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q
    │   │   │   └── cuda_utils.cpython-312-x86_64-linux-gnu.so
    │   │   └── VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q
    │   │   │   └── __triton_launcher.cpython-312-x86_64-linux-gnu.so
    │   ├── cubin
    │   │   └── test_kda_gate_single.py
    │   └── python
    │   │   └── test_kda_gate_single.py
    ├── utils
    │   └── generate_cmd.py
    ├── native_kernel
    │   └── dump
    │   │   ├── 02-matrix_transpose.py
    │   │   ├── 04-softmax.py
    │   │   └── 05-softmax_lse.py
    ├── test.py
    └── dump
    │   ├── python
    │       └── 02-matrix_transpose
    │       │   ├── dump_2d_load.py
    │       │   ├── dump_2d_trans.py
    │       │   └── dump_boundary
    │       │       └── dump_boundary_trans.py
    │   ├── ttir
    │       └── 02-matrix_transpose
    │       │   ├── dump_2d_load.py
    │       │   └── dump_2d_trans.py
    │   └── ttgir
    │       └── 02-matrix_transpose
    │           ├── dump_2d_load.py
    │           └── dump_2d_trans.py
├── .vscode
    └── settings.json
├── .gitignore
├── pyproject.toml
└── LICENSE


/triton_runner/bench/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/triton_runner/bench/matmul/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/triton_runner/compiler/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/triton_runner/driver/v3_5_0/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/triton_runner/bench/launch_latency/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/triton_runner/gluon_runner/__init__.py:
--------------------------------------------------------------------------------
1 | from ._runtime import jit
2 | 


--------------------------------------------------------------------------------
/doc/pdf/Triton Runner-1029.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/pdf/Triton Runner-1029.pdf


--------------------------------------------------------------------------------
/doc/benchmark.md:
--------------------------------------------------------------------------------
1 | # Benchmark commands
2 | 
3 | ```shell
4 | python benchmark/launch_latency/bench.py
5 | 
6 | python benchmark/matmul/mma/bench.py
7 | ```
8 | 


--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark commands
2 | 
3 | ```shell
4 | python benchmark/launch_latency/bench.py
5 | 
6 | python benchmark/matmul/mma/bench.py
7 | ```
8 | 


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/cubin/sm90/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.2.0/cubin/sm90/matmul_kernel.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm75/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm75/matmul_kernel.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm80/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm80/matmul_kernel.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm86/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm86/matmul_kernel.cubin


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.cubin


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.cubin


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.cubin


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.cubin


--------------------------------------------------------------------------------
/examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin


--------------------------------------------------------------------------------
/examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin


--------------------------------------------------------------------------------
/triton_runner/language/__init__.py:
--------------------------------------------------------------------------------
1 | import triton
2 | from ..version_utils import is_triton_geq_v3_4
3 | 
4 | if is_triton_geq_v3_4:
5 |     from .dump import dump, dump_boundary, dump_grids
6 | else:
7 |     from .dump_before_3_4_0 import dump, dump_boundary, dump_grids
8 | 


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.cubin


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-312-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q/__triton_launcher.cpython-312-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q/__triton_launcher.cpython-312-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.associations": {
 3 |         "*.ttir": "mlir",
 4 |         "*.ttgir": "mlir",
 5 |         "*.llir": "mlir",
 6 |         "*.ptx": "mlir",
 7 |         "*.source": "mlir",
 8 |     },
 9 |     "files.trimTrailingWhitespace": true,
10 |     "[mlir]": {
11 |         "files.trimTrailingWhitespace": false
12 |     },
13 | }
14 | 


--------------------------------------------------------------------------------
/triton_runner/driver/__init__.py:
--------------------------------------------------------------------------------
 1 | def get_device_interface():
 2 |     """Return the ``torch.cuda`` module."""
 3 |     # torch is imported inside the function rather than at module import time.
 4 |     import torch
 5 |     return torch.cuda
 6 | 
 7 | 
 8 | def get_empty_cache_for_benchmark():
 9 |     """Allocate the scratch buffer that is cleared before each benchmarked kernel call.
10 | 
11 |     Returns:
12 |         An uninitialized ``torch.int`` tensor on the ``'cuda'`` device with
13 |         ``256 * 1024 * 1024 // 4`` elements.
14 |     """
15 |     import torch
16 | 
17 |     # We maintain a buffer of 256 MB that we clear
18 |     # before each kernel call to make sure that the L2 cache
19 |     # doesn't contain any input data before the run
20 |     cache_size = 256 * 1024 * 1024
21 |     # cache_size is in bytes; dividing by 4 turns it into a count of 4-byte torch.int elements.
22 |     return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')
23 | 
24 | 
25 | def clear_cache(cache):
26 |     """Zero ``cache`` in place by calling ``cache.zero_()``.
27 | 
28 |     Args:
29 |         cache: Object exposing ``zero_()``; presumably the tensor returned by
30 |             ``get_empty_cache_for_benchmark`` (confirm against callers).
31 |     """
32 |     cache.zero_()
33 | 


--------------------------------------------------------------------------------
/triton_runner/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = '0.3.2'
 2 | 
 3 | # Refuse to import the package under an unsupported Triton version.
 4 | # NOTE(review): is_support_version is tested as a value, not called; presumably it is a
 5 | # module-level bool in version_utils. Confirm, since a function object is always truthy.
 6 | from .version_utils import is_support_version, triton_version
 7 | if not is_support_version:
 8 |     raise RuntimeError(f"Triton Runner doesn't support Triton v{triton_version}")
 9 | 
10 | 
11 | from .jit import jit
12 | from .version_utils import is_triton_geq_v3_4
13 | # autotune is only imported into the package namespace when is_triton_geq_v3_4 is truthy.
14 | if is_triton_geq_v3_4:
15 |     from .autotune import autotune
16 | from . import color_print
17 | from . import torch_utils
18 | import os
19 | 
20 | 
21 | def get_file_dir(file):
22 |     """Return the absolute path of the directory containing ``file``.
23 | 
24 |     Args:
25 |         file: A filesystem path; it is made absolute with ``os.path.abspath``
26 |             before its directory component is taken.
27 |     """
28 |     return os.path.dirname(os.path.abspath(file))
29 | 


--------------------------------------------------------------------------------
/examples/utils/generate_cmd.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | def list_files_rel(path):
 4 |     """Return the sorted paths, relative to ``path``, of every ``.py`` file under ``path``.
 5 | 
 6 |     Args:
 7 |         path: Directory that is walked recursively with ``os.walk``.
 8 | 
 9 |     Returns:
10 |         A sorted list of path strings, each relative to ``path``.
11 |     """
12 |     files = []
13 |     for root, _, filenames in os.walk(path):
14 |         for f in filenames:
15 |             if f.endswith('.py'):
16 |                 rel_path = os.path.relpath(os.path.join(root, f), path)
17 |                 files.append(rel_path)
18 |     return sorted(files)
19 | 
20 | # Print one "python examples/dump/python/<file>" command per script, with a
21 | # blank line whenever the directory differs from the previous file's directory.
22 | # The walked path is relative, so it resolves against the current working directory.
23 | old_dirname = ""
24 | for f in list_files_rel("examples/dump/python"):
25 |     dirname, filename = os.path.split(f)
26 |     # This also fires for the first file when it lives in a subdirectory (dirname != "").
27 |     if dirname != old_dirname:
28 |         print()
29 |     old_dirname = dirname
30 |     print("python", f"examples/dump/python/{f}")
31 | 


--------------------------------------------------------------------------------
/examples/runner/v3.1.0/llir/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "1597f8ffb198730637412019688ada11f0c656203824bbd6ca759d828658dc3c", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 1152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.1.0/ptx/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "1597f8ffb198730637412019688ada11f0c656203824bbd6ca759d828658dc3c", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 1152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # triton_runner builds
 2 | build/
 3 | build-*/
 4 | 
 5 | triton_runner*.egg-info/
 6 | 
 7 | # Pytest
 8 | pytest.ini
 9 | 
10 | # Python caches
11 | __pycache__/
12 | *.py[cod]
13 | .pytest_cache
14 | 
15 | # Environments
16 | .venv
17 | venv/
18 | venv.bak/
19 | 
20 | # VS Code project files
21 | .vscode
22 | .vs
23 | 
24 | # JetBrains project files
25 | .idea
26 | cmake-build-*
27 | 
28 | # clangd index. (".clangd" is a config file now, thus trailing slash)
29 | .clangd/
30 | .cache
31 | /compile_commands.json
32 | .vscode
33 | .vs
34 | 
35 | # Vim
36 | *.swp
37 | 
38 | # macOS
39 | .DS_Store
40 | 
41 | **/dump.ttir
42 | **/dump.ttgir
43 | triton_runner/README.md
44 | fused-attention-*
45 | results.html
46 | 


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": true, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 73728, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/__grp___bwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"child_paths": {"_bwd_kernel.ttir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ttir", "_bwd_kernel.ttgir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ttgir", "_bwd_kernel.llir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.llir", "_bwd_kernel.ptx": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ptx", "_bwd_kernel.cubin": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.cubin", "_bwd_kernel.json": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.json"}}


--------------------------------------------------------------------------------
/examples/runner/v3.1.0/ptx/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "64fbcec4e63cf8ac427398e09df770b785f88527f5ca375bac7547e503bd5dbf", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "triton_version": "3.1.0", "shared": 49152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/attn.py:
--------------------------------------------------------------------------------
 1 | # Higher shared_memory usage in Triton 3.3
 2 | # https://github.com/triton-lang/triton/issues/7268
 3 | 
 4 | from flash_attn_triton import flash_attn_func
 5 | import torch
 6 | 
 7 | # set seed
 8 | torch.random.manual_seed(0)
 9 | # Input dimensions: q, k and v are all created with shape [batch_size, seqlen, nheads, d]
10 | batch_size = 1
11 | nheads = 4
12 | d = 64
13 | seqlen = 16
14 | dtype = torch.bfloat16
15 | q = torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 5
16 | # k and v are drawn the same way as q but scaled by 3 instead of 5
17 | k, v = [
18 |     torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 3
19 |     for _ in range(2)
20 | ]
21 | # Track gradients on all three inputs before running the forward pass
22 | q.requires_grad_(True)
23 | k.requires_grad_(True)
24 | v.requires_grad_(True)
25 | out = flash_attn_func(q, k, v)
26 | # Backpropagate a random upstream gradient with the same shape as out
27 | g = torch.randn_like(out)
28 | out.backward(g)
29 | 


--------------------------------------------------------------------------------
/examples/runner/tlx/README.md:
--------------------------------------------------------------------------------
 1 | These examples use Triton v3.4.0 in TLX, at commit [9a7a23d](https://github.com/facebookexperimental/triton/commit/9a7a23d0cfa4ed4b37eb9b177b0e36beb254f9e6).
 2 | 
 3 | ### sm90 (H100, H200, H20, etc.)
 4 | ```shell
 5 | python examples/runner/tlx/v3.4.0/python/hopper-fa-ws-pipelined-pingpong.py
 6 | 
 7 | python examples/runner/tlx/v3.4.0/ttir/sm90/hopper-fa-ws-pipelined-pingpong.py
 8 | 
 9 | python examples/runner/tlx/v3.4.0/ttgir/sm90/hopper-fa-ws-pipelined-pingpong.py
10 | 
11 | python examples/runner/tlx/v3.4.0/llir/sm90/hopper-fa-ws-pipelined-pingpong.py
12 | 
13 | python examples/runner/tlx/v3.4.0/ptx/sm90/hopper-fa-ws-pipelined-pingpong.py
14 | 
15 | python examples/runner/tlx/v3.4.0/cubin/sm90/hopper-fa-ws-pipelined-pingpong.py
16 | ```
17 | 


--------------------------------------------------------------------------------
/examples/runner/v3.1.0/llir/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "64fbcec4e63cf8ac427398e09df770b785f88527f5ca375bac7547e503bd5dbf", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "triton_version": "3.1.0", "shared": 49152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/__grp___bwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"child_paths": {"_bwd_kernel.ttir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ttir", "_bwd_kernel.ttgir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ttgir", "_bwd_kernel.llir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.llir", "_bwd_kernel.ptx": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ptx", "_bwd_kernel.cubin": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.cubin", "_bwd_kernel.json": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.json"}}


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "triton-runner"
 3 | description = "Multi-Level Triton Runner supporting Python, IR, PTX, and cubin."
 4 | 
 5 | dependencies = [
 6 |     "triton>=3.0.0",
 7 |     "termcolor",
 8 | ]
 9 | 
10 | readme = "README.md"
11 | version = "0.3.2"
12 | authors = [
13 |     { name = "Bob Huang", email = "x@bobhuang.xyz" },
14 | ]
15 | license = "MIT"
16 | 
17 | [project.urls]
18 | repository = "https://github.com/toyaix/triton-runner"
19 | homepage = "https://triton-runner.org"
20 | 
21 | [build-system]
22 | requires = ["setuptools>=61.0"]
23 | build-backend = "setuptools.build_meta"
24 | 
25 | [tool.setuptools]
26 | include-package-data = true
27 | 
28 | [tool.setuptools.package-data]
29 | "triton_runner" = ["README.md"]
30 | 
31 | [tool.setuptools.packages.find]
32 | include = ["triton_runner*"]
33 | 


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/llir/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "b64e769422e5fa5399816ba4a980010ac79cb8b001625b23012ec3b0e699d40a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 1152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/llir/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "56eea00f4a1bf012c5cb9a8b7b95772212bed2037dea35c421ffd3be025a57f5", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 49152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/llir/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "a033e32e3e90ad81ddbdf656d192ce47a1b4f8153257ead819af5222682f522d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 49152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/ptx/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "b64e769422e5fa5399816ba4a980010ac79cb8b001625b23012ec3b0e699d40a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 1152, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "33df89de20b58a4369156e1e89b02907a0fd6bbcc78d60a666949ad8f14a7395", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 98304, "name": "_bwd_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.1.0_cache/__grp__matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"child_paths": {"matmul_kernel.ttir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ttir", "matmul_kernel.ttgir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ttgir", "matmul_kernel.llir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.llir", "matmul_kernel.ptx": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ptx", "matmul_kernel.cubin": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.cubin", "matmul_kernel.json": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.json"}}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/llir/sm90/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "1d3a3576962135e581ebf26145d4beba58e9ee12ff11c7934b9b69a9059ea670", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/ptx/sm90/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "1d3a3576962135e581ebf26145d4beba58e9ee12ff11c7934b9b69a9059ea670", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/triton_runner/torch_utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import math
 3 | import triton
 4 | 
 5 | 
 6 | def pad_2d_to_block_shape(tensor, block_shape):
 7 |     M, K = tensor.shape
 8 |     BLOCK_M, BLOCK_K = block_shape
 9 | 
10 |     pad_M = (BLOCK_M - M % BLOCK_M) % BLOCK_M
11 |     pad_K = (BLOCK_K - K % BLOCK_K) % BLOCK_K
12 | 
13 |     padded = torch.nn.functional.pad(tensor, (0, pad_K, 0, pad_M), value=0)
14 | 
15 |     return padded.to(torch.float32)
16 | 
17 | 
def get_pad_n_elements(tensor, block_shape):
    """Return the element count of ``tensor`` after rounding each dimension up to whole blocks.

    Dimensions of ``tensor.shape`` are paired with entries of ``block_shape``
    via ``zip``, so only as many dimensions as both sequences share are used.
    """
    padded_dims = []
    for dim, block in zip(tensor.shape, block_shape):
        n_blocks = triton.cdiv(dim, block)
        padded_dims.append(n_blocks * block)
    return math.prod(padded_dims)
20 | 
21 | 
def get_grid_dim(tensor_shape, block_shape):
    """Return, as a tuple, ``triton.cdiv(dim, block)`` for each paired dimension.

    ``tensor_shape`` and ``block_shape`` are walked in lockstep; iteration
    stops at the shorter of the two.
    """
    return tuple(map(triton.cdiv, tensor_shape, block_shape))
24 | 
25 | 
def get_n_elements_with_grid(block_shape, grid):
    """Return the product of all ``block_shape`` entries times the product of all ``grid`` entries."""
    elements_per_block = math.prod(block_shape)
    n_blocks = math.prod(grid)
    return elements_per_block * n_blocks
28 | 


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/cubin/sm90/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "e36faa33ba3c2c3f39005afa18658f6cd91bf57c877172c9167ad11d19490d2e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.4.0_cache/__grp__matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"child_paths": {"matmul_kernel.source": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.source", "matmul_kernel.ttir": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ttir", "matmul_kernel.ttgir": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ttgir", "matmul_kernel.llir": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.llir", "matmul_kernel.ptx": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ptx", "matmul_kernel.cubin": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.cubin", "matmul_kernel.json": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.json"}}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/test/test.py:
--------------------------------------------------------------------------------
 1 | from flash_attn_triton_test import _flash_attn_backward
 2 | import torch
 3 | import math
 4 | import triton
 5 | 
 6 | 
# Sizes used to build the input tensors below.
batch_size, nheads, d, seqlen = 1, 4, 64, 16
# Fix the RNG seed so the random inputs are the same on every run.
torch.random.manual_seed(0)

dtype = torch.bfloat16
# Five independent random bfloat16 tensors on the CUDA device, each of shape
# (batch_size, seqlen, nheads, d). They are drawn in this order from the seeded RNG.
q, k, v, o, do = [
    torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda")
    for _ in range(5)
]
# Round seqlen up to the next multiple of 128.
seqlen_q_rounded = math.ceil(seqlen / 128) * 128
# Uninitialized float32 buffer (torch.empty) of shape (batch_size, nheads, seqlen_q_rounded).
lse = torch.empty((batch_size, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
with torch.inference_mode():
    # Uninitialized buffers with the same shape/dtype/device as q, k and v,
    # passed as the last three arguments of the call below.
    dq = torch.empty_like(q)
    dk = torch.empty_like(k)
    dv = torch.empty_like(v)
    _flash_attn_backward(
        do,
        q,
        k,
        v,
        o,
        lse,
        dq,
        dk,
        dv
    )
    # Print min / max / mean / std of `do` after the call.
    # NOTE(review): the statistics are taken on `do` (an argument), not on
    # dq/dk/dv — presumably to observe whether the call changed it; confirm.
    print(do.min().item(), do.max().item(), do.mean().item(), do.std().item())
33 | 


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/ptx/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/ptx/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "d59a4408912cd758ab87d37f0ec619ab5015f47747ef3de78443c10d79d085bc", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/ptx/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "81ea21ee3c20a473bf522f802389aeffc623527513e25c396b8f3b42ed00a367", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/llir/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/llir/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "d59a4408912cd758ab87d37f0ec619ab5015f47747ef3de78443c10d79d085bc", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/llir/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "81ea21ee3c20a473bf522f802389aeffc623527513e25c396b8f3b42ed00a367", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/ptx/sm75/old_ptx/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/ttgir/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "runner-67362256efcd8d0f4d35108b946500eb0390f82c28fdcdb0ff16e44b441546fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "76c7c1fc39c029ba69c1dd0221706b3d17bb1d7e12a329ba8d7ea3ded71ced99", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm89", "triton_version": "3.3.0", "shared": 114688, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "_bwd_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"hash": "cd4d7c0aaebb8263c1198aea105fbb7fb1246ac0ef6221051d23f5cdcdb2c4ae", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm89", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 73728, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/high_usage-7268/fix/fix.py:
--------------------------------------------------------------------------------
 1 | from flash_attn_triton_runner import _flash_attn_backward
 2 | import torch
 3 | import math
 4 | 
# Fix the RNG seed so the random inputs are the same on every run.
torch.random.manual_seed(0)

# Sizes used to build the input tensors below.
batch_size, nheads, d, seqlen = 1, 4, 64, 16

dtype = torch.bfloat16
# Five independent random bfloat16 tensors on the CUDA device, each of shape
# (batch_size, seqlen, nheads, d). They are drawn in this order from the seeded RNG.
q, k, v, o, do = [
    torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda")
    for _ in range(5)
]
# Round seqlen up to the next multiple of 128.
seqlen_q_rounded = math.ceil(seqlen / 128) * 128
# Uninitialized float32 buffer (torch.empty) of shape (batch_size, nheads, seqlen_q_rounded).
lse = torch.empty((batch_size, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
with torch.inference_mode():
    # Uninitialized buffers with the same shape/dtype/device as q, k and v,
    # passed as the last three arguments of the call below.
    dq = torch.empty_like(q)
    dk = torch.empty_like(k)
    dv = torch.empty_like(v)
    _flash_attn_backward(
        do,
        q,
        k,
        v,
        o,
        lse,
        dq,
        dk,
        dv
    )
    # Print min / max / mean / std of `do` after the call.
    # NOTE(review): the statistics are taken on `do` (an argument), not on
    # dq/dk/dv — presumably to observe whether the call changed it; confirm.
    print(do.min().item(), do.max().item(), do.mean().item(), do.std().item())

# Report which Triton version is installed in this environment.
import triton
print(f"Triton version: {triton.__version__}")
35 | 


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/llir/sm120/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"hash": "368b77270b5889e568b3df78a951be3a033e2ae230a7b6af7fd3ea5346f25161", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.3.1", "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/llir/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"hash": "20e60420ec987a18bc4a1f739d4d837501eda754ee672163679f0b985736a80f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.3.1", "shared": 40984, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/ptx/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"hash": "20e60420ec987a18bc4a1f739d4d837501eda754ee672163679f0b985736a80f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.3.1", "shared": 40984, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2025-Present BobHuang
 3 | *
 4 | * Permission is hereby granted, free of charge, to any person obtaining
 5 | * a copy of this software and associated documentation files
 6 | * (the "Software"), to deal in the Software without restriction,
 7 | * including without limitation the rights to use, copy, modify, merge,
 8 | * publish, distribute, sublicense, and/or sell copies of the Software,
 9 | * and to permit persons to whom the Software is furnished to do so,
10 | * subject to the following conditions:
11 | *
12 | * The above copyright notice and this permission notice shall be
13 | * included in all copies or substantial portions of the Software.
14 | *
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | */
23 | 


--------------------------------------------------------------------------------
/triton_runner/version_utils.py:
--------------------------------------------------------------------------------
 1 | import triton
 2 | 
 3 | triton_version = triton.__version__
 4 | version_str = ".".join(triton_version.split('.')[:2])
 5 | 
 6 | is_support_version = version_str in ["3.6", "3.5", "3.4", "3.3", "3.2", "3.1", "3.0"]
 7 | 
 8 | is_triton_v3_6 = version_str == "3.6"
 9 | is_triton_v3_5 = version_str == "3.5"
10 | is_triton_v3_4 = version_str == "3.4"
11 | is_triton_v3_3 = version_str == "3.3"
12 | is_triton_v3_2 = version_str == "3.2"
13 | is_triton_v3_1 = version_str == "3.1"
14 | is_triton_v3_0 = version_str == "3.0"
15 | 
16 | is_triton_geq_v3_3 = version_str in ["3.3", "3.4", "3.5", "3.6"]
17 | is_triton_geq_v3_4 = version_str in ["3.4", "3.5", "3.6"]
18 | is_triton_geq_v3_5 = version_str in ["3.5", "3.6"]
19 | 
20 | is_triton_leq_v3_2 = version_str in ["3.2", "3.1", "3.0"]
21 | is_triton_leq_v3_1 = version_str in ["3.1", "3.0"]
22 | is_disable_multithreading = version_str in ["3.5", "3.4", "3.3", "3.2"]
23 | 
24 | if is_triton_v3_5:
25 |     uni_triton_version = "3.5.x"
26 | elif is_triton_v3_3:
27 |     uni_triton_version = "3.3.x"
28 | else:
29 |     uni_triton_version = triton_version
30 | 
31 | try:
32 |     import triton.language.extra.tlx as tlx
33 |     is_tlx = True
34 | except ImportError as e:
35 |     is_tlx = False
36 | 


--------------------------------------------------------------------------------
/triton_runner/color_print.py:
--------------------------------------------------------------------------------
 1 | import termcolor
 2 | import os
 3 | 
 4 | def blue_print(text):
 5 |     print(termcolor.colored(text, "blue"), flush=True)
 6 | 
 7 | def yellow_print(text):
 8 |     print(termcolor.colored(text, "yellow"), flush=True)
 9 | 
10 | 
def get_project_name():
    """Return the bracketed project tag used as a prefix in this module's messages."""
    project_tag = "[Triton Runner]"
    return project_tag
13 | 
def warning_dump_mode_ssa_and_op(ssa, op, loc, size, encoding):
    """Print a blue dump-mode notice describing the given ssa value, op, location and size.

    When ``encoding`` is a non-empty string, its first two characters are
    dropped and the remainder is appended as `` with encoding=...``; an empty
    ``encoding`` adds nothing.
    """
    if encoding == "":
        encoding_suffix = ""
    else:
        # Skip the two leading characters of the encoding text.
        # NOTE(review): presumably a fixed two-character prefix — confirm with callers.
        encoding_suffix = f" with encoding={encoding[2:]}"
    message = f"{get_project_name()} In dump mode, ssa={ssa}, op={op}, loc={loc}, size={size}{encoding_suffix}"
    blue_print(message)
17 | 
def warning_size_not_supported(ssa, op, loc, size):
    """Print a yellow warning that ``size`` is not supported, naming the ssa value, op and location."""
    message = f"{get_project_name()} Warning: size={size} is not supported. And ssa={ssa}, op={op}, loc={loc}"
    yellow_print(message)
20 | 
def print_triton_cache_dir(metadata_path, cache_hit=False):
    """Print (in blue) the directory that holds a kernel's cached artifacts.

    Nothing is printed when the ``RUNNER_PROD`` environment variable is set
    to ``"1"``.

    Args:
        metadata_path: Path to the kernel's metadata file; only its parent
            directory is shown in the message.
        cache_hit: When true, the message additionally states that the
            kernel was a cache hit.
    """
    if os.environ.get("RUNNER_PROD", "0") == "1":
        return
    # The message previously read "cache hint"; the flag reports a cache *hit*.
    cache_hit_text = " cache hit and" if cache_hit else ""
    blue_print(f"{get_project_name()} Triton kernel{cache_hit_text} saved at {os.path.dirname(metadata_path)}")
25 | 
def check_dump_tensor_dtype(dump_tensor):
    """Print a yellow warning when ``dump_tensor`` is not a ``torch.float32`` tensor.

    Tensors that already have dtype ``torch.float32`` produce no output.
    """
    # torch is imported lazily so importing this module does not require it.
    import torch

    if dump_tensor.dtype == torch.float32:
        return
    yellow_print(f"Warning: tensor dtype is {dump_tensor.dtype}, not torch.float32!")
30 | 


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/llir/sm120/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "0cb1ad741e9b1a99354522c9b7098bdf3beea9902c9c8ffe195c9433c74ba324", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/llir/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "81bfd88df3fd5c7021b844a5eb534121362b518dd707dd5853f58e08ee3ea8ec", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/ptx/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "81bfd88df3fd5c7021b844a5eb534121362b518dd707dd5853f58e08ee3ea8ec", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/ptx/sm120/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "7d5c3a22e090a10bfd2a615e080f36adcdfe485fc514fc5af248f382ae86bbfa", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "7d5c3a22e090a10bfd2a615e080f36adcdfe485fc514fc5af248f382ae86bbfa", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "cda82ff2530be72686a807dd5c393106273474af4950c0b73b81fc51f3ab64b8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/doc/solving_triton_issues/README.md:
--------------------------------------------------------------------------------
 1 | ## [Triton 3.3 Performance Regression on Small Gemms](https://github.com/triton-lang/triton/issues/7096)
 2 | 
 3 | Reproducer in [performance-7096/test.py](./performance-7096/test.py) with Triton v3.4.0 [testing.do_bench](https://github.com/triton-lang/triton/blob/v3.4.0/python/triton/testing.py)
 4 | 
 5 | ```
 6 | GPU: NVIDIA GeForce RTX 4090
 7 | Triton version: 3.1.0
 8 | 512x512: 0.0124ms
 9 | 1024x1024: 0.0210ms
10 | 1536x1536: 0.0673ms
11 | 2048x2048: 0.1181ms
12 | 4096x4096: 0.8580ms
13 | ```
14 | 
15 | ```
16 | GPU: NVIDIA GeForce RTX 4090
17 | Triton version: 3.4.0
18 | 512x512: 0.0137ms
19 | 1024x1024: 0.0225ms
20 | 1536x1536: 0.0711ms
21 | 2048x2048: 0.1222ms
22 | 4096x4096: 0.8852ms
23 | ```
24 | 
25 | Fix: use the cubin with triton_runner, see [fix.py:67](./performance-7096/fix.py#L67).
26 | 
27 | ## [Higher shared_memory usage in Triton 3.3](https://github.com/triton-lang/triton/issues/7268)
28 | 
29 | Reproducer on NVIDIA GeForce RTX 4090
30 | 
31 | [high_usage-7268/v3.2.0_cache/_bwd_kernel.json](./high_usage-7268/v3.2.0_cache/_bwd_kernel.json) has `"shared": 98304`, while [high_usage-7268/v3.3.0_cache/_bwd_kernel.json](./high_usage-7268/v3.3.0_cache/_bwd_kernel.json) has `"shared": 114688`.
32 | 
33 | Fix: use the cubin with triton_runner, see [flash_attn_triton_runner.py:152](./high_usage-7268/fix/flash_attn_triton_runner.py#L152).
34 | 


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/llir/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/ptx/sm75/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.0.0/README.md:
--------------------------------------------------------------------------------
 1 | ### sm90 (H100, H200, H20, etc.)
 2 | ```shell
 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
 4 | 
 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
 6 | ```
 7 | 
 8 | ### sm80 (A100, A30)
 9 | ```shell
10 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
11 | 
12 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
13 | 
14 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py
15 | ```
16 | 
17 | ### sm120 (RTX PRO 6000, RTX 5090, etc.)
18 | 
19 | **not supported**
20 | 
21 | ### sm86 (A10, RTX 3090, etc.)
22 | ```shell
23 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
24 | 
25 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
26 | 
27 | python examples/runner/v3.1.0/ttgir/sm86/matmul-with-dot-v2.py
28 | 
29 | python examples/runner/v3.1.0/llir/sm86/matmul-with-dot-v2.py
30 | 
31 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py
32 | ```
33 | 
34 | ### sm75 (T4, RTX 2080, etc.)
35 | 
36 | ```shell
37 | python examples/runner/v3.5.x/python/matmul.py
38 | 
39 | python examples/runner/v3.5.x/ttir/matmul/matmul.py
40 | 
41 | python examples/runner/v3.1.0/ttgir/sm75/matmul.py
42 | 
43 | python examples/runner/v3.1.0/llir/sm75/matmul.py
44 | 
45 | python examples/runner/v3.2.0/ptx/sm75/matmul.py
46 | 
47 | python examples/runner/v3.4.0/cubin/sm75/matmul.py
48 | ```


--------------------------------------------------------------------------------
/examples/runner/v3.1.0/README.md:
--------------------------------------------------------------------------------
 1 | ### sm90 (H100, H200, H20, etc.)
 2 | ```shell
 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
 4 | 
 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
 6 | ```
 7 | 
 8 | ### sm80 (A100, A30)
 9 | ```shell
10 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
11 | 
12 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
13 | 
14 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py
15 | ```
16 | 
17 | ### sm120 (RTX PRO 6000, RTX 5090, etc.)
18 | 
19 | **not supported**
20 | 
21 | ### sm86 (A10, RTX 3090, etc.)
22 | ```shell
23 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
24 | 
25 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
26 | 
27 | python examples/runner/v3.1.0/ttgir/sm86/matmul-with-dot-v2.py
28 | 
29 | python examples/runner/v3.1.0/llir/sm86/matmul-with-dot-v2.py
30 | 
31 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py
32 | ```
33 | 
34 | ### sm75 (T4, RTX 2080, etc.)
35 | 
36 | ```shell
37 | python examples/runner/v3.5.x/python/matmul.py
38 | 
39 | python examples/runner/v3.5.x/ttir/matmul/matmul.py
40 | 
41 | python examples/runner/v3.1.0/ttgir/sm75/matmul.py
42 | 
43 | python examples/runner/v3.1.0/llir/sm75/matmul.py
44 | 
45 | python examples/runner/v3.2.0/ptx/sm75/matmul.py
46 | 
47 | python examples/runner/v3.4.0/cubin/sm75/matmul.py
48 | ```


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/llir/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/llir/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/ptx/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/ptx/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/cubin/test_kda_gate_single.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | import torch.nn.functional as F
 5 | 
 6 | from fla.ops.kda.gate import kda_gate_ref
 7 | from gate import fused_kda_gate
 8 | from fla.utils import assert_close, device
 9 | 
10 | # NOTE(review): this rebinding replaces the `device` imported from fla.utils above.
11 | device = "cuda" if torch.cuda.is_available() else "cpu"
12 | 
13 | def test_kda_gate_single():
14 |     """Run single configuration of kda gate test (B=1, T=2, H=2, D=12, use_bias=False)"""
15 |     B, T, H, D, use_bias = (1, 2, 2, 12, False)
16 | 
17 |     print(f"Running test for B={B}, T={T}, H={H}, D={D}, use_bias={use_bias}")
18 | 
19 |     # torch.manual_seed(42)
20 | 
21 |     g = torch.randn(B, T, H * D, dtype=torch.float32)
22 |     g = g * 30
23 |     A = torch.log(torch.randn(1, 1, H, 1, dtype=torch.float32).uniform_(1, 16))
24 |     g_bias = torch.randn(H * D, dtype=torch.float32) if use_bias else None
25 | 
26 |     g, A = map(lambda x: x.to(device).requires_grad_(True), (g, A))
27 |     if g_bias is not None:
28 |         g_bias = g_bias.to(device).requires_grad_(True)
29 | 
30 |     ref = kda_gate_ref(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None)
31 |     tri = fused_kda_gate(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None)
32 | 
33 |     assert_close('o', ref, tri, 1e-4)
34 | 
35 |     print("✅ Test passed for single configuration!")
36 | 
37 | 
38 | if __name__ == "__main__":  # run the single-configuration test when executed as a script
39 |     test_kda_gate_single()
40 | 


--------------------------------------------------------------------------------
/examples/autotune/python/test_kda_gate_single.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | import torch.nn.functional as F
 5 | 
 6 | from fla.ops.kda.gate import kda_gate_ref
 7 | from gate import fused_kda_gate
 8 | from fla.utils import assert_close, device
 9 | 
10 | # NOTE(review): this rebinding replaces the `device` imported from fla.utils above.
11 | device = "cuda" if torch.cuda.is_available() else "cpu"
12 | 
13 | def test_kda_gate_single():
14 |     """Run single configuration of kda gate test (B=1, T=2, H=2, D=12, use_bias=False)"""
15 |     B, T, H, D, use_bias = (1, 2, 2, 12, False)
16 | 
17 |     print(f"Running test for B={B}, T={T}, H={H}, D={D}, use_bias={use_bias}")
18 | 
19 |     # torch.manual_seed(42)
20 | 
21 |     g = torch.randn(B, T, H * D, dtype=torch.float32)
22 |     g = g * 30
23 |     A = torch.log(torch.randn(1, 1, H, 1, dtype=torch.float32).uniform_(1, 16))
24 |     g_bias = torch.randn(H * D, dtype=torch.float32) if use_bias else None
25 | 
26 |     g, A = map(lambda x: x.to(device).requires_grad_(True), (g, A))
27 |     if g_bias is not None:
28 |         g_bias = g_bias.to(device).requires_grad_(True)
29 | 
30 |     ref = kda_gate_ref(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None)
31 |     tri = fused_kda_gate(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None)
32 | 
33 |     assert_close('o', ref, tri, 1e-4)
34 | 
35 |     print("✅ Test passed for single configuration!")
36 | 
37 | 
38 | if __name__ == "__main__":  # run the single-configuration test when executed as a script
39 |     test_kda_gate_single()
40 | 


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm80/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.4.0/cubin/sm86/matmul_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"}


--------------------------------------------------------------------------------
/examples/runner/v3.5.x/llir/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), ('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.5.x/ptx/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), ('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), ('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"}


--------------------------------------------------------------------------------
/examples/runner/tlx/v3.4.0/ptx/sm90/_attn_fwd_ws_pipelined_pingpong.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"}


--------------------------------------------------------------------------------
/examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"}


--------------------------------------------------------------------------------
/examples/runner/tlx/v3.4.0/llir/sm90/_attn_fwd_ws_pipelined_pingpong.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"}


--------------------------------------------------------------------------------
/examples/runner/tlx/v3.4.0/ttgir/sm90/_attn_fwd_ws_pipelined_pingpong.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"}


--------------------------------------------------------------------------------
/examples/native_kernel/dump/02-matrix_transpose.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
 6 | 
 7 | @triton_runner.jit
 8 | def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
 9 |     row_index = tl.program_id(axis=0)
10 |     col_index = tl.program_id(axis=1)
11 |     offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
12 |     offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
13 |     old_offs = offs_row[:, None] * cols + offs_col[None, :]
14 |     mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
15 |     block = tl.load(input_ptr + old_offs, mask=mask)
16 |     transposed_block = tl.trans(block)
17 |     new_block = offs_col[:, None] * rows + offs_row[None, :]
18 |     tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
19 | 
20 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch ``matrix_transpose_kernel`` on ``input``, writing into ``output``.

    The launch grid has one program per BLOCK_SIZE x BLOCK_SIZE tile:
    ``cdiv(rows, BLOCK_SIZE)`` programs on axis 0 and
    ``cdiv(cols, BLOCK_SIZE)`` on axis 1. BLOCK_SIZE is fixed to 64.
    """
    def grid(meta):
        # Grid size is derived from the BLOCK_SIZE the kernel is launched with.
        tile = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, tile), triton.cdiv(cols, tile))

    matrix_transpose_kernel[grid](input, output, rows, cols, BLOCK_SIZE=64)
29 | 
if __name__ == "__main__":
    # Smoke test: transpose a 104 x 78 random matrix and compare with torch.
    n_rows, n_cols = 104, 78
    source = torch.randn((n_rows, n_cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, n_rows, n_cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
40 | 


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e42a19208be12dff1ee5f279ac32754ca4166cb9f6a5edaa3ce50ffe05c022bd", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f8025fdad9f52aabe4f903de298106ed83df5423c52e60d0e9b7414b0114c2d6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "fb254767b2cdf5a3c928b289ea0bd9bee8ab029f58b15bacb83f431d0c9ef60c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "066caec7c6a79aaa01cb20c8361df597cd2a4e1d73400c247fa9a2b3ab042254", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "08c23a04a6215ed07dc2fc98e8dd35c6d880345d52878cf69f7463bb320f32fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1763b56e1f55a28af64f200baec643f3ea7f01219f48972e89e342d9176e2841", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1a9346fa6eb57757d5f3858cabaa91470d78f9b98b13eb0be6f4fb6582a4bc9d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "232771fd85fd7ae6fdc7b547626d61ddcb0aebff27f022399d964cd74eead699", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "23c63ade9a05cab49686f11b793e2fd24b694e2820f135dbf06741002e5b1f0f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "283d60010bf893ecaacb484ff02fed328c6ee8e92a06fa9908516c6f4f63ebe4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3916a80db1a1f0aa8ed4ba5f392c35b1aff91440797fea18907f86fe2e7a720e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3a16984868f82f5f99f6c7433389ea5d957403c12127dfce1db7e9cd8c30478a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "47ce4054b495f1e7d1edba1ba4a09db4215d170c0dc629470f13b341128cd9a0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "43c62b9256a3ba6ee6270682f7fda6646ef96a0cb0ed182f0b00da3e0065e186", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "48c3f4829bb140a509111320a3642c0e2dcb06c66c8dfe4e97ebc693393a12dd", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "52ecc569e2bf766edfe07e5f21b0750edbf987ada2a2849675e443191f6c7361", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "59e53a49fe22a81e10fe1faafa6ad732e3f45b7114250ed96d632dd01beb00ff", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "5cc88183e7ee4f0caf918e9c4584a9346d381a91ca4d92f3efe0afb7e8079895", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "8b92934ac01478ae9b6eb616ee03d0f40c067476e04fa6c0ea11b643e6ccc3ed", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "970f0c4668d8b4ecc4277b587a15727be5794127ce539b6b95e4c539a29c4cbc", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b1b90845c4b94c2ced78a6620448715983e0c4c6c7232e772d464758cecb89e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b2bcc55dd6a83131306e84faf514c7de045dbc4aef0e3754b6a3c826c335ab0c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "c68a935bb3b917ec11770bb97e0502f3dfc328302e481df1aac76edb45a34669", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "c8180afd34b4c6fed23432c36b2b3e12a6072387fbd8a9dc3cc8863b1c8dcb28", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "eea0160b818cfb03aa82075379d7a9b076ecfc628e4357ba9cfa0db0d09863b8", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e957f424aca6330304b7afbfae56ec0e8fcec67c2531551c7fba7e96b959d4e2", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1179bb3445ee18a154577cc6c44681f5046770790e9508a52aaf95f5fb25b5be", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1f1c3e645b6137461cb975a587a6a2f72b8bc39d0c299fb7f90c07dc6b02847a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3c2120d5946780ea6d02214d42041b221845111950f0bd7f3af49fe0071d166f", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "637fc9b4ba5ddc71c3e9b1e41b86e3261a43d6ed6c318c5388fd137366bdf966", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "a3a13b81ab1229f18d76db810a71060382c03948f75eef74667f5ae237970529", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "ac560961453280a1fa5a589b62897bf007dd5c61d42fe29c4ace97864e6e449e", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "da7ef4139b1833f8d12cfab61ac23767a80186b4ebd67a36c59b9e4c345fe7ac", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e6c513ebf22dd1c5e4a2e7120eca642434bf3031b2378005cb6a4ff16f46dff6", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e77d7f44419e31a125664c7a3b37708248e993c54d48adbd298aa5b1d3b6c91d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f1af1a9f38af9302ff13b64194c3e0cc82d7d1fb0d772899a949b923a2e920a2", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f53f4f9dbef4cc18c5b0135c61dda780d5efa1ee8d8cfa0128cf0c9ae0fb3b7d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "fd5ee1c2b8f006b30fa0e3c0ea171b40fc533e88ee9795c9b1c01cac919b326e", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1e41b55f63d9a267aac1008db19bcbf1a95fc2b50419ea8265d7ed5050f9efbb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "49fb5448ceee26de0f71a991874983bd79a2324ecb96446f878aa8b5f84d1141", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "4dcd81a28865ed3bf2146d3fae72ea5527f93ca7e70f577005a462a139869fb7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "557cd3020c7693d1b67443020b70d7ce912cd95e2133f60f41d4dca0b1bdeafb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "76c31c5cc2840b3a26b8be3d7eec6eeb1356c7a9fa754fba9443f5244295a5e8", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "92a11b158a60e56cb9c15efbfc2a3fa8341453e6e47ef13ec4f3f2691560b608", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "99ea7db4807236e73135c26d30e5db39e98e4bef17b50aecd026c5360b957fe7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "9a5f296c49a34aa9e1d5d04c2e52c15fda21cb45d6742a838ccf5d01c7165e4b", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "adb6795052f06d029f2a0ccc412ffea2d646659cd0d8450e26f67cf7d32e95c7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.json:
--------------------------------------------------------------------------------
1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b7cbc3e1cca35fc203a6ed5e07cf0f2bfdcbdc4a81f5b0e10a45887706ef17b2", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"}


--------------------------------------------------------------------------------
/triton_runner/bench/launch_latency/kernels.py:
--------------------------------------------------------------------------------
 1 | # This file is adapted from:
 2 | # https://github.com/pytorch-labs/tritonbench/blob/main/tritonbench/operators/launch_latency/kernels.py
 3 | #
 4 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 5 | #
 6 | # This source code is licensed under the BSD-style license found in the
 7 | # LICENSE file in the root directory of this source tree.
 8 | 
 9 | import torch
10 | import triton
11 | 
12 | import triton.language as tl
13 | 
14 | import triton_runner
15 | 
# Empty kernel with no parameters, compiled with the stock `triton.jit`.
# NOTE(review): presumably the baseline for measuring bare launch overhead,
# given this file lives under bench/launch_latency -- confirm against callers.
@triton.jit
def nop_kernel():
    pass
19 | 
20 | 
# Empty kernel compiled with the stock `triton.jit` that declares 14 runtime
# parameters (t1-t5, i1-i9) and 5 constexpr parameters (c1-c5) but never uses
# them.
# NOTE(review): the t*/i* names suggest tensors and integers respectively;
# the actual argument types are decided by the caller -- confirm there.
@triton.jit
def nop_with_args_kernel(
    t1,
    t2,
    t3,
    t4,
    t5,
    i1,
    i2,
    i3,
    i4,
    i5,
    i6,
    i7,
    i8,
    i9,
    c1: tl.constexpr,
    c2: tl.constexpr,
    c3: tl.constexpr,
    c4: tl.constexpr,
    c5: tl.constexpr,
):
    pass
44 | 
45 | 
# Same empty, parameterless kernel as `nop_kernel`, but compiled through
# `triton_runner.jit` instead of `triton.jit`.
@triton_runner.jit
def runner_nop_kernel():
    pass
49 | 
50 | 
# Same signature and empty body as `nop_with_args_kernel` (14 runtime
# parameters, 5 constexpr parameters, none of them used), but compiled through
# `triton_runner.jit` instead of `triton.jit`.
@triton_runner.jit
def runner_nop_with_args_kernel(
    t1,
    t2,
    t3,
    t4,
    t5,
    i1,
    i2,
    i3,
    i4,
    i5,
    i6,
    i7,
    i8,
    i9,
    c1: tl.constexpr,
    c2: tl.constexpr,
    c3: tl.constexpr,
    c4: tl.constexpr,
    c5: tl.constexpr,
):
    pass
74 | 
75 | 
def get_trivial_add_kernel():
    """Build and return a `torch.compile`-wrapped trivial addition function.

    The returned callable accepts any number of positional arguments and
    returns their sum plus a freshly created CUDA scalar tensor holding 1.0.
    """

    def trivial_add_kernel(*args):
        one = torch.tensor(1.0, device="cuda")
        return sum([one, *args])

    return torch.compile(trivial_add_kernel)
82 | 


--------------------------------------------------------------------------------
/triton_runner/check_utils.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | import triton
 3 | from .color_print import get_project_name
 4 | 
 5 | _metadata = {}
 6 | 
 7 | 
 8 | def colored_warning(message, category, filename, lineno, file=None, line=None):
 9 |     if file is None:
10 |         import sys
11 |         file = sys.stderr
12 |     formatted = f"\033[1m\033[93m{category.__name__}: {message} ({filename}:{lineno})\033[0m\n"
13 |     file.write(formatted)
14 | 
15 | 
16 | warnings.showwarning = colored_warning
17 | 
18 | 
def check_kernel_name(kernel_name):
    """Warn when `kernel_name` differs from the `name` entry of the loaded metadata.

    Raises:
        KeyError: if the module-level metadata has no `name` entry.
    """
    expected_name = _metadata['name']
    if expected_name == kernel_name:
        return
    warnings.warn(f"{get_project_name()} This kernel name {kernel_name} is different with metadata {_metadata['name']}")
22 | 
23 | 
def check_triton_version():
    """Warn when the metadata's `triton_version` differs from the installed one.

    A missing or empty `triton_version` entry in the metadata skips the check.
    """
    kernel_version = _metadata.get('triton_version', '')
    # Imported lazily, inside the function, as in the rest of this module.
    from .version_utils import triton_version as installed_version
    if not kernel_version or kernel_version == installed_version:
        return
    warnings.warn(f"{get_project_name()} This kernel Triton v{kernel_version} is different with intstalled v{installed_version}")
30 | 
31 | 
def check_cuda_arch_with_capability(kernel_arch, target_arch):
    """Warn when the kernel's capability and the device's capability differ.

    Args:
        kernel_arch: capability the kernel was built for.
        target_arch: capability of the device the kernel is about to run on.
    """
    if kernel_arch == target_arch:
        return
    warnings.warn(f"{get_project_name()} This kernel capability={kernel_arch} is different with device capability={target_arch}")
35 | 
36 | 
def check_cuda_arch(target):
    """Warn when the metadata's target arch differs from `target.arch`.

    Args:
        target: object exposing an `arch` attribute for the current device.

    Raises:
        KeyError: if the module-level metadata has no `target` / `arch` entry.
    """
    kernel_arch = _metadata["target"]["arch"]
    # The helper takes (kernel_arch, target_arch); passing them the other way
    # round made the warning report the two capabilities swapped.
    check_cuda_arch_with_capability(kernel_arch, target.arch)
40 | 
41 | 
def runner_check_triton(kernel_name, metadata, target):
    """Install `metadata` as the module-level metadata and run all checks.

    The checks run in a fixed order: kernel name, Triton version, CUDA arch.
    Each one only emits a warning on mismatch.
    """
    global _metadata
    _metadata = metadata

    check_kernel_name(kernel_name)
    check_triton_version()
    check_cuda_arch(target)
48 | 


--------------------------------------------------------------------------------
/examples/runner/v3.2.0/README.md:
--------------------------------------------------------------------------------
 1 | ### sm90 (H100, H200, H20, etc.)
 2 | ```shell
 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
 4 | 
 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
 6 | 
 7 | python examples/runner/v3.2.0/ttgir/sm90/matmul-with-dot-v2.py
 8 | 
 9 | python examples/runner/v3.2.0/llir/sm90/matmul-with-dot-v2.py
10 | 
11 | python examples/runner/v3.2.0/ptx/sm90/matmul-with-dot-v2.py
12 | 
13 | python examples/runner/v3.2.0/cubin/sm90/matmul-with-dot-v2.py
14 | ```
15 | 
16 | ### sm80 (A100, A30)
17 | ```shell
18 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
19 | 
20 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
21 | 
22 | python examples/runner/v3.2.0/ttgir/sm80/matmul-with-dot-v2.py
23 | 
24 | python examples/runner/v3.2.0/llir/sm80/matmul-with-dot-v2.py
25 | 
26 | python examples/runner/v3.3.x/ptx/sm80/matmul-with-dot-v2.py
27 | ```
28 | 
29 | ### sm120 (RTX PRO 6000, RTX 5090, etc.)
30 | 
31 | **not supported**
32 | 
33 | ### sm86 (A10, RTX 3090, etc.)
34 | ```shell
35 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
36 | 
37 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
38 | 
39 | python examples/runner/v3.2.0/ttgir/sm86/matmul-with-dot-v2.py
40 | 
41 | python examples/runner/v3.2.0/llir/sm86/matmul-with-dot-v2.py
42 | 
43 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py
44 | 
45 | python examples/runner/v3.4.0/cubin/sm86/matmul-with-dot-v2.py
46 | ```
47 | 
48 | ### sm75 (T4, RTX 2080, etc.)
49 | ```shell
50 | python examples/runner/v3.5.x/python/matmul.py
51 | 
52 | python examples/runner/v3.5.x/ttir/matmul/matmul.py
53 | 
54 | python examples/runner/v3.2.0/ttgir/sm75/matmul.py
55 | 
56 | python examples/runner/v3.2.0/llir/sm75/matmul.py
57 | 
58 | python examples/runner/v3.2.0/ptx/sm75/matmul.py
59 | 
60 | python examples/runner/v3.4.0/cubin/sm75/matmul.py
61 | ```


--------------------------------------------------------------------------------
/examples/native_kernel/dump/04-softmax.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
 6 | 
# Softmax over a 1-D vector of N elements, read and written through float32
# pointers. Each program writes one BLOCK_SIZE-wide slice of the output, but
# first scans the whole input itself (both loops below are independent of the
# program id) to obtain the global maximum and the global sum of exponentials.
@triton_runner.jit
def softmax_kernel(
    input_ptr, output_ptr,
    N,
    BLOCK_SIZE: tl.constexpr
):
    # Reinterpret both pointers as float32 pointers.
    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
    # Pass 1: lane-wise running maximum over all chunks, starting from -inf.
    _max = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - float("inf")
    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
        # Out-of-range lanes read -inf so they can never win the maximum.
        a = tl.load(input_ptr + cols, mask=cols < N, other=-float("inf"))
        _max = tl.maximum(a, _max)
    max = tl.max(_max, axis=0)  # collapse the lanes into one scalar maximum
    # Pass 2: lane-wise sum of exp(x - max); masked lanes add exp(-inf) = 0.
    _sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
        a = tl.load(input_ptr + cols, mask=cols < N, other=-float("inf"))
        _sum += tl.exp(a - max)
    sum = tl.sum(_sum, axis=0)  # collapse the lanes into one scalar denominator
    # Final step: normalize only the slice owned by this program.
    pid = tl.program_id(0)
    offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offset < N
    x = tl.load(input_ptr + offset, mask=mask)
    exp_shifted = tl.exp(x - max)
    normalize_by_sum =  exp_shifted / sum
    tl.store(output_ptr + offset, normalize_by_sum, mask=mask)
34 | 
35 | 
def solve(input: torch.Tensor, output: torch.Tensor, N: int):
    """Launch `softmax_kernel` over `input`, writing the result into `output`.

    One program is launched per BLOCK_SIZE (4096) elements of the N inputs.
    """

    def grid(META):
        return (triton.cdiv(N, META['BLOCK_SIZE']), )

    softmax_kernel[grid](input, output, N, BLOCK_SIZE=4096)
42 | 
if __name__ == "__main__":
    # Compare the Triton kernel against torch.softmax on random CUDA data.
    N = 100000
    data = torch.randn(N, device='cuda')
    expected = torch.softmax(data, 0)
    actual = torch.empty(expected.shape, device='cuda')
    solve(data, actual, N)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
53 | 


--------------------------------------------------------------------------------
/examples/native_kernel/dump/05-softmax_lse.py:
--------------------------------------------------------------------------------
 1 | # softmax use log_sum_exp
 2 | import torch
 3 | import triton
 4 | import triton.language as tl
 5 | import triton_runner
 6 | 
 7 | 
# Softmax over a 1-D vector of N elements, read and written through float32
# pointers, using a single streaming pass: a running maximum and a running sum
# of exponentials (kept relative to that maximum) are updated chunk by chunk.
# Every program scans the whole input (the loop is independent of the program
# id) and then writes only its own BLOCK_SIZE-wide slice of the output.
@triton_runner.jit
def softmax_kernel(
    input_ptr, output_ptr,
    N,
    BLOCK_SIZE: tl.constexpr
):
    # Reinterpret both pointers as float32 pointers.
    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
    # Running maximum (starts at -inf) and running sum of exponentials.
    max_acc = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - float("inf")
    log_acc = tl.zeros([BLOCK_SIZE], dtype=tl.float32)

    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
        # Out-of-range lanes read -inf: they cannot raise the max and add exp(-inf) = 0.
        a = tl.load(input_ptr + cols, mask=cols < N, other=-float("inf"))
        block_max = tl.max(a, axis=0)
        # New running maximum: the larger of the previous one and this chunk's.
        max_acc_new = tl.where(max_acc > block_max, max_acc, block_max)

        raw_exp = tl.math.exp(a - max_acc_new)

        # Rescale the old sum from the old maximum to the new one, then add this chunk.
        log_acc_new = tl.math.exp(max_acc - max_acc_new) * log_acc + tl.sum(raw_exp, axis=-1)

        log_acc = log_acc_new
        max_acc = max_acc_new

    # Final step: normalize only the slice owned by this program.
    pid = tl.program_id(0)
    offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offset < N
    x = tl.load(input_ptr + offset, mask=mask)
    o = tl.math.exp(x - max_acc) / log_acc
    tl.store(output_ptr + offset, o, mask=mask)
38 | 
39 | 
def solve(input: torch.Tensor, output: torch.Tensor, N: int):
    """Launch the streaming `softmax_kernel` over `input`, writing into `output`.

    One program is launched per BLOCK_SIZE (4096) elements of the N inputs.
    """

    def grid(META):
        return (triton.cdiv(N, META['BLOCK_SIZE']), )

    softmax_kernel[grid](input, output, N, BLOCK_SIZE=4096)
46 | 
if __name__ == "__main__":
    # Compare the Triton kernel against torch.softmax on random CUDA data.
    N = 100000
    data = torch.randn(N, device='cuda')
    expected = torch.softmax(data, 0)
    actual = torch.empty(expected.shape, device='cuda')
    solve(data, actual, N)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
57 | 


--------------------------------------------------------------------------------
/examples/test.py:
--------------------------------------------------------------------------------
 1 | import triton
 2 | import torch
 3 | import os
 4 | import re
 5 | import subprocess
 6 | import triton_runner
 7 | 
 8 | from triton_runner.version_utils import triton_version, uni_triton_version
 9 | 
def get_content(file_path):
    """Return the whole text content of `file_path`.

    The file is opened in text mode and closed before returning, instead of
    leaving the handle for the garbage collector to close.
    """
    with open(file_path, "r") as f:
        return f.read()
12 | 
def get_lines(match):
    """Return the non-blank lines of the match's first group, each stripped."""
    stripped = (raw.strip() for raw in match.group(1).strip().splitlines())
    return [text for text in stripped if text]
15 | 
device = torch.cuda.current_device()
capability = torch.cuda.get_device_capability(device)
# Fold (major, minor) into the number used in the "smXY" headings, e.g. (7, 5) -> 75.
capability = capability[0] * 10 + capability[1]

# Take the shell block that follows the "### sm<capability>" heading in the
# README under examples/runner/v<uni_triton_version>.
pattern = re.compile(rf"### sm{capability}.*?shell(.*?)```", re.DOTALL)
runner_file_path = os.path.join("examples", "runner", f"v{uni_triton_version}", "README.md")
match = pattern.search(get_content(runner_file_path))
if match:
    lines = get_lines(match)
    # From here on, match any shell block regardless of its heading.
    pattern = re.compile(r"shell(.*?)```", re.DOTALL)
    bench_file_path = os.path.join("doc", "benchmark.md")
    lines.extend(get_lines(pattern.search(get_content(bench_file_path))))
    from triton_runner.version_utils import is_triton_geq_v3_3
    if is_triton_geq_v3_3:
        debug_file_path = os.path.join("doc", "dump.md")
        # Collect the commands of every shell block in the dump documentation.
        for m in pattern.finditer(get_content(debug_file_path)):
            lines.extend(get_lines(m))
    triton_runner.color_print.yellow_print(f"TEST on triton v{triton_version}")
    fail_cmd = []
    for cmd in lines:
        triton_runner.color_print.blue_print(cmd)
        # Commands come from the repository's own documentation and are run as written.
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        print("stdout:", result.stdout)
        print("return code:", result.returncode)
        if result.returncode:
            # stderr is captured above; print it here or the failure cause is lost.
            print("stderr:", result.stderr)
            fail_cmd.append(cmd)
    if not fail_cmd:
        print(f"✅ ALL TEST PASS on triton v{triton_version}")
    else:
        triton_runner.color_print.yellow_print(f"❌ SOME TEST FAIL on triton v{triton_version}")
        print("\n".join(fail_cmd))
else:
    print(f"sm{capability} on triton v{triton.__version__} not found")
49 | 


--------------------------------------------------------------------------------
/triton_runner/bench/utils.py:
--------------------------------------------------------------------------------
 1 | # This file is adapted from:
 2 | # https://github.com/pytorch-labs/tritonbench/blob/main/tritonbench/utils/triton_op.py
 3 | #
 4 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 5 | #
 6 | # This source code is licensed under the BSD-style license found in the
 7 | # LICENSE file in the root directory of this source tree.
 8 | 
 9 | import torch
10 | import time
11 | import os
12 | from triton_runner.testing import do_bench
13 | 
14 | 
class TimerContext:
    """Context manager that measures the wall-clock time of its body.

    After the `with` block exits, `elapsed_ms` holds the duration in
    milliseconds. When constructed with `enabled=False` nothing is measured
    and `elapsed_ms` stays None.
    """

    def __init__(self, enabled=True):
        self.elapsed_ms = None
        self.enabled = enabled

    def __enter__(self):
        if not self.enabled:
            return self
        self._start_time = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        if not self.enabled:
            return
        stop_time = time.perf_counter()
        self.elapsed_ms = (stop_time - self._start_time) * 1e3
30 | 
31 | 
def benchmark(name, unit_name="ms"):
    """Decorator factory that turns a method into a benchmark entry point.

    The decorated method must return a callable. When benchmarking is enabled
    (the default), the wrapper calls the method once per argument tuple
    produced by `self.get_input_iter()`, times each returned callable with
    `do_bench`, and prints the timing of the last input only. The environment
    variable `RUNNER_PROD` is set to "1" for the duration of the run and is
    removed afterwards, even if an error is raised.

    When the wrapper is called with `enable_benchmark=False`, it instead
    returns `method(self, *args[0])`, i.e. the first positional argument is
    unpacked as the method's arguments.

    Args:
        name: label printed next to the measured time.
        unit_name: "us" prints the time multiplied by 1e3 with a "us" suffix;
            any other value prints it unchanged with an "ms" suffix.
    """

    def decorator(method):

        def wrapper(self, *args, **kwargs):
            if kwargs.pop("enable_benchmark", True) is False:
                return method(self, *args[0])

            os.environ["RUNNER_PROD"] = "1"
            try:
                input_iter = list(self.get_input_iter())
                last_idx = len(input_iter) - 1
                for idx, input_args in enumerate(input_iter):
                    fn = method(self, *input_args)
                    elapsed_time = do_bench(fn)
                    # Only the final input's timing is reported.
                    if idx == last_idx:
                        if unit_name == "us":
                            elapsed_time_str = f"{elapsed_time * 1e3:8.3f} us"
                        else:
                            elapsed_time_str = f"{elapsed_time:8.3f} ms"
                        print(f"[{name:<50}|] time: {elapsed_time_str}")
            finally:
                # Always clear the flag so a failing benchmark cannot leak it
                # into whatever runs next in this process.
                os.environ.pop("RUNNER_PROD", None)

        return wrapper

    return decorator
59 | 


--------------------------------------------------------------------------------
/examples/dump/python/02-matrix_transpose/dump_2d_load.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | import triton_runner.language as dl
 6 | 
 7 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
# The loaded (untransposed) tile is handed to `dl.dump` for debugging.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)

    # ===== DEBUG START =====
    dl.dump(block)
    # ===== DEBUG END =====

    transposed_block = tl.trans(block)
    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
25 | 
26 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel with a dump buffer and report the dump.

    A BLOCK_SIZE x BLOCK_SIZE tensor is passed as `dump_tensor`; after the
    launch it is printed together with its maximum absolute difference from
    the top-left BLOCK_SIZE x BLOCK_SIZE corner of `input`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        dump_tensor=dump_tensor,
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = input[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
43 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
54 | 


--------------------------------------------------------------------------------
/examples/dump/python/02-matrix_transpose/dump_2d_trans.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | import triton_runner.language as dl
 6 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
# The transposed tile is handed to `dl.dump` for debugging.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)

    # ===== DEBUG START =====
    dl.dump(transposed_block)
    # ===== DEBUG END =====

    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
24 | 
25 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel with a dump buffer and report the dump.

    A BLOCK_SIZE x BLOCK_SIZE tensor is passed as `dump_tensor`; after the
    launch it is printed together with its maximum absolute difference from
    the top-left BLOCK_SIZE x BLOCK_SIZE corner of `output`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        dump_tensor=dump_tensor,
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = output[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
42 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
53 | 


--------------------------------------------------------------------------------
/examples/dump/ttir/02-matrix_transpose/dump_2d_load.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)
    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
18 | 
19 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from this file's TTIR directory and report the dump.

    The value named "%block_18" (block = tl.load) is requested via
    `dump_value`, with a BLOCK_SIZE x BLOCK_SIZE tensor passed as
    `dump_tensor`. After the launch that tensor is printed together with its
    maximum absolute difference from the top-left corner of `input`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        ttir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_tensor,
        dump_value="%block_18",
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = input[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
40 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
51 | 


--------------------------------------------------------------------------------
/examples/dump/ttgir/02-matrix_transpose/dump_2d_load.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)
    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
18 | 
19 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from this file's TTGIR directory and report the dump.

    The value named "%block_38" (block = tl.load) is requested via
    `dump_value`, with a BLOCK_SIZE x BLOCK_SIZE tensor passed as
    `dump_tensor`. After the launch that tensor is printed together with its
    maximum absolute difference from the top-left corner of `input`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        ttgir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_tensor,
        dump_value="%block_38",
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = input[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
40 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
51 | 


--------------------------------------------------------------------------------
/examples/dump/ttgir/02-matrix_transpose/dump_2d_trans.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
 6 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)
    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
19 | 
20 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from this file's TTGIR directory and report the dump.

    The value named "%transposed_block" (transposed_block = tl.trans(block))
    is requested via `dump_value`, with a BLOCK_SIZE x BLOCK_SIZE tensor
    passed as `dump_tensor`. After the launch that tensor is printed together
    with its maximum absolute difference from the top-left corner of `output`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        ttgir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_tensor,
        dump_value="%transposed_block",
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = output[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
41 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
52 | 


--------------------------------------------------------------------------------
/examples/dump/ttir/02-matrix_transpose/dump_2d_trans.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | 
 6 | 
# Transposes a rows x cols row-major matrix one BLOCK_SIZE x BLOCK_SIZE tile
# per program; grid axis 0 walks the row tiles and axis 1 the column tiles.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Element (r, c) of the input sits at offset r * cols + c.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard the partial tiles at the bottom and right edges.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)
    # Element (c, r) of the output sits at offset c * rows + r.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed as well so it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
19 | 
20 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from this file's TTIR directory and report the dump.

    The value named "%transposed_block" (transposed_block = tl.trans(block))
    is requested via `dump_value`, with a BLOCK_SIZE x BLOCK_SIZE tensor
    passed as `dump_tensor`. After the launch that tensor is printed together
    with its maximum absolute difference from the top-left corner of `output`.
    """
    BLOCK_SIZE = 64

    def grid(meta):
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)
    matrix_transpose_kernel[grid](
        input, output, rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        ttir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_tensor,
        dump_value="%transposed_block",
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    reference = output[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.max(torch.abs(reference - dump_tensor))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
41 | 
if __name__ == "__main__":
    # Compare the Triton transpose against torch's `.T` on random CUDA data.
    rows, cols = 104, 78
    source = torch.randn((rows, cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, rows, cols)
    matches = torch.allclose(actual, expected)
    print("✅ Triton and Torch match" if matches else "❌ Triton and Torch differ")
52 | 


--------------------------------------------------------------------------------
/examples/runner/v3.3.x/README.md:
--------------------------------------------------------------------------------
 1 | ### sm90 (H100, H200, H20, etc.)
 2 | ```shell
 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
 4 | 
 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
 6 | 
 7 | python examples/runner/v3.3.x/ttgir/sm90/matmul-with-tma-v3.py
 8 | 
 9 | python examples/runner/v3.3.x/llir/sm90/matmul-with-tma-v3.py
10 | 
11 | python examples/runner/v3.3.x/ptx/sm90/matmul-with-tma-v3.py
12 | 
13 | python examples/runner/v3.4.0/cubin/sm90/matmul-with-tma-v4.py
14 | ```
15 | 
16 | ### sm80 (A100, A30)
17 | ```shell
18 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
19 | 
20 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
21 | 
22 | python examples/runner/v3.4.0/ttgir/sm80/matmul-with-dot-v2.py
23 | 
24 | python examples/runner/v3.3.x/llir/sm80/matmul-with-dot-v2.py
25 | 
26 | python examples/runner/v3.3.x/ptx/sm80/matmul-with-dot-v2.py
27 | 
28 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py
29 | ```
30 | 
31 | ### sm120 (RTX PRO 6000, RTX 5090, etc.)
32 | ```shell
33 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
34 | 
35 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
36 | 
37 | python examples/runner/v3.3.x/ttgir/sm120/matmul-with-tma-v3.py
38 | 
39 | python examples/runner/v3.3.x/llir/sm120/matmul-with-tma-v3.py
40 | 
41 | python examples/runner/v3.4.0/ptx/sm120/matmul-with-tma-v4.py
42 | 
43 | python examples/runner/v3.4.0/cubin/sm120/matmul-with-tma-v4.py
44 | ```
45 | 
46 | ### sm86 (A10, RTX 3090, etc.)
47 | ```shell
48 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py
49 | 
50 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py
51 | 
52 | python examples/runner/v3.4.0/ttgir/sm86/matmul-with-dot-v2.py
53 | 
54 | python examples/runner/v3.3.x/llir/sm86/matmul-with-dot-v2.py
55 | 
56 | python examples/runner/v3.3.x/ptx/sm86/matmul-with-dot-v2.py
57 | 
58 | python examples/runner/v3.4.0/cubin/sm86/matmul-with-dot-v2.py
59 | ```
60 | 
61 | ### sm75 (T4, RTX 2080, etc.)
62 | ```shell
63 | python examples/runner/v3.5.x/python/matmul.py
64 | 
65 | python examples/runner/v3.5.x/ttir/matmul/matmul.py
66 | 
67 | python examples/runner/v3.5.x/ttgir/sm75/matmul.py
68 | 
69 | python examples/runner/v3.3.x/llir/sm75/matmul.py
70 | 
71 | python examples/runner/v3.3.x/ptx/sm75/matmul.py
72 | 
73 | python examples/runner/v3.4.0/cubin/sm75/matmul.py
74 | ```


--------------------------------------------------------------------------------
/examples/dump/python/02-matrix_transpose/dump_boundary/dump_boundary_trans.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import triton
 3 | import triton.language as tl
 4 | import triton_runner
 5 | import triton_runner.language as dl
 6 | 
 7 | @triton_runner.jit
 8 | def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
 9 |     row_index = tl.program_id(axis=0)
10 |     col_index = tl.program_id(axis=1)
11 |     offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
12 |     offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
13 |     old_offs = offs_row[:, None] * cols + offs_col[None, :]
14 |     mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
15 |     block = tl.load(input_ptr + old_offs, mask=mask)
16 |     transposed_block = tl.trans(block)
17 | 
18 |     # ===== DEBUG START =====
19 |     dl.dump_boundary(transposed_block)
20 |     # ===== DEBUG END =====
21 | 
22 |     new_block = offs_col[:, None] * rows + offs_row[None, :]
23 |     tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
24 | 
25 | 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int) -> None:
    """Transpose ``input`` into ``output`` and check the dumped boundary tile.

    Launches ``matrix_transpose_kernel`` over a 2-D grid of 64x64 tiles,
    prints the tensor filled through the ``dump_tensor`` launch argument,
    and reports the largest absolute difference between that dump and the
    matching corner of ``output``.

    Args:
        input: Source matrix, addressed by the kernel as ``rows x cols``.
        output: Destination matrix, addressed as ``cols x rows``; written
            in place.
        rows: Number of rows of ``input``.
        cols: Number of columns of ``input``.
    """
    grid = lambda meta: (triton.cdiv(rows, meta['BLOCK_SIZE']), triton.cdiv(cols, meta['BLOCK_SIZE']))

    BLOCK_SIZE = 64
    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)

    matrix_transpose_kernel[grid](
        input, output,
        rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        dump_tensor=dump_tensor,
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")

    # Round each extent down to a multiple of BLOCK_SIZE (a power of two) to
    # get the origin of the partially filled tile on that axis. The column
    # origin must come from `cols`; deriving it from `rows` only happens to
    # work when both extents fall in the same BLOCK_SIZE bucket.
    boundary_start_rows = rows & ~(BLOCK_SIZE - 1)
    boundary_start_cols = cols & ~(BLOCK_SIZE - 1)

    # `output` is cols x rows, so columns index axis 0 and rows index axis 1.
    # NOTE(review): if an extent is an exact multiple of BLOCK_SIZE the slice
    # below is empty and torch.max raises - confirm what dump_boundary emits
    # in that case before extending this check.
    expected_tile = output[boundary_start_cols:, boundary_start_rows:]
    dumped_tile = dump_tensor[:cols - boundary_start_cols, :rows - boundary_start_rows]
    max_diff = torch.max(torch.abs(expected_tile - dumped_tile))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
44 | 
# Script entry point: transpose a random 104 x 78 CUDA matrix with the Triton
# kernel and compare the result against torch's own transpose.
if __name__ == "__main__":
    n_rows, n_cols = 104, 78
    source = torch.randn((n_rows, n_cols), device='cuda')
    expected = source.T
    actual = torch.empty(expected.shape, device='cuda')
    solve(source, actual, n_rows, n_cols)
    verdict = (
        "✅ Triton and Torch match"
        if torch.allclose(actual, expected)
        else "❌ Triton and Torch differ"
    )
    print(verdict)
55 | 


--------------------------------------------------------------------------------