├── triton_runner ├── bench │ ├── __init__.py │ ├── matmul │ │ └── __init__.py │ ├── launch_latency │ │ ├── __init__.py │ │ └── kernels.py │ └── utils.py ├── compiler │ └── __init__.py ├── driver │ ├── v3_5_0 │ │ └── __init__.py │ └── __init__.py ├── gluon_runner │ └── __init__.py ├── language │ └── __init__.py ├── __init__.py ├── torch_utils.py ├── version_utils.py ├── color_print.py └── check_utils.py ├── doc ├── pdf │ └── Triton Runner-1029.pdf ├── benchmark.md └── solving_triton_issues │ ├── high_usage-7268 │ ├── v3.2.0_cache │ │ ├── _bwd_kernel.cubin │ │ ├── __grp___bwd_kernel.json │ │ └── _bwd_kernel.json │ ├── v3.3.0_cache │ │ ├── _bwd_kernel.cubin │ │ ├── __grp___bwd_kernel.json │ │ └── _bwd_kernel.json │ ├── attn.py │ ├── test │ │ └── test.py │ └── fix │ │ └── fix.py │ ├── performance-7096 │ ├── v3.1.0_cache │ │ ├── matmul_kernel.cubin │ │ ├── matmul_kernel.json │ │ └── __grp__matmul_kernel.json │ └── v3.4.0_cache │ │ ├── matmul_kernel.cubin │ │ ├── __grp__matmul_kernel.json │ │ └── matmul_kernel.json │ └── README.md ├── benchmark └── README.md ├── examples ├── runner │ ├── v3.2.0 │ │ ├── cubin │ │ │ └── sm90 │ │ │ │ ├── matmul_kernel.cubin │ │ │ │ └── matmul_kernel.json │ │ ├── llir │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm80 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm86 │ │ │ │ └── matmul_kernel.json │ │ │ └── sm90 │ │ │ │ └── matmul_kernel.json │ │ ├── ptx │ │ │ ├── sm75 │ │ │ │ ├── matmul_kernel.json │ │ │ │ └── old_ptx │ │ │ │ │ └── matmul_kernel.json │ │ │ └── sm90 │ │ │ │ └── matmul_kernel.json │ │ ├── ttgir │ │ │ └── sm75 │ │ │ │ └── matmul_kernel.json │ │ └── README.md │ ├── v3.4.0 │ │ ├── cubin │ │ │ ├── sm75 │ │ │ │ ├── matmul_kernel.cubin │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm80 │ │ │ │ ├── matmul_kernel.cubin │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm86 │ │ │ │ ├── matmul_kernel.cubin │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm120 │ │ │ │ ├── matmul_kernel_make_tensor_desciptor.cubin │ │ │ │ └── 
matmul_kernel_make_tensor_desciptor.json │ │ │ └── sm90 │ │ │ │ ├── matmul_kernel_make_tensor_desciptor.cubin │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ ├── llir │ │ │ ├── sm120 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ │ ├── sm90 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm80 │ │ │ │ └── matmul_kernel.json │ │ │ └── sm86 │ │ │ │ └── matmul_kernel.json │ │ └── ptx │ │ │ ├── sm90 │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ │ ├── sm120 │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ │ ├── sm75 │ │ │ └── matmul_kernel.json │ │ │ ├── sm80 │ │ │ └── matmul_kernel.json │ │ │ └── sm86 │ │ │ └── matmul_kernel.json │ ├── tlx │ │ ├── v3.4.0 │ │ │ ├── cubin │ │ │ │ └── sm90 │ │ │ │ │ ├── _attn_fwd_ws_pipelined_pingpong.cubin │ │ │ │ │ └── _attn_fwd_ws_pipelined_pingpong.json │ │ │ ├── ptx │ │ │ │ └── sm90 │ │ │ │ │ └── _attn_fwd_ws_pipelined_pingpong.json │ │ │ ├── llir │ │ │ │ └── sm90 │ │ │ │ │ └── _attn_fwd_ws_pipelined_pingpong.json │ │ │ └── ttgir │ │ │ │ └── sm90 │ │ │ │ └── _attn_fwd_ws_pipelined_pingpong.json │ │ └── README.md │ ├── v3.5.x │ │ ├── cubin │ │ │ └── sm90 │ │ │ │ ├── matmul_kernel_make_tensor_desciptor.cubin │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ ├── llir │ │ │ └── sm90 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ └── ptx │ │ │ └── sm90 │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ ├── v3.1.0 │ │ ├── llir │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ └── sm86 │ │ │ │ └── matmul_kernel.json │ │ ├── ptx │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ └── sm86 │ │ │ │ └── matmul_kernel.json │ │ └── README.md │ ├── v3.3.x │ │ ├── ptx │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm80 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm86 │ │ │ │ └── matmul_kernel.json │ │ │ └── sm90 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ ├── llir │ │ │ ├── sm75 │ │ │ │ └── matmul_kernel.json │ │ │ 
├── sm80 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm86 │ │ │ │ └── matmul_kernel.json │ │ │ ├── sm120 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ │ └── sm90 │ │ │ │ └── matmul_kernel_make_tensor_desciptor.json │ │ └── README.md │ └── v3.0.0 │ │ └── README.md ├── autotune │ ├── kda_gate_fwd_kernel_cache_sm75 │ │ ├── 3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ │ │ │ ├── 
kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ └── W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ ├── kda_gate_fwd_kernel_cache_sm90 │ │ ├── 4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 
AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── 
ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA │ │ │ ├── kda_gate_fwd_kernel.cubin │ │ │ └── kda_gate_fwd_kernel.json │ │ ├── KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q │ │ │ └── cuda_utils.cpython-312-x86_64-linux-gnu.so │ │ └── VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q │ │ │ └── __triton_launcher.cpython-312-x86_64-linux-gnu.so │ ├── cubin │ │ └── test_kda_gate_single.py │ └── python │ │ └── test_kda_gate_single.py ├── utils │ └── generate_cmd.py ├── native_kernel │ └── dump │ │ ├── 02-matrix_transpose.py │ │ ├── 04-softmax.py │ │ └── 05-softmax_lse.py ├── test.py └── dump │ ├── python │ └── 02-matrix_transpose │ │ ├── dump_2d_load.py │ │ ├── dump_2d_trans.py │ │ └── dump_boundary │ │ └── dump_boundary_trans.py │ ├── ttir │ └── 02-matrix_transpose │ │ ├── dump_2d_load.py │ │ └── dump_2d_trans.py │ └── ttgir │ └── 02-matrix_transpose │ ├── dump_2d_load.py │ └── dump_2d_trans.py ├── .vscode └── settings.json ├── .gitignore ├── pyproject.toml └── LICENSE /triton_runner/bench/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /triton_runner/bench/matmul/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /triton_runner/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /triton_runner/driver/v3_5_0/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /triton_runner/bench/launch_latency/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /triton_runner/gluon_runner/__init__.py: -------------------------------------------------------------------------------- 1 | from ._runtime import jit 2 | -------------------------------------------------------------------------------- /doc/pdf/Triton Runner-1029.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/pdf/Triton Runner-1029.pdf -------------------------------------------------------------------------------- /doc/benchmark.md: -------------------------------------------------------------------------------- 1 | # benchmark command 2 | 3 | ```shell 4 | python benchmark/launch_latency/bench.py 5 | 6 | python benchmark/matmul/mma/bench.py 7 | ``` 8 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # benchmark command 2 | 3 | ```shell 4 | python benchmark/launch_latency/bench.py 5 | 6 | python benchmark/matmul/mma/bench.py 7 | ``` 8 | -------------------------------------------------------------------------------- /examples/runner/v3.2.0/cubin/sm90/matmul_kernel.cubin: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.2.0/cubin/sm90/matmul_kernel.cubin -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm75/matmul_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm75/matmul_kernel.cubin -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm80/matmul_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm80/matmul_kernel.cubin -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm86/matmul_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm86/matmul_kernel.cubin -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.cubin -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.cubin 
-------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.cubin -------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.cubin -------------------------------------------------------------------------------- /examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.cubin -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.cubin -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin -------------------------------------------------------------------------------- 
/examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.cubin -------------------------------------------------------------------------------- /triton_runner/language/__init__.py: -------------------------------------------------------------------------------- 1 | import triton 2 | from ..version_utils import is_triton_geq_v3_4 3 | 4 | if is_triton_geq_v3_4: 5 | from .dump import dump, dump_boundary, dump_grids 6 | else: 7 | from .dump_before_3_4_0 import dump, dump_boundary, dump_grids 8 | -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.cubin -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q/__triton_launcher.cpython-312-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toyaix/triton-runner/HEAD/examples/autotune/kda_gate_fwd_kernel_cache_sm90/VDPTDIXGIWY3WVB7NQTUNNJBCQG66QRFH5LX5HXNGOE4GJWHBV6Q/__triton_launcher.cpython-312-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "*.ttir": "mlir", 4 | "*.ttgir": "mlir", 5 | "*.llir": "mlir", 6 
| "*.ptx": "mlir", 7 | "*.source": "mlir", 8 | }, 9 | "files.trimTrailingWhitespace": true, 10 | "[mlir]": { 11 | "files.trimTrailingWhitespace": false 12 | }, 13 | } 14 | -------------------------------------------------------------------------------- /triton_runner/driver/__init__.py: -------------------------------------------------------------------------------- 1 | def get_device_interface(): 2 | import torch 3 | return torch.cuda 4 | 5 | 6 | def get_empty_cache_for_benchmark(): 7 | import torch 8 | 9 | # We maintain a buffer of 256 MB that we clear 10 | # before each kernel call to make sure that the L2 cache 11 | # doesn't contain any input data before the run 12 | cache_size = 256 * 1024 * 1024 13 | return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda') 14 | 15 | 16 | def clear_cache(cache): 17 | cache.zero_() 18 | -------------------------------------------------------------------------------- /triton_runner/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.3.2' 2 | 3 | from .version_utils import is_support_version, triton_version 4 | if not is_support_version: 5 | raise RuntimeError(f"Triton Runner doesn't support Triton v{triton_version}") 6 | 7 | 8 | from .jit import jit 9 | from .version_utils import is_triton_geq_v3_4 10 | if is_triton_geq_v3_4: 11 | from .autotune import autotune 12 | from . import color_print 13 | from . 
import torch_utils 14 | import os 15 | 16 | 17 | def get_file_dir(file): 18 | return os.path.dirname(os.path.abspath(file)) 19 | -------------------------------------------------------------------------------- /examples/utils/generate_cmd.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def list_files_rel(path): 4 | files = [] 5 | for root, _, filenames in os.walk(path): 6 | for f in filenames: 7 | if f.endswith('.py'): 8 | rel_path = os.path.relpath(os.path.join(root, f), path) 9 | files.append(rel_path) 10 | return sorted(files) 11 | 12 | old_dirname = "" 13 | for f in list_files_rel("examples/dump/python"): 14 | dirname, filename = os.path.split(f) 15 | if dirname != old_dirname: 16 | print() 17 | old_dirname = dirname 18 | print("python", f"examples/dump/python/{f}") 19 | -------------------------------------------------------------------------------- /examples/runner/v3.1.0/llir/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "1597f8ffb198730637412019688ada11f0c656203824bbd6ca759d828658dc3c", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 1152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.1.0/ptx/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": 
"1597f8ffb198730637412019688ada11f0c656203824bbd6ca759d828658dc3c", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 1152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # triton_runner builds 2 | build/ 3 | build-*/ 4 | 5 | triton_runner*.egg-info/ 6 | 7 | # Pytest 8 | pytest.ini 9 | 10 | # Python caches 11 | __pycache__/ 12 | *.py[cod] 13 | .pytest_cache 14 | 15 | # Environments 16 | .venv 17 | venv/ 18 | venv.bak/ 19 | 20 | # VS Code project files 21 | .vscode 22 | .vs 23 | 24 | # JetBrains project files 25 | .idea 26 | cmake-build-* 27 | 28 | # clangd index. 
(".clangd" is a config file now, thus trailing slash) 29 | .clangd/ 30 | .cache 31 | /compile_commands.json 32 | .vscode 33 | .vs 34 | 35 | # Vim 36 | *.swp 37 | 38 | # macOS 39 | .DS_Store 40 | 41 | **/dump.ttir 42 | **/dump.ttgir 43 | triton_runner/README.md 44 | fused-attention-* 45 | results.html 46 | -------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.1.0_cache/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": true, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "shared": 73728, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/__grp___bwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"child_paths": {"_bwd_kernel.ttir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ttir", "_bwd_kernel.ttgir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ttgir", "_bwd_kernel.llir": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.llir", "_bwd_kernel.ptx": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.ptx", "_bwd_kernel.cubin": 
"/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.cubin", "_bwd_kernel.json": "/root/triton-runner/.cache/M9-J3iC1ikNpFW4eibApB6D9a7zHjWCmZpSa2PFKc5U/_bwd_kernel.json"}} -------------------------------------------------------------------------------- /examples/runner/v3.1.0/ptx/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "64fbcec4e63cf8ac427398e09df770b785f88527f5ca375bac7547e503bd5dbf", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "triton_version": "3.1.0", "shared": 49152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/attn.py: -------------------------------------------------------------------------------- 1 | # Higher shared_memory usage in Triton 3.3 2 | # https://github.com/triton-lang/triton/issues/7268 3 | 4 | from flash_attn_triton import flash_attn_func 5 | import torch 6 | 7 | # set seed 8 | torch.random.manual_seed(0) 9 | batch_size = 1 10 | nheads = 4 11 | d = 64 12 | seqlen = 16 13 | dtype = torch.bfloat16 14 | q = torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 5 15 | k, v = [ 16 | torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 3 17 | for _ in range(2) 18 | ] 19 | q.requires_grad_(True) 20 | k.requires_grad_(True) 21 | v.requires_grad_(True) 22 | out = flash_attn_func(q, 
k, v) 23 | g = torch.randn_like(out) 24 | out.backward(g) 25 | -------------------------------------------------------------------------------- /examples/runner/tlx/README.md: -------------------------------------------------------------------------------- 1 | Triton v3.4.0 in TLX with commit [9a7a23d](https://github.com/facebookexperimental/triton/commit/9a7a23d0cfa4ed4b37eb9b177b0e36beb254f9e6) 2 | 3 | ### sm90 (H100, H200, H20, etc.) 4 | ```shell 5 | python examples/runner/tlx/v3.4.0/python/hopper-fa-ws-pipelined-pingpong.py 6 | 7 | python examples/runner/tlx/v3.4.0/ttir/sm90/hopper-fa-ws-pipelined-pingpong.py 8 | 9 | python examples/runner/tlx/v3.4.0/ttgir/sm90/hopper-fa-ws-pipelined-pingpong.py 10 | 11 | python examples/runner/tlx/v3.4.0/llir/sm90/hopper-fa-ws-pipelined-pingpong.py 12 | 13 | python examples/runner/tlx/v3.4.0/ptx/sm90/hopper-fa-ws-pipelined-pingpong.py 14 | 15 | python examples/runner/tlx/v3.4.0/cubin/sm90/hopper-fa-ws-pipelined-pingpong.py 16 | ``` 17 | -------------------------------------------------------------------------------- /examples/runner/v3.1.0/llir/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "64fbcec4e63cf8ac427398e09df770b785f88527f5ca375bac7547e503bd5dbf", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "allow_fp8e4nv": false, "allow_fp8e4b15": true, "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": null, "backend_name": "cuda", "triton_version": "3.1.0", "shared": 49152, "name": "matmul_kernel"} 
-------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/__grp___bwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"child_paths": {"_bwd_kernel.ttir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ttir", "_bwd_kernel.ttgir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ttgir", "_bwd_kernel.llir": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.llir", "_bwd_kernel.ptx": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.ptx", "_bwd_kernel.cubin": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.cubin", "_bwd_kernel.json": "/root/triton-runner/.cache/O3D4D7BZYAU3U2OB3UBCC4DLHUL3WHL6CKRSTOUNP2R55VY45WMQ/_bwd_kernel.json"}} -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "triton-runner" 3 | description = "Multi-Level Triton Runner supporting Python, IR, PTX, and cubin." 
4 | 5 | dependencies = [ 6 | "triton>=3.0.0", 7 | "termcolor", 8 | ] 9 | 10 | readme = "README.md" 11 | version = "0.3.2" 12 | authors = [ 13 | { name = "Bob Huang", email = "x@bobhuang.xyz" }, 14 | ] 15 | license = "MIT" 16 | 17 | [project.urls] 18 | repository = "https://github.com/toyaix/triton-runner" 19 | homepage = "https://triton-runner.org" 20 | 21 | [build-system] 22 | requires = ["setuptools>=61.0"] 23 | build-backend = "setuptools.build_meta" 24 | 25 | [tool.setuptools] 26 | include-package-data = true 27 | 28 | [tool.setuptools.package-data] 29 | "triton_runner" = ["README.md"] 30 | 31 | [tool.setuptools.packages.find] 32 | include = ["triton_runner*"] 33 | -------------------------------------------------------------------------------- /examples/runner/v3.2.0/llir/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "b64e769422e5fa5399816ba4a980010ac79cb8b001625b23012ec3b0e699d40a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 1152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/llir/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": 
"56eea00f4a1bf012c5cb9a8b7b95772212bed2037dea35c421ffd3be025a57f5", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 49152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/llir/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "a033e32e3e90ad81ddbdf656d192ce47a1b4f8153257ead819af5222682f522d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 49152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- 
/examples/runner/v3.2.0/ptx/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "b64e769422e5fa5399816ba4a980010ac79cb8b001625b23012ec3b0e699d40a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 1152, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.2.0_cache/_bwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "33df89de20b58a4369156e1e89b02907a0fd6bbcc78d60a666949ad8f14a7395", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": 
true, "shared": 98304, "name": "_bwd_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.1.0_cache/__grp__matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"child_paths": {"matmul_kernel.ttir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ttir", "matmul_kernel.ttgir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ttgir", "matmul_kernel.llir": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.llir", "matmul_kernel.ptx": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.ptx", "matmul_kernel.cubin": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.cubin", "matmul_kernel.json": "/root/triton-runner/.cache/6138cf007d17ea7c57cb945486d879f19ea5e713a3286d3e0c332ad2ab73472f/matmul_kernel.json"}} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/llir/sm90/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "1d3a3576962135e581ebf26145d4beba58e9ee12ff11c7934b9b69a9059ea670", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", 
"/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/ptx/sm90/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "1d3a3576962135e581ebf26145d4beba58e9ee12ff11c7934b9b69a9059ea670", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /triton_runner/torch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import triton 4 | 5 | 6 | def pad_2d_to_block_shape(tensor, block_shape): 7 | M, K = tensor.shape 8 | BLOCK_M, BLOCK_K = block_shape 9 | 10 | pad_M = (BLOCK_M - M % BLOCK_M) % BLOCK_M 11 | pad_K = (BLOCK_K - K % BLOCK_K) % BLOCK_K 12 | 13 | padded = torch.nn.functional.pad(tensor, (0, pad_K, 0, pad_M), value=0) 14 | 15 | return padded.to(torch.float32) 16 | 17 | 18 | def get_pad_n_elements(tensor, block_shape): 19 | return math.prod(tuple(triton.cdiv(dim, 
block) * block for dim, block in zip(tensor.shape, block_shape))) 20 | 21 | 22 | def get_grid_dim(tensor_shape, block_shape): 23 | return tuple(triton.cdiv(dim, block) for dim, block in zip(tensor_shape, block_shape)) 24 | 25 | 26 | def get_n_elements_with_grid(block_shape, grid): 27 | return math.prod(block_shape) * math.prod(grid) 28 | -------------------------------------------------------------------------------- /examples/runner/v3.2.0/cubin/sm90/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "e36faa33ba3c2c3f39005afa18658f6cd91bf57c877172c9167ad11d19490d2e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 73728, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.4.0_cache/__grp__matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"child_paths": {"matmul_kernel.source": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.source", "matmul_kernel.ttir": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ttir", "matmul_kernel.ttgir": 
"/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ttgir", "matmul_kernel.llir": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.llir", "matmul_kernel.ptx": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.ptx", "matmul_kernel.cubin": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.cubin", "matmul_kernel.json": "/root/triton-runner/.cache/ZVGXYCVOXOBGHQIZRLVBAX53P6YSI2WA55RCCBI5EP243TNSYSXA/matmul_kernel.json"}} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/test/test.py: -------------------------------------------------------------------------------- 1 | from flash_attn_triton_test import _flash_attn_backward 2 | import torch 3 | import math 4 | import triton 5 | 6 | 7 | batch_size, nheads, d, seqlen = 1, 4, 64, 16 8 | torch.random.manual_seed(0) 9 | 10 | dtype = torch.bfloat16 11 | q, k, v, o, do = [ 12 | torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") 13 | for _ in range(5) 14 | ] 15 | seqlen_q_rounded = math.ceil(seqlen / 128) * 128 16 | lse = torch.empty((batch_size, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) 17 | with torch.inference_mode(): 18 | dq = torch.empty_like(q) 19 | dk = torch.empty_like(k) 20 | dv = torch.empty_like(v) 21 | _flash_attn_backward( 22 | do, 23 | q, 24 | k, 25 | v, 26 | o, 27 | lse, 28 | dq, 29 | dk, 30 | dv 31 | ) 32 | print(do.min().item(), do.max().item(), do.mean().item(), do.std().item()) 33 | -------------------------------------------------------------------------------- /examples/runner/v3.3.x/ptx/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, 
"warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/ptx/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "d59a4408912cd758ab87d37f0ec619ab5015f47747ef3de78443c10d79d085bc", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, 
"global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/ptx/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "81ea21ee3c20a473bf522f802389aeffc623527513e25c396b8f3b42ed00a367", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/llir/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], 
"default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/llir/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "d59a4408912cd758ab87d37f0ec619ab5015f47747ef3de78443c10d79d085bc", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/llir/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "81ea21ee3c20a473bf522f802389aeffc623527513e25c396b8f3b42ed00a367", 
"target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.3.1", "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/ptx/sm75/old_ptx/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "5b38a8c25a5e0d23d4831fc9a66f1ed798e5ec3a0ccb48010e00cbb30f412e3d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/leetgpu/lib/python3.10/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": 
"3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.2.0/ttgir/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "runner-67362256efcd8d0f4d35108b946500eb0390f82c28fdcdb0ff16e44b441546fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.3.1", "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/v3.3.0_cache/_bwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "76c7c1fc39c029ba69c1dd0221706b3d17bb1d7e12a329ba8d7ea3ded71ced99", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, 
"supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm89", "triton_version": "3.3.0", "shared": 114688, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "_bwd_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/performance-7096/v3.4.0_cache/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"hash": "cd4d7c0aaebb8263c1198aea105fbb7fb1246ac0ef6221051d23f5cdcdb2c4ae", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm89", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 73728, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/high_usage-7268/fix/fix.py: 
# Counterpart of the reproducer script that imports `_flash_attn_backward`
# from flash_attn_triton_runner instead. Requires a CUDA device.
from flash_attn_triton_runner import _flash_attn_backward
import torch
import math

# Seed the RNG so the random inputs are the same on every run.
torch.random.manual_seed(0)

# Sizes used to build the input shape [batch_size, seqlen, nheads, d] below.
batch_size, nheads, d, seqlen = 1, 4, 64, 16

dtype = torch.bfloat16
# Five independent random bf16 CUDA tensors of identical shape.
q, k, v, o, do = [
    torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda")
    for _ in range(5)
]
# Sequence length rounded up to the next multiple of 128.
seqlen_q_rounded = math.ceil(seqlen / 128) * 128
# Allocated with torch.empty, so the contents are uninitialised here.
lse = torch.empty((batch_size, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
with torch.inference_mode():
    # Buffers shaped like q/k/v; presumably filled by the kernel with the
    # gradients -- TODO confirm against _flash_attn_backward.
    dq = torch.empty_like(q)
    dk = torch.empty_like(k)
    dv = torch.empty_like(v)
    _flash_attn_backward(
        do,
        q,
        k,
        v,
        o,
        lse,
        dq,
        dk,
        dv
    )
# NOTE(review): the statistics printed are those of `do`, which is created
# above as a random input -- confirm this is the tensor meant to be inspected.
print(do.min().item(), do.max().item(), do.mean().item(), do.std().item())

# In this script triton is used only to report the installed version.
import triton
print(f"Triton version: {triton.__version__}")
"/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.3.1", "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/llir/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"hash": "20e60420ec987a18bc4a1f739d4d837501eda754ee672163679f0b985736a80f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.3.1", "shared": 40984, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.3.x/ptx/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"hash": "20e60420ec987a18bc4a1f739d4d837501eda754ee672163679f0b985736a80f", "target": {"backend": "cuda", "arch": 
90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.3.1", "shared": 40984, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2025-Present BobHuang 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files 6 | * (the "Software"), to deal in the Software without restriction, 7 | * including without limitation the rights to use, copy, modify, merge, 8 | * publish, distribute, sublicense, and/or sell copies of the Software, 9 | * and to permit persons to whom the Software is furnished to do so, 10 | * subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 
# Version string reported by the installed Triton package.
triton_version = triton.__version__
# "major.minor" prefix of that version; every flag below is derived from it.
version_str = ".".join(triton_version.split(".")[:2])


def _minor_in(*candidates):
    """Return True when the detected "major.minor" string is one of *candidates*."""
    return version_str in candidates


# True only for the explicitly listed releases.
is_support_version = _minor_in("3.6", "3.5", "3.4", "3.3", "3.2", "3.1", "3.0")

# Exact-release flags.
is_triton_v3_6 = _minor_in("3.6")
is_triton_v3_5 = _minor_in("3.5")
is_triton_v3_4 = _minor_in("3.4")
is_triton_v3_3 = _minor_in("3.3")
is_triton_v3_2 = _minor_in("3.2")
is_triton_v3_1 = _minor_in("3.1")
is_triton_v3_0 = _minor_in("3.0")

# "At least" flags; they enumerate known releases, so a release that is not
# listed here evaluates to False.
is_triton_geq_v3_3 = _minor_in("3.3", "3.4", "3.5", "3.6")
is_triton_geq_v3_4 = _minor_in("3.4", "3.5", "3.6")
is_triton_geq_v3_5 = _minor_in("3.5", "3.6")

# "At most" flags, enumerated the same way.
is_triton_leq_v3_2 = _minor_in("3.2", "3.1", "3.0")
is_triton_leq_v3_1 = _minor_in("3.1", "3.0")
# Set for 3.2 through 3.5 only; what it switches off is decided by the consumers.
is_disable_multithreading = _minor_in("3.5", "3.4", "3.3", "3.2")

# 3.5 and 3.3 are collapsed to a ".x" label; every other release keeps its
# full version string.
uni_triton_version = {"3.5": "3.5.x", "3.3": "3.3.x"}.get(version_str, triton_version)

# is_tlx records whether the optional tlx extension module can be imported.
try:
    import triton.language.extra.tlx as tlx
except ImportError:
    is_tlx = False
else:
    is_tlx = True
def blue_print(text):
    """Print *text* in blue and flush stdout immediately."""
    print(termcolor.colored(text, "blue"), flush=True)


def yellow_print(text):
    """Print *text* in yellow and flush stdout immediately."""
    print(termcolor.colored(text, "yellow"), flush=True)


def get_project_name():
    """Return the tag that prefixes this project's console messages."""
    return "[Triton Runner]"


def warning_dump_mode_ssa_and_op(ssa, op, loc, size, encoding):
    """Print, in blue, which value is being dumped in dump mode.

    Args:
        ssa, op, loc, size: Values interpolated verbatim into the message.
        encoding: Encoding text. When non-empty, its first two characters are
            dropped before it is appended to the message (presumably a
            two-character prefix -- TODO confirm with the caller). An empty
            string or ``None`` adds nothing.
    """
    # Truthiness instead of `!= ""` so that None is treated like "no encoding"
    # rather than raising TypeError on the slice.
    encoding_suffix = f" with encoding={encoding[2:]}" if encoding else ""
    blue_print(f"{get_project_name()} In dump mode, ssa={ssa}, op={op}, loc={loc}, size={size}{encoding_suffix}")


def warning_size_not_supported(ssa, op, loc, size):
    """Print a yellow warning that *size* is not supported for the given value."""
    yellow_print(f"{get_project_name()} Warning: size={size} is not supported. And ssa={ssa}, op={op}, loc={loc}")


def print_triton_cache_dir(metadata_path, cache_hit=False):
    """Print the directory that holds the kernel's metadata file.

    Nothing is printed when the environment variable ``RUNNER_PROD`` is "1".

    Args:
        metadata_path: Path of the metadata file; only its directory is shown.
        cache_hit: When true, the message also states that the cache was hit.
    """
    if os.environ.get("RUNNER_PROD", "0") == "1":
        return
    # The message previously read "cache hint", a typo for "cache hit".
    cache_hit_text = " cache hit and" if cache_hit else ""
    blue_print(f"{get_project_name()} Triton kernel{cache_hit_text} saved at {os.path.dirname(metadata_path)}")


def check_dump_tensor_dtype(dump_tensor):
    """Print a yellow warning when *dump_tensor* is not a float32 tensor."""
    # Imported lazily so importing this module does not require torch.
    import torch
    if dump_tensor.dtype != torch.float32:
        yellow_print(f"Warning: tensor dtype is {dump_tensor.dtype}, not torch.float32!")
"target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/llir/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "81bfd88df3fd5c7021b844a5eb534121362b518dd707dd5853f58e08ee3ea8ec", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", 
"ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/ptx/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "81bfd88df3fd5c7021b844a5eb534121362b518dd707dd5853f58e08ee3ea8ec", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} 
-------------------------------------------------------------------------------- /examples/runner/v3.4.0/ptx/sm120/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "7d5c3a22e090a10bfd2a615e080f36adcdfe485fc514fc5af248f382ae86bbfa", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm120/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 
'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "7d5c3a22e090a10bfd2a615e080f36adcdfe485fc514fc5af248f382ae86bbfa", "target": {"backend": "cuda", "arch": 120, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm120", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 24592, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D'), ('b_ptr', '*fp8e5', 'D'), ('c_ptr', '*fp16', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_N', 'constexpr', 64), ('BLOCK_SIZE_K', 'constexpr', 64))", "hash": "cda82ff2530be72686a807dd5c393106273474af4950c0b73b81fc51f3ab64b8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": 
["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /doc/solving_triton_issues/README.md: -------------------------------------------------------------------------------- 1 | ## [Triton 3.3 Performance Regression on Small Gemms](https://github.com/triton-lang/triton/issues/7096) 2 | 3 | Reproducer in [performance-7096/test.py](./performance-7096/test.py) with Triton v3.4.0 [testing.do_bench](https://github.com/triton-lang/triton/blob/v3.4.0/python/triton/testing.py) 4 | 5 | ``` 6 | GPU: NVIDIA GeForce RTX 4090 7 | Triton version: 3.1.0 8 | 512x512: 0.0124ms 9 | 1024x1024: 0.0210ms 10 | 1536x1536: 0.0673ms 11 | 2048x2048: 0.1181ms 12 | 4096x4096: 0.8580ms 13 | ``` 14 | 15 | ``` 16 | GPU: NVIDIA GeForce RTX 4090 17 | Triton version: 3.4.0 18 | 512x512: 0.0137ms 19 | 1024x1024: 0.0225ms 20 | 1536x1536: 0.0711ms 21 | 2048x2048: 0.1222ms 22 | 4096x4096: 0.8852ms 23 | ``` 24 | 25 | Fix use cubin with triton_runner in [fix.py:67](./performance-7096/fix.py#L67) 26 | 27 | ## [Higher shared_memory usage in Triton 3.3](https://github.com/triton-lang/triton/issues/7268) 28 | 29 | Reproducer on NVIDIA GeForce RTX 4090 30 | 31 | [high_usage-7268/v3.2.0_cache/_bwd_kernel.json](./high_usage-7268/v3.2.0_cache/_bwd_kernel.json) has `"shared": 98304` and 
[high_usage-7268/v3.3.0_cache/_bwd_kernel.json](./high_usage-7268/v3.3.0_cache/_bwd_kernel.json) has `"shared": 114688` 32 | 33 | Fix use cubin with triton_runner in [flash_attn_triton_runner.py:152](./high_usage-7268/fix/flash_attn_triton_runner.py#L152) 34 | -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/llir/sm75/matmul_kernel.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/ptx/sm75/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp32', 'D'), ('b_ptr', '*fp32', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_ak', 'constexpr', 1), ('stride_bk', 'i32', 'D'), ('stride_bn', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_cn', 
'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 16), ('BLOCK_SIZE_N', 'constexpr', 16))", "hash": "ed24cdb025613dcb31695c3d8108650987ba83a42cd3f6befcd7af9c0c1fb3fb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 1152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.0.0/README.md: -------------------------------------------------------------------------------- 1 | ### sm90 (H100, H200, H20, etc.) 2 | ```shell 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 4 | 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 6 | ``` 7 | 8 | ### sm80 (A100, A30) 9 | ```shell 10 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 11 | 12 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 13 | 14 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py 15 | ``` 16 | 17 | ### sm120 (RTX PRO 6000, RTX 5090, etc.) 18 | 19 | **not supported** 20 | 21 | ### sm86 (A10, RTX 3090, etc.) 
22 | ```shell 23 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 24 | 25 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 26 | 27 | python examples/runner/v3.1.0/ttgir/sm86/matmul-with-dot-v2.py 28 | 29 | python examples/runner/v3.1.0/llir/sm86/matmul-with-dot-v2.py 30 | 31 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py 32 | ``` 33 | 34 | ### sm75 (T4, RTX 2080, etc.) 35 | 36 | ```shell 37 | python examples/runner/v3.5.x/python/matmul.py 38 | 39 | python examples/runner/v3.5.x/ttir/matmul/matmul.py 40 | 41 | python examples/runner/v3.1.0/ttgir/sm75/matmul.py 42 | 43 | python examples/runner/v3.1.0/llir/sm75/matmul.py 44 | 45 | python examples/runner/v3.2.0/ptx/sm75/matmul.py 46 | 47 | python examples/runner/v3.4.0/cubin/sm75/matmul.py 48 | ``` -------------------------------------------------------------------------------- /examples/runner/v3.1.0/README.md: -------------------------------------------------------------------------------- 1 | ### sm90 (H100, H200, H20, etc.) 2 | ```shell 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 4 | 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 6 | ``` 7 | 8 | ### sm80 (A100, A30) 9 | ```shell 10 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 11 | 12 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 13 | 14 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py 15 | ``` 16 | 17 | ### sm120 (RTX PRO 6000, RTX 5090, etc.) 18 | 19 | **not supported** 20 | 21 | ### sm86 (A10, RTX 3090, etc.) 
22 | ```shell 23 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 24 | 25 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 26 | 27 | python examples/runner/v3.1.0/ttgir/sm86/matmul-with-dot-v2.py 28 | 29 | python examples/runner/v3.1.0/llir/sm86/matmul-with-dot-v2.py 30 | 31 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py 32 | ``` 33 | 34 | ### sm75 (T4, RTX 2080, etc.) 35 | 36 | ```shell 37 | python examples/runner/v3.5.x/python/matmul.py 38 | 39 | python examples/runner/v3.5.x/ttir/matmul/matmul.py 40 | 41 | python examples/runner/v3.1.0/ttgir/sm75/matmul.py 42 | 43 | python examples/runner/v3.1.0/llir/sm75/matmul.py 44 | 45 | python examples/runner/v3.2.0/ptx/sm75/matmul.py 46 | 47 | python examples/runner/v3.4.0/cubin/sm75/matmul.py 48 | ``` -------------------------------------------------------------------------------- /examples/runner/v3.4.0/llir/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": 
["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/llir/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, 
"tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/ptx/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/ptx/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), 
('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/autotune/cubin/test_kda_gate_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from fla.ops.kda.gate import kda_gate_ref 7 | from gate import fused_kda_gate 8 | from fla.utils import assert_close, device 9 | 10 | 11 | device = "cuda" if torch.cuda.is_available() else "cpu" 12 | 13 | def test_kda_gate_single(): 14 | """Run single configuration of kda gate test (B=1, T=2, H=2, D=12, use_bias=False)""" 15 | B, T, H, 
D, use_bias = (1, 2, 2, 12, False) 16 | 17 | print(f"Running test for B={B}, T={T}, H={H}, D={D}, use_bias={use_bias}") 18 | 19 | # torch.manual_seed(42) 20 | 21 | g = torch.randn(B, T, H * D, dtype=torch.float32) 22 | g = g * 30 23 | A = torch.log(torch.randn(1, 1, H, 1, dtype=torch.float32).uniform_(1, 16)) 24 | g_bias = torch.randn(H * D, dtype=torch.float32) if use_bias else None 25 | 26 | g, A = map(lambda x: x.to(device).requires_grad_(True), (g, A)) 27 | if g_bias is not None: 28 | g_bias = g_bias.to(device).requires_grad_(True) 29 | 30 | ref = kda_gate_ref(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None) 31 | tri = fused_kda_gate(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None) 32 | 33 | assert_close('o', ref, tri, 1e-4) 34 | 35 | print("✅ Test passed for single configuration!") 36 | 37 | 38 | if __name__ == "__main__": 39 | test_kda_gate_single() 40 | -------------------------------------------------------------------------------- /examples/autotune/python/test_kda_gate_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from fla.ops.kda.gate import kda_gate_ref 7 | from gate import fused_kda_gate 8 | from fla.utils import assert_close, device 9 | 10 | 11 | device = "cuda" if torch.cuda.is_available() else "cpu" 12 | 13 | def test_kda_gate_single(): 14 | """Run single configuration of kda gate test (B=1, T=2, H=2, D=12, use_bias=False)""" 15 | B, T, H, D, use_bias = (1, 2, 2, 12, False) 16 | 17 | print(f"Running test for B={B}, T={T}, H={H}, D={D}, use_bias={use_bias}") 18 | 19 | # torch.manual_seed(42) 20 | 21 | g = torch.randn(B, T, H * D, dtype=torch.float32) 22 | g = g * 30 23 | A = torch.log(torch.randn(1, 1, H, 1, dtype=torch.float32).uniform_(1, 16)) 24 | g_bias = torch.randn(H * D, dtype=torch.float32) if use_bias else None 25 | 26 | g, A = map(lambda x: 
x.to(device).requires_grad_(True), (g, A)) 27 | if g_bias is not None: 28 | g_bias = g_bias.to(device).requires_grad_(True) 29 | 30 | ref = kda_gate_ref(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None) 31 | tri = fused_kda_gate(g.clone(), A.clone(), D, g_bias.clone() if g_bias is not None else None) 32 | 33 | assert_close('o', ref, tri, 1e-4) 34 | 35 | print("✅ Test passed for single configuration!") 36 | 37 | 38 | if __name__ == "__main__": 39 | test_kda_gate_single() 40 | -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm80/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38e7e0900bd668a520628198a40bb1cbce9ecf342cb5696635abd3d1c5a1a67b", "target": {"backend": "cuda", "arch": 80, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm80", "triton_version": 
"3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.4.0/cubin/sm86/matmul_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp16', 'D'), ('b_ptr', '*fp16', 'D'), ('c_ptr', '*fp32', 'D'), ('M', 'i32', 'D'), ('N', 'i32', 'D'), ('K', 'i32', 'D'), ('stride_am', 'i32', 'D'), ('stride_an', 'constexpr', 1), ('stride_bn', 'i32', 'D'), ('stride_bk', 'constexpr', 1), ('stride_cm', 'i32', 'D'), ('stride_ck', 'constexpr', 1), ('BLOCK_SIZE_M', 'constexpr', 128), ('BLOCK_SIZE_K', 'constexpr', 64), ('BLOCK_SIZE_N', 'constexpr', 64))", "hash": "38ff42038be6ab9e07fc243f6649bd37974d560308cbcf953bee3250a882217d", "target": {"backend": "cuda", "arch": 86, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm86", "triton_version": "3.4.0", "tensordesc_meta": [], "shared": 49152, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "name": "matmul_kernel"} -------------------------------------------------------------------------------- /examples/runner/v3.5.x/llir/sm90/matmul_kernel_make_tensor_desciptor.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), ('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.5.x/ptx/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), 
('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/v3.5.x/cubin/sm90/matmul_kernel_make_tensor_desciptor.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('a_ptr', '*fp8e5', 'D', False), ('b_ptr', '*fp8e5', 'D', False), ('c_ptr', '*fp16', 'D', False), ('M', 'i32', 'D', False), ('N', 'i32', 'D', False), ('K', 'i32', 'D', False), ('BLOCK_SIZE_M', 'constexpr', 128, True), ('BLOCK_SIZE_N', 'constexpr', 64, True), ('BLOCK_SIZE_K', 'constexpr', 64, True))", "hash": "e83e5687ef13dcae980bb1730ac8f10134abd8a85d02f4d676098b23c706e5ad", "target": {"backend": "cuda", 
"arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "triton_runner_version": "0.3.2", "tensordesc_meta": [], "shared": 36912, "tmem_size": 0, "global_scratch_size": 384, "global_scratch_align": 128, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "matmul_kernel_make_tensor_desciptor"} -------------------------------------------------------------------------------- /examples/runner/tlx/v3.4.0/ptx/sm90/_attn_fwd_ws_pipelined_pingpong.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 
1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"} -------------------------------------------------------------------------------- /examples/runner/tlx/v3.4.0/cubin/sm90/_attn_fwd_ws_pipelined_pingpong.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", 
"instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"} -------------------------------------------------------------------------------- /examples/runner/tlx/v3.4.0/llir/sm90/_attn_fwd_ws_pipelined_pingpong.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, 
{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"} -------------------------------------------------------------------------------- /examples/runner/tlx/v3.4.0/ttgir/sm90/_attn_fwd_ws_pipelined_pingpong.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "None", "hash": "55058f997ece48ff75c6a860c67ae6321574a5f2f6ce0705f2dddb703d6b5b5b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 12, "num_ctas": 1, "num_stages": 0, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_cluster": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.4.0", "triton_runner_version": "0.2.7", "tensordesc_meta": [{"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, "elem_size": 2, "elem_type": 6, "block_size": [128, 64], "fp4_padded": false}, {"swizzle": 3, 
"elem_size": 2, "elem_type": 6, "block_size": [64, 64], "fp4_padded": false}], "shared": 196696, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_attn_fwd_ws_pipelined_pingpong"} -------------------------------------------------------------------------------- /examples/native_kernel/dump/02-matrix_transpose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | import triton_runner 5 | 6 | 7 | @triton_runner.jit 8 | def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr): 9 | row_index = tl.program_id(axis=0) 10 | col_index = tl.program_id(axis=1) 11 | offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 12 | offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 13 | old_offs = offs_row[:, None] * cols + offs_col[None, :] 14 | mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols) 15 | block = tl.load(input_ptr + old_offs, mask=mask) 16 | transposed_block = tl.trans(block) 17 | new_block = offs_col[:, None] * rows + offs_row[None, :] 18 | tl.store(output_ptr + new_block, transposed_block, mask=mask.T) 19 | 20 | 21 | def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int): 22 | grid = lambda meta: (triton.cdiv(rows, meta['BLOCK_SIZE']), triton.cdiv(cols, meta['BLOCK_SIZE'])) 23 | 24 | matrix_transpose_kernel[grid]( 25 | input, output, 26 | rows, cols, 27 | BLOCK_SIZE=64, 28 | ) 29 | 30 | if __name__ == "__main__": 31 | rows, cols = 104, 78 32 | a = torch.randn((rows, cols), device='cuda') 33 | torch_output = a.T 34 | triton_output = torch.empty(torch_output.shape, device='cuda') 35 | solve(a, triton_output, rows, cols) 36 | if torch.allclose(triton_output, torch_output): 37 | print("✅ Triton and Torch match") 38 | else: 39 | print("❌ Triton and Torch differ") 40 | 
-------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/4QVBSIEL4EW76HXF6J42YMTVJSSBM3FZ62S63KR44UH74BOAEK6Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e42a19208be12dff1ee5f279ac32754ca4166cb9f6a5edaa3ce50ffe05c022bd", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- 
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/7ABF7WWZ6UVKXZHZAPPCTAIG5WB56VBDYUXGBUHJW5AUWAIUYLLA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f8025fdad9f52aabe4f903de298106ed83df5423c52e60d0e9b7414b0114c2d6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/7MSUOZ5SZX22HSJIWKE6UC6ZX3UKWAU7LCYVXLFYH5BR2DE66YGA/kda_gate_fwd_kernel.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "fb254767b2cdf5a3c928b289ea0bd9bee8ab029f58b15bacb83f431d0c9ef60c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/AZWK5R6GU6NKUAOLEDEDMHPVS7GSUTQ5ONAAYJD7VGRLHKYEEJKA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', 
'*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "066caec7c6a79aaa01cb20c8361df597cd2a4e1d73400c247fa9a2b3ab042254", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/BDBDUBFGEFPNA7OC7SMORXJVY3MIANC5KKDYZ5U7ORR3WMQPGL7A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 
'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "08c23a04a6215ed07dc2fc98e8dd35c6d880345d52878cf69f7463bb320f32fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/C5R3K3Q7KWRIV5SPEAF25RSD6PVH6AJBT5EJOLUJ4NBNSF3OFBAQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": 
"1763b56e1f55a28af64f200baec643f3ea7f01219f48972e89e342d9176e2841", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/DKJUN6TOWV3VPVPTQWGKXKURI4GXR6NZRMJ6WC7G6T5WLAVEXSOQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1a9346fa6eb57757d5f3858cabaa91470d78f9b98b13eb0be6f4fb6582a4bc9d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, 
"num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/EMTXD7MF7V5ON7OHWVDWE3LB3XFQV277E7YCEOM5SZGNOTXK22MQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "232771fd85fd7ae6fdc7b547626d61ddcb0aebff27f022399d964cd74eead699", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/EPDDVXU2AXFLJFUG6ENXSPRP2JFWSTRIEDYTLW7QM5AQALS3D4HQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "23c63ade9a05cab49686f11b793e2fd24b694e2820f135dbf06741002e5b1f0f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], 
"default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/FA6WAAIL7CJ6ZKWLJBH7AL7NGKGG52HJFIDPVGIIKFWG6T3D5PSA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "283d60010bf893ecaacb484ff02fed328c6ee8e92a06fa9908516c6f4f63ebe4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": 
[["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/HELKQDNRUHYKVDWUXJPTSLBVWGX7SFCAPF76UGEQP6DP4LT2OIHA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3916a80db1a1f0aa8ed4ba5f392c35b1aff91440797fea18907f86fe2e7a720e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", 
"sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/HILJQSDI7AXV7GPWY5BTHCPKLWKXIA6BEET57TQ5W7U43DBQI6FA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3a16984868f82f5f99f6c7433389ea5d957403c12127dfce1db7e9cd8c30478a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, 
"tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/I7HEAVFUSXY6PUPNXIN2JIE5WQQV2FYMBXDCSRYPCOZUCEUM3GQA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "47ce4054b495f1e7d1edba1ba4a09db4215d170c0dc629470f13b341128cd9a0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} 
-------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/IPDCXESWUO5G5ZRHA2BPP7NGMRXPS2QMWDWRQLYLADND4ADF4GDA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "43c62b9256a3ba6ee6270682f7fda6646ef96a0cb0ed182f0b00da3e0065e186", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- 
/examples/autotune/kda_gate_fwd_kernel_cache_sm90/JDB7JAU3WFAKKCIRCMQKGZBMBYW4WBWGNSG74TUX5PDJGOJ2CLOQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "48c3f4829bb140a509111320a3642c0e2dcb06c66c8dfe4e97ebc693393a12dd", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/KLWMK2PCX53G5X7APZPSDMDVB3N7TB5NUKRIJFTV4RBRSH3MONQQ/kda_gate_fwd_kernel.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "52ecc569e2bf766edfe07e5f21b0750edbf987ada2a2849675e443191f6c7361", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/LHSTUSP6EKUB4EH6D6VPU2WXGLR7IW3RCQSQ5WLNMMW5AG7LAD7Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', 
'*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "59e53a49fe22a81e10fe1faafa6ad732e3f45b7114250ed96d632dd01beb00ff", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/LTEIDA7H5ZHQZL4RR2OELBFJGRWTQGURZJGZF47P4CX3P2AHTCKQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 
'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "5cc88183e7ee4f0caf918e9c4584a9346d381a91ca4d92f3efe0afb7e8079895", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/ROJJGSWACR4K5G3OWYLO4A6Q6QGAM5DW4BH2NQHKCG3EHZWMYPWQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": 
"8b92934ac01478ae9b6eb616ee03d0f40c067476e04fa6c0ea11b643e6ccc3ed", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/S4HQYRTI3C2OZRBHPNMHUFLSPPSXSQJHZZJZW24V4TCTTIU4JS6A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "970f0c4668d8b4ecc4277b587a15727be5794127ce539b6b95e4c539a29c4cbc", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, 
"num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/WG4QQROEXFGCZ3LYUZRAISDRLGB6BRGGY4RS45ZNIZDVRTWLRHSA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b1b90845c4b94c2ced78a6620448715983e0c4c6c7232e772d464758cecb89e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/WK6MKXOWVAYTCMDOQT5PKFGH3YCF3PCK54HDOVFWUPECNQZVVMGA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b2bcc55dd6a83131306e84faf514c7de045dbc4aef0e3754b6a3c826c335ab0c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], 
"default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/Y2FJGW5TXEL6YELXBO4X4BIC6PP4GKBQFZEB34NKY5XNWRNDIZUQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "c68a935bb3b917ec11770bb97e0502f3dfc328302e481df1aac76edb45a34669", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": 
[["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm90/ZAMAV7JUWTDP5URUGLBWWKZ6CKTAOI4H7PMKTXB4ZCDDWHENZMUA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "c8180afd34b4c6fed23432c36b2b3e12a6072387fbd8a9dc3cc8863b1c8dcb28", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/root/miniconda3/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": 
true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/52QBMC4BRT5QHKUCA5JXTV5JWB3OZ7DCRZBVPOU47IG3BUEYMO4A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "eea0160b818cfb03aa82075379d7a9b076ecfc628e4357ba9cfa0db0d09863b8", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, 
"tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/5FL7IJFMUYZQGBFXV6724VXMB2H45RT4EUYVKHD7XJ7JNOKZ2TRA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e957f424aca6330304b7afbfae56ec0e8fcec67c2531551c7fba7e96b959d4e2", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/CF43WNCF5YMKCVCXPTDMIRUB6UCGO4DZB2KQRJJKV6K7L6ZFWW7A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1179bb3445ee18a154577cc6c44681f5046770790e9508a52aaf95f5fb25b5be", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- 
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/D4OD4ZC3ME3UMHFZOWSYPJVC64VYXQ45BQUZ7N7ZBQD5Y2YCQR5A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1f1c3e645b6137461cb975a587a6a2f72b8bc39d0c299fb7f90c07dc6b02847a", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/HQQSBVMUM6AOU3ICEFGUEBA3EIMEKEIZKDYL27Z26SP6ABY5CZXQ/kda_gate_fwd_kernel.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "3c2120d5946780ea6d02214d42041b221845111950f0bd7f3af49fe0071d166f", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/MN74TNF2LXOHDQ7JWHSBXBXDEYNEHVXNNQYYYU4I7UJXGZV57FTA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 
'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "637fc9b4ba5ddc71c3e9b1e41b86e3261a43d6ed6c318c5388fd137366bdf966", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/UOQTXANLCIU7DDLW3OAQU4IGAOBMAOKI65PO65DGP5NOEN4XAUUQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, 
False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "a3a13b81ab1229f18d76db810a71060382c03948f75eef74667f5ae237970529", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/VRLASYKFGKAKD6S2LCNWFCL36AD52XDB2QX6FHCKZ2LYMTTOISPA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), 
('HAS_BIAS', 'constexpr', False, True))", "hash": "ac560961453280a1fa5a589b62897bf007dd5c61d42fe29c4ace97864e6e449e", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/3J7PIE43DAZ7RUJM7K3BVQRXM6UADBVU5PLHUNWFTOPEYNC746WA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "da7ef4139b1833f8d12cfab61ac23767a80186b4ebd67a36c59b9e4c345fe7ac", "target": {"backend": "cuda", 
"arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/43CRH27SFXI4LZFC44JA5STEEQ2L6MBRWI3YABOLNJH7C32G373A/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e6c513ebf22dd1c5e4a2e7120eca642434bf3031b2378005cb6a4ff16f46dff6", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], 
"ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/456X6RCBTYY2CJLGJR5DWN3QQJEOTE6FJVEK3PJJRKS3DU5WZEOQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "e77d7f44419e31a125664c7a3b37708248e993c54d48adbd298aa5b1d3b6c91d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, 
"supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/6GXRVHZYV6JQF7YTWZAZJQ7AZSBNPUP3BV3SRGNJJG4SHIXJECRA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f1af1a9f38af9302ff13b64194c3e0cc82d7d1fb0d772899a949b923a2e920a2", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", 
"allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/6U7U7HN66TGBRRNQCNOGDXNHQDK67IPORWGPUAJIZ4GJVYH3HN6Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "f53f4f9dbef4cc18c5b0135c61dda780d5efa1ee8d8cfa0128cf0c9ae0fb3b7d", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", 
"/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/7VPODQVY6ADLGD5A4PAOUFY3ID6FGPUI52LZLSNRYAOKZEM3GJXA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "fd5ee1c2b8f006b30fa0e3c0ea171b40fc533e88ee9795c9b1c01cac919b326e", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": 
"cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/DZA3KX3D3GRGPKWBACG3DG6L6GUV7QVVAQM6VATF27WVAUHZ565Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "1e41b55f63d9a267aac1008db19bcbf1a95fc2b50419ea8265d7ed5050f9efbb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", 
"triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/JH5VISGO5YTN4D3RVGIYOSMDXV42EMSOZOLEI34HRKULL6CNCFAQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "49fb5448ceee26de0f71a991874983bd79a2324ecb96446f878aa8b5f84d1141", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, 
"profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/JXGYDIUIMXWTX4QUNU7244XKKUT7SPFH44HVO4AFURRKCOMGT63Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "4dcd81a28865ed3bf2146d3fae72ea5527f93ca7e70f577005a462a139869fb7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} 
-------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/KV6NGAQMO2J5DNTUIMBAW4GXZ2ISZWK6EEZ7MD2B2TOKBMN55L5Q/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "557cd3020c7693d1b67443020b70d7ce912cd95e2133f60f41d4dca0b1bdeafb", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- 
/examples/autotune/kda_gate_fwd_kernel_cache_sm75/O3BRYXGCQQFTUJVYXY6X53DO5MJVNR5J7J2U7OUUIP2SIQUVUXUA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "76c31c5cc2840b3a26b8be3d7eec6eeb1356c7a9fa754fba9443f5244295a5e8", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/SKQRWFMKMDSWZOOBL357YKR7VA2BIU7G4R7PCPWE6PZGSFLAWYEA/kda_gate_fwd_kernel.json: 
-------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "92a11b158a60e56cb9c15efbfc2a3fa8341453e6e47ef13ec4f3f2691560b608", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/THVH3NEAOI3OOMJVYJWTBZO3HHUY4S7PC62QV3GQE3CTMC4VP7TQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', 
'*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 128, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "99ea7db4807236e73135c26d30e5db39e98e4bef17b50aecd026c5360b957fe7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 2, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/TJPSS3CJUNFKTYOV2BGC4UWBL7NCDS2F2Z2CVA4MZ5OQDRYWLZFQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 
20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 64, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "9a5f296c49a34aa9e1d5d04c2e52c15fda21cb45d6742a838ccf5d01c7165e4b", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/VW3HSUCS6BWQFHZKBTGECL76ULLEMZM42DMEKDRG6Z6PPUZOSXDQ/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, 
True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "adb6795052f06d029f2a0ccc412ffea2d646659cd0d8450e26f67cf7d32e95c7", "target": {"backend": "cuda", "arch": 75, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /examples/autotune/kda_gate_fwd_kernel_cache_sm75/W7F4HYOMUNP4EA5G5VPAPTYPFP64XXCKQH23BYIKIWEHOBXPC6ZA/kda_gate_fwd_kernel.json: -------------------------------------------------------------------------------- 1 | {"kernel_signature": "(('g', '*fp32', 'D', False), ('A', '*fp32', 'D', False), ('y', '*fp32', 'D', False), ('g_bias', 'constexpr', None, False), ('beta', 'constexpr', 1.0, False), ('threshold', 'constexpr', 20.0, False), ('T', 'i32', '', False), ('H', 'i32', '', False), ('D', 'constexpr', 12, False), ('BT', 'constexpr', 32, True), ('BD', 'constexpr', 16, True), ('HAS_BIAS', 'constexpr', False, True))", "hash": "b7cbc3e1cca35fc203a6ed5e07cf0f2bfdcbdc4a81f5b0e10a45887706ef17b2", "target": {"backend": 
"cuda", "arch": 75, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/home/ubuntu/anaconda3/envs/triton/lib/python3.12/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm75", "instrumentation_mode": "", "TRITON_F32_DEFAULT": "ieee", "triton_version": "3.5.0", "triton_runner_version": "0.3.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "kda_gate_fwd_kernel"} -------------------------------------------------------------------------------- /triton_runner/bench/launch_latency/kernels.py: -------------------------------------------------------------------------------- 1 | # This file is adapted from: 2 | # https://github.com/pytorch-labs/tritonbench/blob/main/tritonbench/operators/launch_latency/kernels.py 3 | # 4 | # Copyright (c) Meta Platforms, Inc. and affiliates. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
def colored_warning(message, category, filename, lineno, file=None, line=None):
    """Write a warning as one bold, bright-yellow ANSI-colored line.

    The signature mirrors ``warnings.showwarning`` so this function can be
    installed as its replacement.

    Args:
        message: The warning message (anything usable in an f-string).
        category: The warning class; only its ``__name__`` is printed.
        filename: Name of the file the warning is attributed to.
        lineno: Line number the warning is attributed to.
        file: Writable text stream. Defaults to ``sys.stderr`` when ``None``.
        line: Accepted for signature compatibility; not used.

    Returns:
        None. The warning is silently dropped when no usable stream exists.
    """
    if file is None:
        import sys
        file = sys.stderr
        if file is None:
            # sys.stderr can be None (e.g. GUI or detached interpreters).
            # Emitting a warning must never crash the program, so drop it,
            # which is what the standard library's showwarning does as well.
            return
    formatted = f"\033[1m\033[93m{category.__name__}: {message} ({filename}:{lineno})\033[0m\n"
    try:
        file.write(formatted)
    except OSError:
        # The stream is invalid or closed at the OS level: the warning is
        # lost, matching the behavior of the standard library.
        pass
def check_cuda_arch(target):
    """Warn when the loaded kernel's CUDA arch differs from the device's.

    Reads the kernel architecture from the module-level ``_metadata`` at
    ``_metadata["target"]["arch"]`` and compares it with ``target.arch``.

    Args:
        target: Object exposing the device capability as ``target.arch``.

    Raises:
        KeyError: If ``_metadata`` is a plain dict without a ``"target"``
            entry or that entry has no ``"arch"`` key.
    """
    kernel_arch = _metadata["target"]["arch"]
    # The helper's signature is (kernel_arch, target_arch). Passing the values
    # in that order keeps the "kernel capability" / "device capability" labels
    # in the warning text attached to the right numbers.
    check_cuda_arch_with_capability(kernel_arch, target.arch)
2 | ```shell 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 4 | 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 6 | 7 | python examples/runner/v3.2.0/ttgir/sm90/matmul-with-dot-v2.py 8 | 9 | python examples/runner/v3.2.0/llir/sm90/matmul-with-dot-v2.py 10 | 11 | python examples/runner/v3.2.0/ptx/sm90/matmul-with-dot-v2.py 12 | 13 | python examples/runner/v3.2.0/cubin/sm90/matmul-with-dot-v2.py 14 | ``` 15 | 16 | ### sm80 (A100, A30) 17 | ```shell 18 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 19 | 20 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 21 | 22 | python examples/runner/v3.2.0/ttgir/sm80/matmul-with-dot-v2.py 23 | 24 | python examples/runner/v3.2.0/llir/sm80/matmul-with-dot-v2.py 25 | 26 | python examples/runner/v3.3.x/ptx/sm80/matmul-with-dot-v2.py 27 | ``` 28 | 29 | ### sm120 (RTX PRO 6000, RTX 5090, etc.) 30 | 31 | **not supported** 32 | 33 | ### sm86 (A10, RTX 3090, etc.) 34 | ```shell 35 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 36 | 37 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 38 | 39 | python examples/runner/v3.2.0/ttgir/sm86/matmul-with-dot-v2.py 40 | 41 | python examples/runner/v3.2.0/llir/sm86/matmul-with-dot-v2.py 42 | 43 | python examples/runner/v3.1.0/ptx/sm86/matmul-with-dot-v2.py 44 | 45 | python examples/runner/v3.4.0/cubin/sm86/matmul-with-dot-v2.py 46 | ``` 47 | 48 | ### sm75 (T4, RTX 2080, etc.) 
@triton_runner.jit
def softmax_kernel(
    input_ptr, output_ptr,
    N,
    BLOCK_SIZE: tl.constexpr
):
    """Softmax over a 1-D input of ``N`` elements, in three passes.

    Every program scans the whole input twice (once for the maximum, once
    for the sum of shifted exponentials) and then normalizes and stores only
    the ``BLOCK_SIZE`` elements selected by its own program id.
    """
    src = input_ptr.to(tl.pointer_type(tl.float32))
    dst = output_ptr.to(tl.pointer_type(tl.float32))
    lanes = tl.arange(0, BLOCK_SIZE)

    # Pass 1: lane-wise running maximum over all chunks, then reduce.
    # Out-of-range lanes load -inf so they never win the maximum.
    running_max = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - float("inf")
    for start in range(0, N, BLOCK_SIZE):
        idx = start + lanes
        chunk = tl.load(src + idx, mask=idx < N, other=-float("inf"))
        running_max = tl.maximum(chunk, running_max)
    global_max = tl.max(running_max, axis=0)

    # Pass 2: lane-wise sum of exp(x - max), then reduce.
    # Out-of-range lanes contribute exp(-inf) = 0.
    partial_sums = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for start in range(0, N, BLOCK_SIZE):
        idx = start + lanes
        chunk = tl.load(src + idx, mask=idx < N, other=-float("inf"))
        partial_sums += tl.exp(chunk - global_max)
    denom = tl.sum(partial_sums, axis=0)

    # Pass 3: normalize this program's own block and write it out.
    out_idx = tl.program_id(0) * BLOCK_SIZE + lanes
    in_bounds = out_idx < N
    vals = tl.load(src + out_idx, mask=in_bounds)
    shifted_exp = tl.exp(vals - global_max)
    tl.store(dst + out_idx, shifted_exp / denom, mask=in_bounds)
@triton_runner.jit
def softmax_kernel(
    input_ptr, output_ptr,
    N,
    BLOCK_SIZE: tl.constexpr
):
    """Softmax over a 1-D input of ``N`` elements using one streaming pass.

    Each program walks the whole input once, maintaining a running maximum
    and a running sum of exponentials that is rescaled whenever the maximum
    grows. It then normalizes and stores only the ``BLOCK_SIZE`` elements
    selected by its own program id.
    """
    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
    # Running maximum, starting at -inf in every lane.
    max_acc = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - float("inf")
    # Despite its name, log_acc holds the running sum of exp(a - max_acc),
    # not a logarithm.
    log_acc = tl.zeros([BLOCK_SIZE], dtype=tl.float32)

    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
        # Out-of-range lanes load -inf, so they contribute exp(-inf) = 0 below.
        a = tl.load(input_ptr + cols, mask=cols < N, other=-float("inf"))
        block_max = tl.max(a, axis=0)
        # Element-wise larger of the old running maximum and this chunk's maximum.
        max_acc_new = tl.where(max_acc > block_max, max_acc, block_max)

        raw_exp = tl.math.exp(a - max_acc_new)

        # Rescale the previous sum from the old maximum to the new one, then
        # add this chunk's contribution. This must read the OLD max_acc, which
        # is why max_acc is only overwritten afterwards.
        log_acc_new = tl.math.exp(max_acc - max_acc_new) * log_acc + tl.sum(raw_exp, axis=-1)

        log_acc = log_acc_new
        max_acc = max_acc_new

    pid = tl.program_id(0)
    offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offset < N
    x = tl.load(input_ptr + offset, mask=mask)
    # Final softmax value: exp(x - max) / sum(exp(. - max)).
    o = tl.math.exp(x - max_acc) / log_acc
    tl.store(output_ptr + offset, o, mask=mask)
def get_content(file_path):
    """Return the full text of ``file_path``.

    The file is read as UTF-8 and is closed before returning, so repeated
    calls do not leak file handles.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def get_lines(match):
    """Return the stripped, non-empty lines captured by group 1 of ``match``."""
    return [line.strip() for line in match.group(1).strip().splitlines() if line.strip()]


def main():
    """Run every documented example command for the current GPU and report failures.

    Commands are collected from the fenced ``shell`` blocks of:

    * the ``### sm<capability>`` section of the runner README for the
      installed Triton version,
    * the first ``shell`` block of ``doc/benchmark.md``,
    * every ``shell`` block of ``doc/dump.md`` (only when
      ``is_triton_geq_v3_3`` is true).

    Paths are relative, so this is expected to be run from the repository root.
    """
    device = torch.cuda.current_device()
    major, minor = torch.cuda.get_device_capability(device)
    capability = major * 10 + minor

    section_pattern = re.compile(rf"### sm{capability}.*?shell(.*?)```", re.DOTALL)
    runner_file_path = os.path.join("examples", "runner", f"v{uni_triton_version}", "README.md")
    match = section_pattern.search(get_content(runner_file_path))
    if not match:
        print(f"sm{capability} on triton v{triton.__version__} not found")
        return

    lines = get_lines(match)
    shell_pattern = re.compile(r"shell(.*?)```", re.DOTALL)

    bench_file_path = os.path.join("doc", "benchmark.md")
    bench_match = shell_pattern.search(get_content(bench_file_path))
    if bench_match:
        lines.extend(get_lines(bench_match))
    else:
        # Previously this crashed with AttributeError on a None match.
        triton_runner.color_print.yellow_print(f"no shell block found in {bench_file_path}")

    from triton_runner.version_utils import is_triton_geq_v3_3
    if is_triton_geq_v3_3:
        debug_file_path = os.path.join("doc", "dump.md")
        # Scan from the start of the file; the stray `1` that used to be passed
        # here was interpreted by finditer() as the start position.
        for m in shell_pattern.finditer(get_content(debug_file_path)):
            lines.extend(get_lines(m))

    triton_runner.color_print.yellow_print(f"TEST on triton v{triton_version}")
    fail_cmd = []
    for cmd in lines:
        triton_runner.color_print.blue_print(cmd)
        # The commands are shell strings taken from the repository's own docs,
        # so they are executed through the shell as written.
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        print("stdout:", result.stdout)
        print("return code:", result.returncode)
        if result.returncode:
            # stderr is captured above; surface it so failures can be diagnosed.
            print("stderr:", result.stderr)
            fail_cmd.append(cmd)
    if not fail_cmd:
        print(f"✅ ALL TEST PASS on triton v{triton_version}")
    else:
        triton_runner.color_print.yellow_print(f"❌ SOME TEST FAIL on triton v{triton_version}")
        print("\n".join(fail_cmd))


if __name__ == "__main__":
    main()
class TimerContext:
    """Context manager that records the wall-clock duration of its body.

    After the ``with`` block exits, ``elapsed_ms`` holds the elapsed time in
    milliseconds (measured with ``time.perf_counter``). When ``enabled`` is
    false nothing is measured and ``elapsed_ms`` stays ``None``.
    """

    def __init__(self, enabled=True):
        self.enabled = enabled
        self.elapsed_ms = None
        # Set in __enter__; kept as None so __exit__ can tell "never started".
        self._start_time = None

    def __enter__(self):
        if self.enabled:
            self._start_time = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        if self.enabled and self._start_time is not None:
            end_time = time.perf_counter()
            self.elapsed_ms = (end_time - self._start_time) * 1e3


def benchmark(name, unit_name="ms"):
    """Build a decorator that benchmarks a method over ``self.get_input_iter()``.

    The decorated method must return a callable; that callable is timed with
    ``do_bench`` once per input tuple yielded by ``self.get_input_iter()``.
    Only the timing of the last input is printed, labelled with ``name``.

    Args:
        name: Label printed next to the timing.
        unit_name: ``"us"`` prints the ``do_bench`` result multiplied by 1e3
            with a ``us`` suffix; any other value prints it unchanged with an
            ``ms`` suffix.

    The wrapper accepts an extra keyword ``enable_benchmark``. When it is
    exactly ``False`` the method is called directly as
    ``method(self, *args[0])`` and its result is returned; otherwise the
    benchmark loop runs and the wrapper returns ``None``. Other keyword
    arguments are not forwarded to the method.

    While benchmarking, the ``RUNNER_PROD`` environment variable is set to
    ``"1"``; its previous state is restored afterwards, even if the method or
    ``do_bench`` raises.
    """
    import functools

    def decorator(method):

        @functools.wraps(method)
        def wrapper(self, *args, **kwargs):
            if kwargs.pop("enable_benchmark", True) is False:
                return method(self, *args[0])

            previous = os.environ.get("RUNNER_PROD")
            os.environ["RUNNER_PROD"] = "1"
            try:
                input_iter = list(self.get_input_iter())
                last_idx = len(input_iter) - 1
                for idx, input_args in enumerate(input_iter):
                    fn = method(self, *input_args)
                    elapsed_time = do_bench(fn)
                    if idx == last_idx:
                        if unit_name == "us":
                            elapsed_time_str = f"{elapsed_time * 1e3:8.3f} us"
                        else:
                            elapsed_time_str = f"{elapsed_time:8.3f} ms"
                        print(f"[{name:<50}|] time: {elapsed_time_str}")
            finally:
                # Put the environment back exactly as the caller left it.
                if previous is None:
                    os.environ.pop("RUNNER_PROD", None)
                else:
                    os.environ["RUNNER_PROD"] = previous

        return wrapper

    return decorator
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel and compare the dumped tile with ``input``.

    A 64x64 buffer is passed to the launch as ``dump_tensor`` (presumably
    filled by the ``dl.dump(block)`` call in the kernel; confirm against
    triton_runner). The buffer is printed, then compared with the top-left
    64x64 tile of ``input`` and the maximum absolute difference is printed.
    """
    tile = 64

    def launch_grid(meta):
        # One program per BLOCK_SIZE x BLOCK_SIZE tile of the input.
        block = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, block), triton.cdiv(cols, block))

    captured = torch.empty((tile, tile), dtype=input.dtype, device=input.device)

    matrix_transpose_kernel[launch_grid](
        input, output, rows, cols,
        BLOCK_SIZE=tile,
        dump_tensor=captured,
    )

    triton_runner.color_print.blue_print(f"debug {captured}")
    reference_tile = input[:tile, :tile]
    max_diff = (reference_tile - captured).abs().max()
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
# Tiled 2-D transpose: each program loads one BLOCK_SIZE x BLOCK_SIZE tile of the
# row-major (rows x cols) input, transposes it, and stores it into the row-major
# (cols x rows) output. The transposed tile is also handed to `dl.dump` for
# inspection from the host side.
@triton_runner.jit
def matrix_transpose_kernel(input_ptr, output_ptr, rows, cols, BLOCK_SIZE: tl.constexpr):
    # Program ids along the two launch axes select the tile (row tile, column tile).
    row_index = tl.program_id(axis=0)
    col_index = tl.program_id(axis=1)
    offs_row = row_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_col = col_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Linear offsets into the input, using `cols` as the row stride.
    old_offs = offs_row[:, None] * cols + offs_col[None, :]
    # Guard tiles that extend past the matrix edge.
    mask = (offs_row[:, None] < rows) & (offs_col[None, :] < cols)
    block = tl.load(input_ptr + old_offs, mask=mask)
    transposed_block = tl.trans(block)

    # ===== DEBUG START =====
    dl.dump(transposed_block)
    # ===== DEBUG END =====

    # Linear offsets into the output, using `rows` as the row stride.
    new_block = offs_col[:, None] * rows + offs_row[None, :]
    # The mask is transposed so that it lines up with the transposed tile.
    tl.store(output_ptr + new_block, transposed_block, mask=mask.T)
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from its TTIR and check the dumped value.

    The launch is given the directory of this file as ``ttir_dir``, a 64x64
    ``dump_tensor`` and the name of the IR value to dump. The dumped buffer is
    printed and then compared with the top-left 64x64 tile of ``input``.
    """
    BLOCK_SIZE = 64
    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)

    # One program per BLOCK_SIZE x BLOCK_SIZE tile.
    grid = lambda meta: (
        triton.cdiv(rows, meta['BLOCK_SIZE']),
        triton.cdiv(cols, meta['BLOCK_SIZE']),
    )

    launch_options = dict(
        BLOCK_SIZE=BLOCK_SIZE,
        ttir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_tensor,
        # dump_value can be "%block_18"(block = tl.load)
        dump_value="%block_18",
    )
    matrix_transpose_kernel[grid](input, output, rows, cols, **launch_options)

    triton_runner.color_print.blue_print(f"debug {dump_tensor}")
    expected = input[:BLOCK_SIZE, :BLOCK_SIZE]
    max_diff = torch.abs(expected - dump_tensor).max()
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from its TTGIR and check the dumped value.

    The launch receives this file's directory as ``ttgir_dir``, a 64x64
    ``dump_tensor`` and the IR value name to dump. Afterwards the dump is
    printed and its maximum absolute difference from the top-left 64x64 tile
    of ``input`` is reported.
    """
    side = 64
    # dump_value can be "%block_38"(block = tl.load)
    ir_value_name = "%block_38"
    dump_buffer = torch.empty((side, side), dtype=input.dtype, device=input.device)

    def tiles(meta):
        # Number of tiles along the row axis and the column axis.
        per_tile = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, per_tile), triton.cdiv(cols, per_tile))

    matrix_transpose_kernel[tiles](
        input, output,
        rows, cols,
        BLOCK_SIZE=side,
        ttgir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=dump_buffer,
        dump_value=ir_value_name,
    )

    triton_runner.color_print.blue_print(f"debug {dump_buffer}")
    delta = input[:side, :side] - dump_buffer
    max_diff = torch.max(delta.abs())
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel from its TTIR and check the dumped transpose.

    The launch receives this file's directory as ``ttir_dir``, a 64x64
    ``dump_tensor`` and the IR value name ``%transposed_block``. The dump is
    printed and compared with the top-left 64x64 tile of ``output``, i.e. with
    what the kernel itself stored.
    """
    edge = 64
    scratch = torch.empty((edge, edge), dtype=input.dtype, device=input.device)

    def grid_for(meta):
        # One program per BLOCK_SIZE x BLOCK_SIZE tile of the input.
        step = meta['BLOCK_SIZE']
        return (triton.cdiv(rows, step), triton.cdiv(cols, step))

    matrix_transpose_kernel[grid_for](
        input, output, rows, cols,
        BLOCK_SIZE=edge,
        ttir_dir=triton_runner.get_file_dir(__file__),
        dump_tensor=scratch,
        # dump_value can be "%transposed_block"(transposed_block = tl.trans(block))
        dump_value="%transposed_block",
    )

    triton_runner.color_print.blue_print(f"debug {scratch}")
    stored_tile = output[:edge, :edge]
    max_diff = (stored_tile - scratch).abs().max()
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")
2 | ```shell 3 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 4 | 5 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 6 | 7 | python examples/runner/v3.3.x/ttgir/sm90/matmul-with-tma-v3.py 8 | 9 | python examples/runner/v3.3.x/llir/sm90/matmul-with-tma-v3.py 10 | 11 | python examples/runner/v3.3.x/ptx/sm90/matmul-with-tma-v3.py 12 | 13 | python examples/runner/v3.4.0/cubin/sm90/matmul-with-tma-v4.py 14 | ``` 15 | 16 | ### sm80 (A100, A30) 17 | ```shell 18 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 19 | 20 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 21 | 22 | python examples/runner/v3.4.0/ttgir/sm80/matmul-with-dot-v2.py 23 | 24 | python examples/runner/v3.3.x/llir/sm80/matmul-with-dot-v2.py 25 | 26 | python examples/runner/v3.3.x/ptx/sm80/matmul-with-dot-v2.py 27 | 28 | python examples/runner/v3.4.0/cubin/sm80/matmul-with-dot-v2.py 29 | ``` 30 | 31 | ### sm120 (RTX PRO 6000, RTX 5090, etc.) 32 | ```shell 33 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 34 | 35 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 36 | 37 | python examples/runner/v3.3.x/ttgir/sm120/matmul-with-tma-v3.py 38 | 39 | python examples/runner/v3.3.x/llir/sm120/matmul-with-tma-v3.py 40 | 41 | python examples/runner/v3.4.0/ptx/sm120/matmul-with-tma-v4.py 42 | 43 | python examples/runner/v3.4.0/cubin/sm120/matmul-with-tma-v4.py 44 | ``` 45 | 46 | ### sm86 (A10, RTX 3090, etc.) 47 | ```shell 48 | python examples/runner/v3.5.x/python/matmul-with-dot-v2.py 49 | 50 | python examples/runner/v3.5.x/ttir/matmul-with-dot/matmul-with-dot-v2.py 51 | 52 | python examples/runner/v3.4.0/ttgir/sm86/matmul-with-dot-v2.py 53 | 54 | python examples/runner/v3.3.x/llir/sm86/matmul-with-dot-v2.py 55 | 56 | python examples/runner/v3.3.x/ptx/sm86/matmul-with-dot-v2.py 57 | 58 | python examples/runner/v3.4.0/cubin/sm86/matmul-with-dot-v2.py 59 | ``` 60 | 61 | ### sm75 (T4, RTX 2080, etc.) 
def solve(input: torch.Tensor, output: torch.Tensor, rows: int, cols: int):
    """Launch the transpose kernel and check the dumped boundary tile.

    A 64x64 buffer is passed to the launch as ``dump_tensor`` (presumably
    filled by ``dl.dump_boundary(transposed_block)`` in the kernel; confirm
    against triton_runner). The buffer is printed, then its valid top-left
    region is compared with the bottom-right corner of ``output`` that lies
    past the last full tile boundary in each dimension.

    Args:
        input: ``rows x cols`` source matrix.
        output: ``cols x rows`` destination matrix, written by the kernel.
        rows: Number of rows of ``input``.
        cols: Number of columns of ``input``.
    """
    grid = lambda meta: (triton.cdiv(rows, meta['BLOCK_SIZE']), triton.cdiv(cols, meta['BLOCK_SIZE']))

    BLOCK_SIZE = 64
    dump_tensor = torch.empty((BLOCK_SIZE, BLOCK_SIZE), dtype=input.dtype, device=input.device)

    matrix_transpose_kernel[grid](
        input, output,
        rows, cols,
        BLOCK_SIZE=BLOCK_SIZE,
        dump_tensor=dump_tensor,
    )
    triton_runner.color_print.blue_print(f"debug {dump_tensor}")
    dump_torch = output
    # Round each extent down to a multiple of BLOCK_SIZE (valid because
    # BLOCK_SIZE is a power of two) to find where the boundary tile starts.
    boundary_start_rows = rows & (~(BLOCK_SIZE - 1))
    # Must be derived from `cols`: using `rows` here only worked by accident
    # when both extents round down to the same multiple (e.g. 104 and 78 -> 64).
    boundary_start_cols = cols & (~(BLOCK_SIZE - 1))
    # `output` is cols x rows, so the column boundary indexes its first axis.
    expected = dump_torch[boundary_start_cols:, boundary_start_rows:]
    dumped = dump_tensor[:cols - boundary_start_cols, :rows - boundary_start_rows]
    max_diff = torch.max(torch.abs(expected - dumped))
    triton_runner.color_print.yellow_print(f"The maximum difference between torch and dump is {max_diff}")