├── .clang-format ├── .clang-tidy ├── .githooks ├── install └── pre-commit ├── .github ├── scripts │ ├── aiter_test.sh │ ├── build_aiter_triton.sh │ ├── check_deps.sh │ ├── check_signal.sh │ ├── clean_up_rocm.sh │ └── op_tune.sh └── workflows │ ├── aiter-release.yaml │ ├── aiter-test.yaml │ ├── operators-tuning.yaml │ ├── pre-checks.yaml │ ├── sglang_downstream.yaml │ ├── test-network.yaml │ ├── triton-test.yaml │ └── vllm_benchmark.yaml ├── .gitignore ├── .gitmodules ├── 3rdparty └── ck_helper │ └── ck │ └── config.h ├── CONTRIBUTE.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── aiter ├── __init__.py ├── aot │ ├── __init__.py │ ├── asm_mla_decode_fwd.py │ ├── pa.py │ ├── pa_ragged.py │ ├── pa_v1.py │ ├── test │ │ ├── matmul_fp16.py │ │ ├── test.sh │ │ └── test_matmul.cpp │ └── triton │ │ ├── decode_mla.py │ │ └── norm.py ├── bert_padding.py ├── configs │ ├── __init__.py │ ├── a4w4_blockscale_tuned_gemm.csv │ ├── a4w4_blockscale_untuned_gemm.csv │ ├── a8w8_blockscale_bpreshuffle_tuned_gemm.csv │ ├── a8w8_blockscale_bpreshuffle_untuned_gemm.csv │ ├── a8w8_blockscale_tuned_gemm.csv │ ├── a8w8_blockscale_untuned_gemm.csv │ ├── a8w8_bpreshuffle_cktile_tuned_gemm.csv │ ├── a8w8_bpreshuffle_cktile_untuned_gemm.csv │ ├── a8w8_bpreshuffle_tuned_gemm.csv │ ├── a8w8_bpreshuffle_untuned_gemm.csv │ ├── a8w8_tuned_batched_gemm.csv │ ├── a8w8_tuned_gemm.csv │ ├── a8w8_untuned_batched_gemm.csv │ ├── a8w8_untuned_gemm.csv │ ├── asm_a8w8_gemm.csv │ ├── bf16_tuned_batched_gemm.csv │ ├── bf16_tuned_gemm.csv │ ├── bf16_untuned_batched_gemm.csv │ ├── bf16_untuned_gemm.csv │ ├── model_configs │ │ ├── README.md │ │ ├── a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv │ │ ├── a8w8_blockscale_tuned_fmoe_qwen3_235b.csv │ │ ├── a8w8_blockscale_tuned_gemm_ds_v3.csv │ │ ├── a8w8_blockscale_tuned_gemm_qwen3_235b.csv │ │ ├── a8w8_blockscale_untuned_fmoe_qwen3_235b.csv │ │ ├── a8w8_blockscale_untuned_gemm_ds_v3.csv │ │ ├── a8w8_blockscale_untuned_gemm_qwen3_235b.csv │ │ ├── a8w8_bpreshuffle_tuned_gemm_dsv3.csv │ │ ├── gptoss_bf16_tuned_gemm.csv │ │ ├── llama405B_untuned_gemm.csv │ │ ├── llama405B_untuned_gemm_bf16.csv │ │ ├── llama70B_untuned_gemm.csv │ │ ├── llama70B_untuned_gemm_bf16.csv │ │ ├── qwen32B_untuned_gemm.csv │ │ └── qwen32B_untuned_gemm_bf16.csv │ ├── tuned_fmoe.csv │ └── untuned_fmoe.csv ├── dist │ ├── __init__.py │ ├── communication_op.py │ ├── cuda_wrapper.py │ ├── device_communicators │ │ ├── all2all.py │ │ ├── base_device_communicator.py │ │ ├── communicator_cuda.py │ │ ├── communicator_pynccl.py │ │ ├── custom_all_reduce.py │ │ ├── pynccl_wrapper.py │ │ └── quick_all_reduce.py │ ├── parallel_state.py │ ├── shm_broadcast.py │ └── utils.py ├── fused_moe.py ├── fused_moe_bf16_asm.py ├── fused_moe_dp_shared_expert.py ├── int4_utils.py ├── jit │ ├── __init__.py │ ├── core.py │ ├── optCompilerConfig.json │ └── utils │ │ ├── __init__.py │ │ ├── _cpp_extension_versioner.py │ │ ├── chip_info.py │ │ ├── cpp_extension.py │ │ ├── file_baton.py │ │ ├── hipify │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── cuda_to_hip_mappings.py │ │ └── hipify_python.py │ │ └── torch_guard.py ├── mla.py ├── ops │ ├── __init__.py │ ├── activation.py │ ├── aiter_operator.py │ ├── attention.py │ ├── batched_gemm_op_a8w8.py │ ├── batched_gemm_op_bf16.py │ ├── cache.py │ ├── communication.py │ ├── custom.py │ ├── custom_all_reduce.py │ ├── deepgemm.py │ ├── enum.py │ ├── fused_mrope_rms.py │ ├── gemm_op_a16w16.py │ ├── gemm_op_a4w4.py │ ├── gemm_op_a8w8.py │ ├── gemm_op_common.py │ ├── gradlib.py │ ├── mha.py │ ├── moe_op.py │ ├── moe_sorting.py │ ├── norm.py │ ├── pos_encoding.py │ ├── quant.py │ ├── quick_all_reduce.py │ ├── rmsnorm.py │ ├── rope.py │ ├── sample.py │ ├── sampling.py │ ├── shuffle.py │ ├── topk.py │ ├── topk_plain.py │ ├── trans_ragged_layout.py │ └── triton │ │ ├── __init__.py │ │ ├── _triton_kernels │ │ ├── activation.py │ │ ├── batched_gemm_a16wfp4.py │ │ ├── batched_gemm_a8w8.py │ │ ├── batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py │ │ ├── batched_gemm_afp4wfp4.py │ │ ├── batched_gemm_bf16.py │ │ ├── chunked_pa_prefill.py │ │ ├── extend_attention.py │ │ ├── ff_a16w16_fused_gated.py │ │ ├── ff_a16w16_fused_ungated.py │ │ ├── flash_attn_triton_amd │ │ │ ├── __init__.py │ │ │ ├── bwd.py │ │ │ ├── fwd_decode.py │ │ │ ├── fwd_prefill.py │ │ │ ├── interface_v2.py │ │ │ ├── interface_v3.py │ │ │ └── utils.py │ │ ├── fp8_mqa_logits.py │ │ ├── fused_add_rmsnorm_pad.py │ │ ├── fused_fp8_quant.py │ │ ├── fused_gemm_a8w8_blockscale_a16w16.py │ │ ├── fused_gemm_afp4wfp4_a16w16.py │ │ ├── fused_gemm_afp4wfp4_mul_add.py │ │ ├── fused_gemm_afp4wfp4_split_cat.py │ │ ├── fused_kv_cache.py │ │ ├── fused_mul_add.py │ │ ├── fused_mxfp4_quant.py │ │ ├── fused_qk_concat.py │ │ ├── fused_qkv_split_qk_rope.py │ │ ├── gemm_a16w16.py │ │ ├── gemm_a16w16_atomic.py │ │ ├── gemm_a16w16_gated.py │ │ ├── gemm_a16w8_blockscale.py │ │ ├── gemm_a16wfp4.py │ │ ├── gemm_a8w8.py │ │ ├── gemm_a8w8_blockscale.py │ │ ├── gemm_a8w8_per_token_scale.py │ │ ├── gemm_a8wfp4.py │ │ ├── gemm_afp4wfp4.py │ │ ├── gmm.py │ │ ├── hstu_attention.py │ │ ├── lean_atten.py │ │ ├── lean_atten_paged.py │ │ ├── mha.py │ │ ├── mha_fused_bwd.py │ │ ├── mha_onekernel_bwd.py │ │ ├── mla_decode_rope.py │ │ ├── moe_align_block_size.py │ │ ├── moe_op.py │ │ ├── moe_op_e2e.py │ │ ├── moe_op_gelu.py │ │ ├── moe_op_gemm_a8w4.py │ │ ├── moe_op_gemm_a8w8.py │ │ ├── moe_op_mxfp4.py │ │ ├── moe_op_mxfp4_silu_fused.py │ │ ├── moe_op_silu_fused.py │ │ ├── moe_routing │ │ │ ├── bitmatrix.py │ │ │ ├── expt_data.py │ │ │ ├── routing.py │ │ │ └── topk.py │ │ ├── moe_routing_sigmoid_top1_fused.py │ │ ├── norm.py │ │ ├── pa_decode.py │ │ ├── pa_mqa_logits.py │ │ ├── pa_prefill.py │ │ ├── pod_attention.py │ │ ├── prefill_attention.py │ │ ├── quant.py │ │ ├── quant_moe.py │ │ ├── rmsnorm.py │ │ ├── rope.py │ │ ├── softmax.py │ │ ├── split_qkv.py │ │ ├── topk.py │ │ ├── unified_attention.py │ │ └── unified_attention_sparse_mla.py │ │ ├── activation.py │ │ ├── batched_gemm_a16wfp4.py │ │ ├── batched_gemm_a8w8.py │ │ ├── batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py │ │ ├── batched_gemm_afp4wfp4.py │ │ ├── batched_gemm_afp4wfp4_pre_quant.py │ │ ├── batched_gemm_bf16.py │ │ ├── chunked_pa_prefill.py │ │ ├── configs │ │ ├── gemm │ │ │ ├── aot │ │ │ │ ├── README.md │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192 │ │ │ │ │ ├── _gemm_afp4wfp4_preshuffle_kernel.hsaco │ │ │ │ │ └── _gemm_afp4wfp4_preshuffle_kernel.json │ │ │ ├── gfx942-BATCHED_GEMM-A16W16.json │ │ │ ├── gfx942-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT-N=128-K=512.json │ │ │ ├── gfx942-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT-N=512-K=128.json │ │ │ ├── gfx942-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT.json │ │ │ ├── gfx942-BATCHED_GEMM-A8W8.json │ │ │ ├── gfx942-FF-A16W16-fused.json │ │ │ ├── gfx942-FUSED-GEMM-A8W8_BLOCKSCALE-A16W16.json │ │ │ ├── gfx942-GEMM-A16W16-ATOMIC.json │ │ │ ├── gfx942-GEMM-A16W16-gated.json │ │ │ ├── gfx942-GEMM-A16W16.json │ │ │ ├── gfx942-GEMM-A16W8_BLOCKSCALE.json │ │ │ ├── gfx942-GEMM-A8W8.json │ │ │ ├── gfx942-GEMM-A8W8_BLOCKSCALE.json │ │ │ ├── gfx942-GEMM-A8W8_PER_TOKEN_SCALE.json │ │ │ ├── gfx950-BATCHED_GEMM-A16W16.json │ │ │ ├── gfx950-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT-N=128-K=512.json │ │ │ ├── gfx950-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT-N=512-K=128.json │ │ │ ├── gfx950-BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT.json │ │ │ ├── gfx950-BATCHED_GEMM-A8W8.json │ │ │ ├── gfx950-BATCHED_GEMM-AFP4WFP4-N=128-K=512.json │ │ │ ├── gfx950-BATCHED_GEMM-AFP4WFP4-N=512-K=128.json │ │ │ ├── gfx950-BATCHED_GEMM-AFP4WFP4.json │ │ │ ├── gfx950-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=128-K=512.json │ │ │ ├── gfx950-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=512-K=128.json │ │ │ ├── gfx950-BATCHED_GEMM_PREQUANT-AFP4WFP4.json │ │ │ ├── gfx950-FF-A16W16-fused.json │ │ │ ├── gfx950-FUSED-GEMM-A8W8_BLOCKSCALE-A16W16-N8=512-N16=256-K=7168.json │ │ │ ├── gfx950-FUSED-GEMM-A8W8_BLOCKSCALE-A16W16.json │ │ │ ├── gfx950-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json │ │ │ ├── gfx950-FUSED-GEMM-AFP4WFP4-A16W16.json │ │ │ ├── gfx950-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json │ │ │ ├── gfx950-GEMM-A16W16-ATOMIC-N=256-K=7168.json │ │ │ ├── gfx950-GEMM-A16W16-ATOMIC.json │ │ │ ├── gfx950-GEMM-A16W16-N=128-K=2880.json │ │ │ ├── gfx950-GEMM-A16W16-N=256-K=7168.json │ │ │ ├── gfx950-GEMM-A16W16-N=2880-K=4096.json │ │ │ ├── gfx950-GEMM-A16W16-N=2880-K=512.json │ │ │ ├── gfx950-GEMM-A16W16-N=5120-K=2880.json │ │ │ ├── gfx950-GEMM-A16W16-N=640-K=2880.json │ │ │ ├── gfx950-GEMM-A16W16-gated.json │ │ │ ├── gfx950-GEMM-A16W16.json │ │ │ ├── gfx950-GEMM-A16W8_BLOCKSCALE-N=7168-K=2048.json │ │ │ ├── gfx950-GEMM-A16W8_BLOCKSCALE.json │ │ │ ├── gfx950-GEMM-A16WFP4-N=512-K=7168.json │ │ │ ├── gfx950-GEMM-A16WFP4-N=7168-K=2048.json │ │ │ ├── gfx950-GEMM-A16WFP4.json │ │ │ ├── gfx950-GEMM-A8W8.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=1024-K=8192.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=2112-K=7168.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=3072-K=1536.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=32768-K=8192.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=4096-K=7168.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=4608-K=7168.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=512-K=7168.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=7168-K=2048.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=7168-K=256.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=8192-K=1024.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=8192-K=32768.json │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE.json │ │ │ ├── gfx950-GEMM-A8W8_PER_TOKEN_SCALE-N=1024-K=8192.json │ │ │ ├── gfx950-GEMM-A8W8_PER_TOKEN_SCALE-N=32768-K=8192.json │ │ │ ├── gfx950-GEMM-A8W8_PER_TOKEN_SCALE-N=8192-K=1024.json │ │ │ ├── gfx950-GEMM-A8W8_PER_TOKEN_SCALE-N=8192-K=32768.json │ │ │ ├── gfx950-GEMM-A8W8_PER_TOKEN_SCALE.json │ │ │ ├── gfx950-GEMM-A8WFP4.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=106496-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=1280-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=13312-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=13312.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=2048.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=26624.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=4096.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=53248.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=6656.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=16384-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=18432-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=2112-K=7168.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=2304-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=26624-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=3072-K=1536.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=4608-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=4608-K=7168.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=512-K=7168.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=53248-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=7168-K=2048.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=7168-K=2304.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=7168-K=256.json │ │ │ ├── gfx950-GEMM-AFP4WFP4-N=9216-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=10240-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=106496-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=1280-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=14336-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=16384-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=16384-K=53248.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=18432-K=16384.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=2560-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=28672-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=5120-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=57344-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=7168-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=1024.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=14336.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=2048.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=28672.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=3584.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=4096.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=7168.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=8192.json │ │ │ ├── gfx950-GEMM-AFP4WFP4_PRESHUFFLED.json │ │ │ ├── gfx950-GEMM_PREQUANT-AFP4WFP4-N=512-K=7168.json │ │ │ ├── gfx950-GEMM_PREQUANT-AFP4WFP4.json │ │ │ └── gluon │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=2112-K=7168.json │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=3072-K=1536.json │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=4608-K=7168.json │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=512-K=7168.json │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=7168-K=2048.json │ │ │ │ ├── gfx950-GEMM-A8W8_BLOCKSCALE-N=7168-K=256.json │ │ │ │ └── gfx950-GEMM-A8W8_BLOCKSCALE.json │ │ ├── gfx942-EXTEND_ATTENTION.json │ │ ├── gfx942-GMM.json │ │ ├── gfx942-LEANATTN-DEFAULT.json │ │ ├── gfx942-MHA-DEFAULT.json │ │ ├── gfx942-MLA_DECODE_ROPE-DEFAULT.json │ │ ├── gfx950-EXTEND_ATTENTION.json │ │ ├── gfx950-GMM.json │ │ ├── gfx950-MHA-DEFAULT.json │ │ ├── gfx950-MLA_DECODE_ROPE-DEFAULT.json │ │ ├── hstu_attn │ │ │ ├── gfx942-HSTU_ATTN_BWD.json │ │ │ ├── gfx942-HSTU_ATTN_FWD.json │ │ │ ├── gfx950-HSTU_ATTN_BWD.json │ │ │ └── gfx950-HSTU_ATTN_FWD.json │ │ └── moe │ │ │ ├── gfx942-MOE-DEFAULT.json │ │ │ ├── gfx942-MOE-FP8_W8A8.json │ │ │ ├── gfx942-MOE-INT4_W4A16.json │ │ │ ├── gfx942-MOE-INT8_W8A16.json │ │ │ ├── gfx942-MOE-INT8_W8A8.json │ │ │ ├── gfx942-MOE_ROUTING_SIGMOID_TOPK1.json │ │ │ ├── gfx950-MOE-DEFAULT.json │ │ │ ├── gfx950-MOE-FP8_W8A8.json │ │ │ ├── gfx950-MOE-INT4_W4A16.json │ │ │ ├── gfx950-MOE-INT8_W8A16.json │ │ │ ├── gfx950-MOE-INT8_W8A8.json │ │ │ ├── gfx950-MOE-MX_FP4.json │ │ │ └── gfx950-MOE_ROUTING_SIGMOID_TOPK1.json │ │ ├── extend_attention.py │ │ ├── ff_a16w16.py │ │ ├── ff_a16w16_fused_gated.py │ │ ├── ff_a16w16_fused_ungated.py │ │ ├── fp8_mqa_logits.py │ │ ├── fused_add_rmsnorm_pad.py │ │ ├── fused_fp8_quant.py │ │ ├── fused_gemm_a8w8_blockscale_a16w16.py │ │ ├── fused_gemm_afp4wfp4_a16w16.py │ │ ├── fused_gemm_afp4wfp4_mul_add.py │ │ ├── fused_gemm_afp4wfp4_split_cat.py │ │ ├── fused_kv_cache.py │ │ ├── fused_mul_add.py │ │ ├── fused_mxfp4_quant.py │ │ ├── fused_qk_concat.py │ │ ├── fused_qkv_split_qk_rope.py │ │ ├── gemm_a16w16.py │ │ ├── gemm_a16w16_agnostic.py │ │ ├── gemm_a16w16_atomic.py │ │ ├── gemm_a16w16_gated.py │ │ ├── gemm_a16w8_blockscale.py │ │ ├── gemm_a16wfp4.py │ │ ├── gemm_a8w8.py │ │ ├── gemm_a8w8_blockscale.py │ │ ├── gemm_a8w8_per_token_scale.py │ │ ├── gemm_a8wfp4.py │ │ ├── gemm_afp4wfp4.py │ │ ├── gemm_afp4wfp4_pre_quant_atomic.py │ │ ├── gluon │ │ ├── gemm_a8w8_blockscale.py │ │ └── pa_mqa_logits.py │ │ ├── gmm.py │ │ ├── hstu_attention.py │ │ ├── lean_atten.py │ │ ├── lean_atten_paged.py │ │ ├── mha.py │ │ ├── mha_fused_bwd.py │ │ ├── mha_onekernel_bwd.py │ │ ├── mha_v3.py │ │ ├── mla_decode_rope.py │ │ ├── moe_align_block_size.py │ │ ├── moe_op.py │ │ ├── moe_op_e2e.py │ │ ├── moe_op_gelu.py │ │ ├── moe_op_gemm_a8w4.py │ │ ├── moe_op_gemm_a8w8.py │ │ ├── moe_op_mxfp4.py │ │ ├── moe_op_mxfp4_silu_fused.py │ │ ├── moe_op_silu_fused.py │ │ ├── moe_routing │ │ ├── bitmatrix.py │ │ ├── routing.py │ │ └── topk.py │ │ ├── moe_routing_sigmoid_top1_fused.py │ │ ├── norm.py │ │ ├── pa_decode.py │ │ ├── pa_mqa_logits.py │ │ ├── pa_prefill.py │ │ ├── pod_attention.py │ │ ├── prefill_attention.py │ │ ├── quant.py │ │ ├── quant_moe.py │ │ ├── rmsnorm.py │ │ ├── rope.py │ │ ├── softmax.py │ │ ├── split_qkv.py │ │ ├── topk.py │ │ ├── unified_attention.py │ │ ├── unified_attention_sparse_mla.py │ │ └── utils │ │ ├── __init__.py │ │ ├── _triton │ │ ├── arch_info.py │ │ ├── kernel_repr.py │ │ ├── mha_kernel_utils.py │ │ ├── moe_common.py │ │ └── pid_preprocessing.py │ │ ├── common_utils.py │ │ ├── core.py │ │ ├── device_info.py │ │ ├── gmm_common.py │ │ ├── la_kernel_utils.py │ │ ├── logger.py │ │ ├── mha_kernel_utils.py │ │ ├── moe_common.py │ │ ├── moe_config_utils.py │ │ └── types.py ├── paged_attn.py ├── rotary_embedding.py ├── test_common.py ├── test_mha_common.py ├── tuned_gemm.py └── utility │ ├── base_tuner.py │ ├── dtypes.py │ ├── fp4_utils.py │ ├── mp_tuner.py │ └── triton │ ├── README.md │ └── triton_metadata_redirect.py ├── aiter_logs ├── readme.md └── run.py ├── csrc ├── ck_batched_gemm_a8w8 │ ├── README.md │ ├── batched_gemm_a8w8.cu │ ├── batched_gemm_a8w8_common.py │ ├── batched_gemm_a8w8_tune.cu │ ├── batched_gemm_a8w8_tune.py │ ├── gen_instances.py │ └── include │ │ ├── batched_gemm_a8w8.h │ │ └── batched_gemm_a8w8_common.cuh ├── ck_batched_gemm_bf16 │ ├── README.md │ ├── batched_gemm_bf16.cu │ ├── batched_gemm_bf16_common.py │ ├── batched_gemm_bf16_tune.cu │ ├── batched_gemm_bf16_tune.py │ ├── gen_instances.py │ └── include │ │ ├── batched_gemm_bf16.h │ │ └── batched_gemm_bf16_common.cuh ├── ck_deepgemm │ ├── deepgemm.cu │ ├── deepgemm_common.py │ ├── gen_instances.py │ └── include │ │ ├── deepgemm.h │ │ └── deepgemm_common.cuh ├── ck_gemm_a4w4_blockscale │ ├── README.md │ ├── gemm_a4w4_blockscale.cu │ ├── gemm_a4w4_blockscale_common.py │ ├── gemm_a4w4_blockscale_tune.cu │ ├── gemm_a4w4_blockscale_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a4w4_blockscale.h │ │ └── gemm_a4w4_blockscale_common.cuh ├── ck_gemm_a8w8 │ ├── README.md │ ├── gemm_a8w8.cu │ ├── gemm_a8w8_common.py │ ├── gemm_a8w8_tune.cu │ ├── gemm_a8w8_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a8w8.h │ │ └── gemm_a8w8_common.cuh ├── ck_gemm_a8w8_blockscale │ ├── README.md │ ├── gemm_a8w8_blockscale.cu │ ├── gemm_a8w8_blockscale_common.py │ ├── gemm_a8w8_blockscale_tune.cu │ ├── gemm_a8w8_blockscale_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a8w8_blockscale.h │ │ └── gemm_a8w8_blockscale_common.cuh ├── ck_gemm_a8w8_blockscale_bpreshuffle │ ├── README.md │ ├── gemm_a8w8_blockscale_bpreshuffle.cu │ ├── gemm_a8w8_blockscale_bpreshuffle_common.py │ ├── gemm_a8w8_blockscale_bpreshuffle_tune.cu │ ├── gemm_a8w8_blockscale_bpreshuffle_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a8w8_blockscale_bpreshuffle.h │ │ └── gemm_a8w8_blockscale_bpreshuffle_common.cuh ├── ck_gemm_a8w8_bpreshuffle │ ├── README.md │ ├── gemm_a8w8_bpreshuffle.cu │ ├── gemm_a8w8_bpreshuffle_common.py │ ├── gemm_a8w8_bpreshuffle_tune.cu │ ├── gemm_a8w8_bpreshuffle_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a8w8_bpreshuffle.h │ │ └── gemm_a8w8_bpreshuffle_common.cuh ├── ck_gemm_moe_2stages_codegen │ ├── gemm_moe_ck2stages.cu │ ├── gemm_moe_ck2stages.h │ ├── gemm_moe_ck2stages_common.cuh │ ├── gemm_moe_ck2stages_common.py │ ├── gemm_moe_ck2stages_common_blockscale.cuh │ ├── gemm_moe_ck2stages_common_mxfp4.cuh │ ├── gemm_moe_ck2stages_common_mxfp4_bns.cuh │ └── gen_instances.py ├── ck_tile_gemm_moe_2stages │ ├── gen_instances.py │ ├── include │ │ ├── moe_cktile2stages.h │ │ └── moe_cktile2stages_common.cuh │ ├── moe_cktile2stages.cu │ └── moe_cktile2stages_common.py ├── cktile_gemm_a8w8_bpreshuffle │ ├── README.md │ ├── gemm_a8w8_bpreshuffle_cktile.cu │ ├── gemm_a8w8_bpreshuffle_cktile_common.py │ ├── gemm_a8w8_bpreshuffle_cktile_tune.cu │ ├── gemm_a8w8_bpreshuffle_cktile_tune.py │ ├── gen_instances.py │ └── include │ │ ├── gemm_a8w8_bpreshuffle_cktile.h │ │ └── gemm_a8w8_bpreshuffle_cktile_common.cuh ├── cpp_itfs │ ├── README.MD │ ├── __init__.py │ ├── lru_cache.h │ ├── mha_bwd_generate.py │ ├── mha_fwd_generate.py │ ├── mla │ │ ├── Makefile │ │ ├── asm_mla_decode_fwd.cpp │ │ ├── asm_mla_decode_fwd.cpp.jinja │ │ ├── asm_mla_decode_fwd.h │ │ ├── asm_mla_decode_fwd.py │ │ ├── asm_mla_decode_fwd_test.cpp │ │ └── asm_mla_decode_fwd_test.py │ ├── moe │ │ ├── asm_moe.cpp.jinja │ │ ├── asm_moe.py │ │ └── test_asm_moe.py │ ├── pa │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── pa.cpp.jinja │ │ ├── pa.cuh │ │ ├── pa.py │ │ ├── pa_common.cuh │ │ ├── pa_kernels.cuh │ │ ├── pa_ragged.cpp │ │ ├── pa_ragged.cpp.jinja │ │ ├── pa_ragged.cuh │ │ ├── pa_ragged.h │ │ ├── pa_ragged.py │ │ ├── pa_ragged_test.cpp │ │ ├── pa_ragged_test.py │ │ ├── pa_test.py │ │ ├── pa_v1.cpp.jinja │ │ ├── pa_v1.cuh │ │ └── pa_v1.py │ ├── sampling │ │ ├── sampling.cuh │ │ ├── top_k_renorm_probs.cpp.jinja │ │ ├── top_k_renorm_probs.py │ │ ├── top_k_top_p_sampling_from_probs.cpp.jinja │ │ ├── top_k_top_p_sampling_from_probs.py │ │ ├── top_p_sampling_from_probs.cpp.jinja │ │ ├── top_p_sampling_from_probs.py │ │ └── vec_dtypes.cuh │ ├── torch_utils.py │ ├── utils.h │ └── utils.py ├── include │ ├── activation.h │ ├── aiter_enum.h │ ├── aiter_hip_common.h │ ├── aiter_operator.h │ ├── aiter_unary.h │ ├── asm_a8w8_blockscale_bpreshuffle.h │ ├── asm_flatmm_a8w8_blockscale.h │ ├── asm_gemm_a16w16.h │ ├── asm_gemm_a4w4.h │ ├── asm_gemm_a8w8.h │ ├── asm_mi350_a8w8_blockscale.h │ ├── attention.h │ ├── attention_asm.h │ ├── attention_asm_mla.h │ ├── attention_ck.h │ ├── attention_common.cuh │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_ragged.h │ ├── attention_v1.h │ ├── binary_operator.cuh │ ├── cache.h │ ├── ck_tile │ │ └── vec_convert.h │ ├── communication_asm.h │ ├── custom.h │ ├── custom_all_reduce.cuh │ ├── custom_all_reduce.h │ ├── dispatch_utils.h │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── fused_mrope_rms.h │ ├── gemm_common.h │ ├── hip_compat.h │ ├── hip_float8.h │ ├── hip_float8_impl.h │ ├── hip_reduce.h │ ├── mha_bwd.h │ ├── mha_common.h │ ├── mha_fwd.h │ ├── mla.h │ ├── moe_ck.h │ ├── moe_op.h │ ├── moe_sorting.h │ ├── norm.h │ ├── opus │ │ ├── README.md │ │ ├── logo.png │ │ └── opus.hpp │ ├── pa.h │ ├── pos_encoding.h │ ├── py_itfs_common.h │ ├── quant.h │ ├── quant_common.cuh │ ├── quant_utils.cuh │ ├── quick_all_reduce.cuh │ ├── quick_all_reduce.h │ ├── quick_all_reduce_base.h │ ├── rmsnorm.h │ ├── rocm_ops.hpp │ ├── rope.h │ ├── sample.h │ ├── smoothquant.h │ ├── topk_per_row.h │ ├── topk_plain.h │ ├── torch │ │ ├── mha_batch_prefill.h │ │ ├── mha_bwd.h │ │ ├── mha_fwd.h │ │ ├── mha_v3_bwd.h │ │ ├── mha_v3_fwd.h │ │ ├── mha_v3_varlen_bwd.h │ │ ├── mha_v3_varlen_fwd.h │ │ ├── mha_varlen_bwd.h │ │ └── mha_varlen_fwd.h │ ├── vectorization.cuh │ └── warp_sort.h ├── kernels │ ├── activation_kernels.cu │ ├── attention.cu │ ├── attention_ragged.cu │ ├── attention_v1.cu │ ├── binary_operator.cu │ ├── cache_kernels.cu │ ├── custom_all_reduce.cu │ ├── custom_kernels.cu │ ├── fused_kernels.cu │ ├── fused_mrope_rms.cu │ ├── generate_binaryop.py │ ├── mha_common.cu │ ├── mla │ │ ├── metadata.cu │ │ ├── metadata │ │ │ ├── v1_1_device.cuh │ │ │ ├── v1_1_host.cuh │ │ │ ├── v1_2_device.cuh │ │ │ ├── v1_2_pa_device.cuh │ │ │ └── v1_comm.cuh │ │ └── reduce.cu │ ├── moe_align_block_size_kernels.cu │ ├── moe_fused_gate.cu │ ├── pos_encoding_kernels.cu │ ├── quant_kernels.cu │ ├── quick_all_reduce.cu │ ├── rmsnorm_kernels.cu │ ├── rope │ │ ├── general_bwd_kernels.cu │ │ ├── general_fwd_kernels.cu │ │ ├── pos_fwd_kernels.cu │ │ └── rope_common.h │ ├── sample_kernels.cu │ ├── solver │ │ ├── Makefile │ │ ├── README.md │ │ ├── lapack_sytrd.py │ │ ├── sytrd_benchmark.cu │ │ └── sytrd_kernels.cu │ ├── topk_per_row_kernels.cu │ ├── topk_plain_kernels.cu │ ├── topk_softmax_kernels.cu │ ├── topk_softmax_kernels_group.cu │ └── unary_operator.cu ├── py_itfs_ck │ ├── attention_kernels.cu │ ├── mha_batch_prefill_kernels.cu │ ├── mha_bwd_kernels.cu │ ├── mha_fwd_kernels.cu │ ├── mha_varlen_bwd_kernels.cu │ ├── mha_varlen_fwd_kernels.cu │ ├── moe_ck_2stages_gemm_impl │ │ ├── moe_ck_gemm.hpp │ │ ├── moe_ck_gemm1_instance_pertensor_b16.cu │ │ ├── moe_ck_gemm1_instance_pertensor_b16_f8.cu │ │ ├── moe_ck_gemm1_instance_pertensor_b16_f8_wint4.cu │ │ ├── moe_ck_gemm1_instance_pertensor_b16_i8.cu │ │ ├── moe_ck_gemm1_instance_pertensor_f16.cu │ │ ├── moe_ck_gemm1_instance_pertensor_f16_f8.cu │ │ ├── moe_ck_gemm1_instance_pertensor_f16_f8_win4.cu │ │ ├── moe_ck_gemm1_instance_pertensor_f16_i8.cu │ │ ├── moe_ck_gemm1_instance_pertensor_mulweight_b16.cu │ │ ├── moe_ck_gemm1_instance_pertensor_mulweight_f16.cu │ │ ├── moe_ck_gemm1_instance_pertoken_b16.cu │ │ ├── moe_ck_gemm1_instance_pertoken_b16_f8.cu │ │ ├── moe_ck_gemm1_instance_pertoken_b16_f8_wint4.cu │ │ ├── moe_ck_gemm1_instance_pertoken_b16_i8.cu │ │ ├── moe_ck_gemm1_instance_pertoken_f16.cu │ │ ├── moe_ck_gemm1_instance_pertoken_f16_f8.cu │ │ ├── moe_ck_gemm1_instance_pertoken_f16_f8_win4.cu │ │ ├── moe_ck_gemm1_instance_pertoken_f16_i8.cu │ │ ├── moe_ck_gemm1_instance_pertoken_mulweight_b16.cu │ │ ├── moe_ck_gemm1_instance_pertoken_mulweight_f16.cu │ │ ├── moe_ck_gemm2_instance_pertensor_b16..cu │ │ ├── moe_ck_gemm2_instance_pertensor_b16_f8.cu │ │ ├── moe_ck_gemm2_instance_pertensor_b16_f8_wint4.cu │ │ ├── moe_ck_gemm2_instance_pertensor_b16_i8.cu │ │ ├── moe_ck_gemm2_instance_pertensor_f16.cu │ │ ├── moe_ck_gemm2_instance_pertensor_f16_f8.cu │ │ ├── moe_ck_gemm2_instance_pertensor_f16_f8_wint4.cu │ │ ├── moe_ck_gemm2_instance_pertensor_f16_i8.cu │ │ ├── moe_ck_gemm2_instance_pertensor_mulweight_b16.cu │ │ ├── moe_ck_gemm2_instance_pertensor_mulweight_f16.cu │ │ ├── moe_ck_gemm2_instance_pertoken_b16..cu │ │ ├── moe_ck_gemm2_instance_pertoken_b16_f8.cu │ │ ├── moe_ck_gemm2_instance_pertoken_b16_f8_wint4.cu │ │ ├── moe_ck_gemm2_instance_pertoken_b16_i8.cu │ │ ├── moe_ck_gemm2_instance_pertoken_f16.cu │ │ ├── moe_ck_gemm2_instance_pertoken_f16_f8.cu │ │ ├── moe_ck_gemm2_instance_pertoken_f16_f8_wint4.cu │ │ ├── moe_ck_gemm2_instance_pertoken_f16_i8.cu │ │ ├── moe_ck_gemm2_instance_pertoken_mulweight_b16.cu │ │ ├── moe_ck_gemm2_instance_pertoken_mulweight_f16.cu │ │ └── moe_ck_gemm_common.cuh │ ├── moe_ck_2stages_kernel.cu │ ├── moe_sorting_kernels.cu │ ├── norm_kernels.cu │ ├── rmsnorm_ck_kernels.cu │ ├── smoothquant_kernels.cu │ └── topk_sigmoid_kernels.cu ├── py_itfs_cu │ ├── asm_a8w8_blockscale_bpreshuffle.cu │ ├── asm_communication.cu │ ├── asm_flatmm_a8w8_blockscale.cu │ ├── asm_fmoe.cu │ ├── asm_gemm_a16w16.cu │ ├── asm_gemm_a4w4.cu │ ├── asm_gemm_a8w8.cu │ ├── asm_layernorm.cu │ ├── asm_mha_bwd.cu │ ├── asm_mha_fwd.cu │ ├── asm_mha_varlen_bwd.cu │ ├── asm_mha_varlen_fwd.cu │ ├── asm_mi350_a8w8_blockscale.cu │ ├── asm_mla.cu │ ├── asm_moe_2stage.cu │ ├── asm_pa.cu │ ├── asm_topksoftmax.cu │ ├── custom.cu │ ├── fmha_bwd_pre_post_kernel_generate.py │ └── gemm_common.cu ├── pybind │ ├── activation_pybind.cu │ ├── aiter_enum_pybind.cu │ ├── aiter_operator_pybind.cu │ ├── aiter_unary_pybind.cu │ ├── asm_a8w8_blockscale_bpreshuffle_asm_pybind.cu │ ├── asm_mi350_a8w8_blockscale_asm_pybind.cu │ ├── attention_asm_mla_pybind.cu │ ├── attention_asm_pybind.cu │ ├── attention_ck_pybind.cu │ ├── attention_pybind.cu │ ├── attention_ragged_pybind.cu │ ├── attention_v1_pybind.cu │ ├── batched_gemm_a8w8_pybind.cu │ ├── batched_gemm_a8w8_tune_pybind.cu │ ├── batched_gemm_bf16_pybind.cu │ ├── batched_gemm_bf16_tune_pybind.cu │ ├── cache_pybind.cu │ ├── custom_all_reduce_pybind.cu │ ├── custom_pybind.cu │ ├── deepgemm_pybind.cu │ ├── flatmm_a8w8_blockscale_asm_pybind.cu │ ├── fused_mrope_rms_pybind.cu │ ├── gemm_a16w16_asm_pybind.cu │ ├── gemm_a4w4_asm_pybind.cu │ ├── gemm_a4w4_blockscale_pybind.cu │ ├── gemm_a4w4_blockscale_tune_pybind.cu │ ├── gemm_a8w8_asm_pybind.cu │ ├── gemm_a8w8_blockscale_bpreshuffle_pybind.cu │ ├── gemm_a8w8_blockscale_bpreshuffle_tune_pybind.cu │ ├── gemm_a8w8_blockscale_pybind.cu │ ├── gemm_a8w8_blockscale_tune_pybind.cu │ ├── gemm_a8w8_bpreshuffle_cktile_pybind.cu │ ├── gemm_a8w8_bpreshuffle_cktile_tune_pybind.cu │ ├── gemm_a8w8_bpreshuffle_pybind.cu │ ├── gemm_a8w8_bpreshuffle_tune_pybind.cu │ ├── gemm_a8w8_pybind.cu │ ├── gemm_a8w8_tune_pybind.cu │ ├── gemm_common_pybind.cu │ ├── mha_batch_prefill_pybind.cu │ ├── mha_bwd_asm_pybind.cu │ ├── mha_bwd_pybind.cu │ ├── mha_fwd_asm_pybind.cu │ ├── mha_fwd_pybind.cu │ ├── mha_varlen_bwd_asm_pybind.cu │ ├── mha_varlen_bwd_pybind.cu │ ├── mha_varlen_fwd_asm_pybind.cu │ ├── mha_varlen_fwd_pybind.cu │ ├── mla_metadata_pybind.cu │ ├── mla_reduce_pybind.cu │ ├── moe_ck_2stages_pybind.cu │ ├── moe_ck_pybind.cu │ ├── moe_cktile_2stages_pybind.cu │ ├── moe_op_pybind.cu │ ├── moe_sorting_pybind.cu │ ├── moe_topk_pybind.cu │ ├── norm_pybind.cu │ ├── pa_metadata_pybind.cu │ ├── pos_encoding_pybind.cu │ ├── quant_pybind.cu │ ├── quick_all_reduce_pybind.cu │ ├── rmsnorm_pybind.cu │ ├── rope_general_bwd_pybind.cu │ ├── rope_general_fwd_pybind.cu │ ├── rope_pos_fwd_pybind.cu │ ├── sample_pybind.cu │ ├── smoothquant_pybind.cu │ ├── topk_per_row_pybind.cu │ └── topk_plain_pybind.cu └── rocm_ops.cpp ├── docs ├── aiter_container_nonroot_setup.md ├── autotuning_pipeline.md └── images │ ├── autotuning_ci_pipeline_1.jpeg │ └── autotuning_ci_pipeline_2.jpeg ├── gradlib ├── README.md ├── csrc │ ├── grad_funcs.cu │ ├── hipbsolgemm.cu │ └── rocsolgemm.cu ├── gradlib │ ├── GemmTuner.py │ └── gemm_tuner.py ├── include │ ├── hipbsolgemm.cuh │ └── rocsolgemm.cuh └── setup.py ├── hsa ├── codegen.py ├── gfx942 │ ├── all_reduce.co │ ├── allreduce_layernorm_N8192.co │ ├── allreduce_rmsnorm_N8192.co │ ├── allreduce_rmsnorm_qnt_N8192.co │ ├── bf16gemm │ │ ├── bf16gemm_fp32bf16.csv │ │ ├── bf16gemm_fp32bf16_tn_128x64_bshuffle.co │ │ ├── bf16gemm_fp32bf16_tn_160x64_bshuffle.co │ │ ├── bf16gemm_fp32bf16_tn_32x64_bshuffle.co │ │ ├── bf16gemm_fp32bf16_tn_32x64_pf3.co │ │ ├── bf16gemm_fp32bf16_tn_48x64_bshuffle.co │ │ ├── bf16gemm_fp32bf16_tn_48x64_pf3.co │ │ ├── bf16gemm_fp32bf16_tn_64x64_bshuffle.co │ │ ├── bf16gemm_fp32bf16_tn_64x64_pf3.co │ │ ├── bf16gemm_fp32bf16_tn_96x64_bshuffle.co │ │ └── bf16gemm_fp32bf16_tn_96x64_pf3.co │ ├── f4gemm │ │ └── f4gemm_bf16_per1x32Fp4.csv │ ├── flatmm_uk_gfx9_f16f8_128x256x128_1x4x1_16x16x32.co │ ├── fmha_v3_bwd │ │ ├── bwd_hd128_bf16_a16_rtna.co │ │ ├── bwd_hd128_bf16_a16_rtna_pddv.co │ │ ├── bwd_hd128_bf16_a16_rtne.co │ │ ├── bwd_hd128_bf16_a16_rtne_pddv.co │ │ ├── bwd_hd128_bf16_a16_rtz.co │ │ ├── bwd_hd128_bf16_a16_rtz_pddv.co │ │ ├── bwd_hd128_bf16_a32_rtna.co │ │ ├── bwd_hd128_bf16_a32_rtna_pssk_group.co │ │ ├── bwd_hd128_bf16_a32_rtna_psskddv.co │ │ ├── bwd_hd128_bf16_a32_rtna_psskddv_group.co │ │ ├── bwd_hd128_bf16_a32_rtne.co │ │ ├── bwd_hd128_bf16_a32_rtne_pssk_group.co │ │ ├── bwd_hd128_bf16_a32_rtne_psskddv.co │ │ ├── bwd_hd128_bf16_a32_rtne_psskddv_group.co │ │ ├── bwd_hd128_bf16_a32_rtz.co │ │ ├── bwd_hd128_bf16_a32_rtz_pssk_group.co │ │ ├── bwd_hd128_bf16_a32_rtz_psskddv.co │ │ ├── bwd_hd128_bf16_a32_rtz_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_a16_rtna.co │ │ ├── bwd_hd128_bf16_causal_a16_rtna_pddv.co │ │ ├── bwd_hd128_bf16_causal_a16_rtne.co │ │ ├── bwd_hd128_bf16_causal_a16_rtne_pddv.co │ │ ├── bwd_hd128_bf16_causal_a16_rtz.co │ │ ├── bwd_hd128_bf16_causal_a16_rtz_pddv.co │ │ ├── bwd_hd128_bf16_causal_a32_rtna.co │ │ ├── bwd_hd128_bf16_causal_a32_rtna_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_a32_rtna_psskddv.co │ │ ├── bwd_hd128_bf16_causal_a32_rtna_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_a32_rtne.co │ │ ├── bwd_hd128_bf16_causal_a32_rtne_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_a32_rtne_psskddv.co │ │ ├── bwd_hd128_bf16_causal_a32_rtne_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_a32_rtz.co │ │ ├── bwd_hd128_bf16_causal_a32_rtz_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_a32_rtz_psskddv.co │ │ ├── bwd_hd128_bf16_causal_a32_rtz_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtna_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtna_psskddv.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtna_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtne_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtne_psskddv.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtne_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtz_pssk_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtz_psskddv.co │ │ ├── bwd_hd128_bf16_causal_br_a32_rtz_psskddv_group.co │ │ ├── bwd_hd128_bf16_swa_a32_rtna_psskddv.co │ │ ├── bwd_hd128_bf16_swa_a32_rtne_psskddv.co │ │ ├── bwd_hd128_bf16_swa_a32_rtz_psskddv.co │ │ ├── bwd_hd128_fp16_a16.co │ │ ├── bwd_hd128_fp16_a16_pddv.co │ │ ├── bwd_hd128_fp16_a32.co │ │ ├── bwd_hd128_fp16_a32_pssk_group.co │ │ ├── bwd_hd128_fp16_a32_psskddv.co │ │ ├── bwd_hd128_fp16_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_a16.co │ │ ├── bwd_hd128_fp16_causal_a16_pddv.co │ │ ├── bwd_hd128_fp16_causal_a32.co │ │ ├── bwd_hd128_fp16_causal_a32_pssk_group.co │ │ ├── bwd_hd128_fp16_causal_a32_psskddv.co │ │ ├── bwd_hd128_fp16_causal_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_br_a32_pssk_group.co │ │ ├── bwd_hd128_fp16_causal_br_a32_psskddv.co │ │ ├── bwd_hd128_fp16_causal_br_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_swa_a32_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_fp16_a32_psskddv.co │ │ ├── bwd_hd192_fp16_a32_psskddv_group.co │ │ ├── bwd_hd192_fp16_causal_a32_psskddv.co │ │ ├── bwd_hd192_fp16_causal_a32_psskddv_group.co │ │ ├── bwd_hd192_fp16_causal_br_a32_psskddv.co │ │ ├── bwd_hd192_fp16_causal_br_a32_psskddv_group.co │ │ ├── bwd_hd64_bf16_a16_rtna.co │ │ ├── bwd_hd64_bf16_a16_rtne.co │ │ ├── bwd_hd64_bf16_a16_rtz.co │ │ ├── bwd_hd64_bf16_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a16_rtna.co │ │ ├── bwd_hd64_bf16_causal_a16_rtne.co │ │ ├── bwd_hd64_bf16_causal_a16_rtz.co │ │ ├── bwd_hd64_bf16_causal_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_fp16_a16.co │ │ ├── bwd_hd64_fp16_a32_pssk.co │ │ ├── bwd_hd64_fp16_a32_pssk_group.co │ │ ├── bwd_hd64_fp16_causal_a16.co │ │ ├── bwd_hd64_fp16_causal_a32_pssk.co │ │ ├── bwd_hd64_fp16_causal_a32_pssk_group.co │ │ ├── bwd_hd64_fp16_causal_br_a32_pssk.co │ │ ├── bwd_hd64_fp16_causal_br_a32_pssk_group.co │ │ └── codegen.py │ ├── fmha_v3_fwd │ │ ├── MI300 │ │ │ ├── fwd_hd128_bf16.co │ │ │ ├── fwd_hd128_bf16_causal.co │ │ │ ├── fwd_hd128_bf16_causal_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtna.co │ │ │ ├── fwd_hd128_bf16_causal_rtna_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtne.co │ │ │ ├── fwd_hd128_bf16_causal_rtne_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtz.co │ │ │ ├── fwd_hd128_bf16_causal_rtz_group.co │ │ │ ├── fwd_hd128_bf16_group.co │ │ │ ├── fwd_hd128_bf16_rtna.co │ │ │ ├── fwd_hd128_bf16_rtna_group.co │ │ │ ├── fwd_hd128_bf16_rtne.co │ │ │ ├── fwd_hd128_bf16_rtne_group.co │ │ │ ├── fwd_hd128_bf16_rtz.co │ │ │ └── fwd_hd128_bf16_rtz_group.co │ │ ├── MI308 │ │ │ ├── fwd_hd128_bf16.co │ │ │ ├── fwd_hd128_bf16_causal.co │ │ │ ├── fwd_hd128_bf16_causal_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtna.co │ │ │ ├── fwd_hd128_bf16_causal_rtna_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtne.co │ │ │ ├── fwd_hd128_bf16_causal_rtne_group.co │ │ │ ├── fwd_hd128_bf16_causal_rtz.co │ │ │ ├── fwd_hd128_bf16_causal_rtz_group.co │ │ │ ├── fwd_hd128_bf16_group.co │ │ │ ├── fwd_hd128_bf16_rtna.co │ │ │ ├── fwd_hd128_bf16_rtna_group.co │ │ │ ├── fwd_hd128_bf16_rtne.co │ │ │ ├── fwd_hd128_bf16_rtne_group.co │ │ │ ├── fwd_hd128_bf16_rtz.co │ │ │ └── fwd_hd128_bf16_rtz_group.co │ │ └── codegen.py │ ├── fmoe │ │ ├── fmoe_fp8_blockscale_g1u1_novs_subGU_256.co │ │ ├── fmoe_fp8_blockscale_g1u1_novs_subGU_256_ps.co │ │ ├── fmoe_fp8_blockscale_g1u1_subGU_256.co │ │ ├── gelu │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquant_g1u0_gelu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_gelu_1tg_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquant_g1u0_gelu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp8_g1u1_subGU_128_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_128_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_192_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_192_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_256_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_256_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_320_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_320_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_384_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_384_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_448_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_448_gelu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_512_gelu.co │ │ │ ├── fmoe_fp8_g1u1_subGU_512_gelu_tkw1.co │ │ │ ├── fmoe_int8_g1u0_subGU_128_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_192_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_256_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_320_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_384_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_448_gelu.co │ │ │ ├── fmoe_int8_g1u0_subGU_512_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_128_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_192_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_256_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_320_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_384_gelu.co │ │ │ ├── fmoe_int8_g1u1_subGU_448_gelu.co │ │ │ └── fmoe_int8_g1u1_subGU_512_gelu.co │ │ └── silu │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquant_g1u0_silu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_vs_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquant_g1u0_silu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp8_g1u1_subGU_128.co │ │ │ ├── fmoe_fp8_g1u1_subGU_128_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_192.co │ │ │ ├── fmoe_fp8_g1u1_subGU_192_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_256.co │ │ │ ├── fmoe_fp8_g1u1_subGU_256_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_320.co │ │ │ ├── fmoe_fp8_g1u1_subGU_320_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_384.co │ │ │ ├── fmoe_fp8_g1u1_subGU_384_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_448.co │ │ │ ├── fmoe_fp8_g1u1_subGU_448_silu_tkw1.co │ │ │ ├── fmoe_fp8_g1u1_subGU_512.co │ │ │ ├── fmoe_fp8_g1u1_subGU_512_silu_tkw1.co │ │ │ ├── fmoe_int8_g1u0_subGU_128.co │ │ │ ├── fmoe_int8_g1u0_subGU_192.co │ │ │ ├── fmoe_int8_g1u0_subGU_256.co │ │ │ ├── fmoe_int8_g1u0_subGU_320.co │ │ │ ├── fmoe_int8_g1u0_subGU_384.co │ │ │ ├── fmoe_int8_g1u0_subGU_448.co │ │ │ ├── fmoe_int8_g1u0_subGU_512.co │ │ │ ├── fmoe_int8_g1u1_subGU_128.co │ │ │ ├── fmoe_int8_g1u1_subGU_192.co │ │ │ ├── fmoe_int8_g1u1_subGU_256.co │ │ │ ├── fmoe_int8_g1u1_subGU_320.co │ │ │ ├── fmoe_int8_g1u1_subGU_384.co │ │ │ ├── fmoe_int8_g1u1_subGU_448.co │ │ │ └── fmoe_int8_g1u1_subGU_512.co │ ├── fmoe_2stages │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x64_5tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x64_6tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x64_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x64_5tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x64_6tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x64_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x192_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x384_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x192_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x384_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_96x128_pf3.co │ │ └── tune.py │ ├── fmoe_b16.co │ ├── fmoe_f16.co │ ├── fmoe_fp8_blockscale_g1u1_novs_subGU_256.co │ ├── fmoe_fp8_blockscale_g1u1_subGU_256.co │ ├── fmoe_fp8_g1u1_multix_subGU_128.co │ ├── fmoe_fp8_g1u1_multix_subGU_192.co │ ├── fmoe_fp8_g1u1_multix_subGU_256.co │ ├── fmoe_fp8_g1u1_multix_subGU_320.co │ ├── fmoe_fp8_g1u1_multix_subGU_384.co │ ├── fmoe_fp8_g1u1_multix_subGU_448.co │ ├── fmoe_fp8_g1u1_multix_subGU_512.co │ ├── fmoe_fp8_g1u1_smf_subGU_320.co │ ├── fmoe_fp8_g1u1_smf_subGU_512.co │ ├── fmoe_int4fp8_g1u1_subGU_128_gelu.co │ ├── fmoe_int4fp8_g1u1_subGU_256_gelu.co │ ├── fmoe_int4fp8_g1u1_subGU_512_gelu.co │ ├── fmoe_int8_g1u0.co │ ├── fmoe_int8_g1u0_smf.co │ ├── fmoe_int8_g1u1_multix_subGU_128.co │ ├── fmoe_int8_g1u1_multix_subGU_192.co │ ├── fmoe_int8_g1u1_multix_subGU_256.co │ ├── fmoe_int8_g1u1_multix_subGU_320.co │ ├── fmoe_int8_g1u1_multix_subGU_384.co │ ├── fmoe_int8_g1u1_multix_subGU_448.co │ ├── fmoe_int8_g1u1_multix_subGU_512.co │ ├── fmoe_int8_g1u1_smf_subGU_256.co │ ├── fmoe_int8_g1u1_smf_subGU_320.co │ ├── fp8gemm_blockscale │ │ ├── fp8gemm_bf16_blockscale.csv │ │ ├── fp8gemm_bf16_blockscale_BpreShuffle_128x128.co │ │ └── fp8gemm_bf16_blockscale_BpreShuffle_64x128.co │ ├── gemm_a8w8_m128_noSplitK.co │ ├── gemm_a8w8_m128_splitK.co │ ├── i8gemm │ │ ├── I8gemm_bf16_perTokenI8_BpreShuffle_128x128.co │ │ ├── I8gemm_bf16_perTokenI8_BpreShuffle_192x128.co │ │ └── i8gemm_bf16_perTokenI8.csv │ ├── layer_norm.co │ ├── layer_norm_qnt.co │ ├── mla │ │ ├── mla_a16w16_qh16_m16x4_n16x1_coex0_mask1.co │ │ ├── mla_a16w16_qh16_m16x4_n16x1_coex0_mask1_ps.co │ │ ├── mla_a16w16_qh16_m32x4_n16x1_coex0_mask1.co │ │ ├── mla_a16w8_qh16_m16x4_n16x1_coex0_mask1_ps.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk0_ps.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk1.co │ │ ├── mla_a8w8_qh16_qseqlen1_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen1_gqaratio16_ps.co │ │ ├── mla_a8w8_qh16_qseqlen2_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen2_gqaratio16_ps.co │ │ ├── mla_a8w8_qh16_qseqlen4_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co │ │ ├── mla_dec_stage1_bf16_a16w16_subQ128_mqa128.co │ │ ├── mla_dec_stage1_bf16_a16w16_subQ16_mqa16.co │ │ ├── mla_pfl_bf16_a16w16_causal_subQ128_mqa128.co │ │ └── mla_pfl_bf16_a16w16_causal_subQ16_mqa16.co │ ├── pa │ │ ├── pa_asm.csv │ │ ├── pa_bf16_noquant_gqa16_1tg_4w.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa10_1tg_4w_qlen1_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa10_1tg_4w_qlen2_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa10_1tg_4w_qlen3_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa10_1tg_4w_qlen4_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen16_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen1_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen32_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen40_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen48_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen64_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_2tg_4w.co │ │ ├── pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co │ │ ├── pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenInt8_gqa16_2tg_4w.co │ │ ├── pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenInt8_gqa8_2tg_4w.co │ │ ├── pa_fp16_noquant_gqa16_1tg_4w.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa16_2tg_4w.co │ │ ├── pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co │ │ ├── pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenInt8_gqa16_2tg_4w.co │ │ ├── pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co │ │ └── pa_fp16_pertokenInt8_gqa8_2tg_4w.co │ ├── pa_a16w16_b16.co │ ├── pa_a16w16_f16.co │ ├── pa_a16w8_2tg_g8_f8_q_fp16_tail_bf16.co │ ├── pa_a16w8_b16.co │ ├── pa_a16w8_b16_2tg_g8_f8.co │ ├── pa_a16w8_b16_2tg_g8_i8.co │ ├── pa_a16w8_bf16_2tg_g8_f8_gemm1_bf16.co │ ├── pa_a16w8_bf16_2tg_g8_f8_tail_bf16.co │ ├── pa_a16w8_f16.co │ ├── pa_a16w8_f16_2tg_g8_f8.co │ ├── pa_a16w8_f16_2tg_g8_i8.co │ └── topksoftmax │ │ ├── topksoftmax_12x128x6.co │ │ ├── topksoftmax_12x128x8.co │ │ ├── topksoftmax_12x256x6.co │ │ ├── topksoftmax_12x256x8.co │ │ ├── topksoftmax_4x128x6.co │ │ ├── topksoftmax_4x128x8.co │ │ ├── topksoftmax_4x256x6.co │ │ └── topksoftmax_4x256x8.co ├── gfx950 │ ├── bf16gemm │ │ ├── bf16gemm_bf16_tn_256x256.co │ │ ├── bf16gemm_bf16_tn_256x256_bpreshuffle.co │ │ ├── bf16gemm_fp32bf16.csv │ │ ├── bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co │ │ ├── bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co │ │ └── bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co │ ├── f4gemm │ │ ├── f4gemm_bf16_per1x32Fp4.csv │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_128x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_128x384.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_160x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_160x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_192x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_192x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_224x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_224x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_256x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x1024.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x384.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x512.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x640.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x768.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_32x896.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x1024.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x384.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x512.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x640.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x768.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_64x896.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_96x128.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_96x256.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_96x384.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_96x512.co │ │ ├── f4gemm_bf16_per1x32Fp4_BpreShuffle_96x640.co │ │ └── f4gemm_bf16_per1x32Fp4_noBpreShuffle_256x256.co │ ├── f8_block_scale_mi350_x128.co │ ├── f8_block_scale_mi350_x32.co │ ├── f8_block_scale_mi350_x64.co │ ├── f8_block_scale_mi350_x96.co │ ├── fmha_v3_bwd │ │ ├── bwd_hd128_bf16_a16_psskddv.co │ │ ├── bwd_hd128_bf16_a16_psskddv_group.co │ │ ├── bwd_hd128_bf16_a32_psskddv.co │ │ ├── bwd_hd128_bf16_a32_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_a16_psskddv.co │ │ ├── bwd_hd128_bf16_causal_a16_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_a32_psskddv.co │ │ ├── bwd_hd128_bf16_causal_a32_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_br_a16_psskddv.co │ │ ├── bwd_hd128_bf16_causal_br_a16_psskddv_group.co │ │ ├── bwd_hd128_bf16_causal_br_a32_psskddv.co │ │ ├── bwd_hd128_bf16_causal_br_a32_psskddv_group.co │ │ ├── bwd_hd128_bf16_swa_a32_rtna_psskddv.co │ │ ├── bwd_hd128_bf16_swa_a32_rtne_psskddv.co │ │ ├── bwd_hd128_bf16_swa_a32_rtz_psskddv.co │ │ ├── bwd_hd128_dq_convert_bf16.co │ │ ├── bwd_hd128_dq_convert_bf16_group.co │ │ ├── bwd_hd128_dq_convert_fp16.co │ │ ├── bwd_hd128_dq_convert_fp16_group.co │ │ ├── bwd_hd128_dq_shuffle.co │ │ ├── bwd_hd128_dq_shuffle_group.co │ │ ├── bwd_hd128_fp16_a16_psskddv.co │ │ ├── bwd_hd128_fp16_a16_psskddv_group.co │ │ ├── bwd_hd128_fp16_a32_psskddv.co │ │ ├── bwd_hd128_fp16_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_a16_psskddv.co │ │ ├── bwd_hd128_fp16_causal_a16_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_a32_psskddv.co │ │ ├── bwd_hd128_fp16_causal_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_br_a16_psskddv.co │ │ ├── bwd_hd128_fp16_causal_br_a16_psskddv_group.co │ │ ├── bwd_hd128_fp16_causal_br_a32_psskddv.co │ │ ├── bwd_hd128_fp16_causal_br_a32_psskddv_group.co │ │ ├── bwd_hd128_fp16_swa_a32_psskddv.co │ │ ├── bwd_hd128_odo_bf16.co │ │ ├── bwd_hd128_odo_bf16_group.co │ │ ├── bwd_hd128_odo_fp16.co │ │ ├── bwd_hd128_odo_fp16_group.co │ │ ├── bwd_hd192_bf16_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_causal_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtna_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtna_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtne_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtne_psskddv_group.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtz_psskddv.co │ │ ├── bwd_hd192_bf16_causal_br_a32_rtz_psskddv_group.co │ │ ├── bwd_hd192_dq_convert_bf16.co │ │ ├── bwd_hd192_dq_convert_bf16_group.co │ │ ├── bwd_hd192_dq_convert_fp16.co │ │ ├── bwd_hd192_dq_convert_fp16_group.co │ │ ├── bwd_hd192_dq_shuffle.co │ │ ├── bwd_hd192_fp16_a32_psskddv.co │ │ ├── bwd_hd192_fp16_a32_psskddv_group.co │ │ ├── bwd_hd192_fp16_causal_a32_psskddv.co │ │ ├── bwd_hd192_fp16_causal_a32_psskddv_group.co │ │ ├── bwd_hd192_fp16_causal_br_a32_psskddv.co │ │ ├── bwd_hd192_fp16_causal_br_a32_psskddv_group.co │ │ ├── bwd_hd192_hd128_bf16_a16_pssk.co │ │ ├── bwd_hd192_hd128_bf16_a32_pssk.co │ │ ├── bwd_hd192_hd128_bf16_causal_a16_pssk.co │ │ ├── bwd_hd192_hd128_bf16_causal_a32_pssk.co │ │ ├── bwd_hd192_hd128_fp16_a16_pssk.co │ │ ├── bwd_hd192_hd128_fp16_a32_pssk.co │ │ ├── bwd_hd192_hd128_fp16_causal_a16_pssk.co │ │ ├── bwd_hd192_hd128_fp16_causal_a32_pssk.co │ │ ├── bwd_hd192_odo_bf16.co │ │ ├── bwd_hd192_odo_bf16_group.co │ │ ├── bwd_hd192_odo_fp16.co │ │ ├── bwd_hd192_odo_fp16_group.co │ │ ├── bwd_hd64_bf16_a16_rtna.co │ │ ├── bwd_hd64_bf16_a16_rtne.co │ │ ├── bwd_hd64_bf16_a16_rtz.co │ │ ├── bwd_hd64_bf16_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a16_rtna.co │ │ ├── bwd_hd64_bf16_causal_a16_rtne.co │ │ ├── bwd_hd64_bf16_causal_a16_rtz.co │ │ ├── bwd_hd64_bf16_causal_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_causal_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtna_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtna_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtne_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtne_pssk_group.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtz_pssk.co │ │ ├── bwd_hd64_bf16_causal_br_a32_rtz_pssk_group.co │ │ ├── bwd_hd64_dq_convert_bf16.co │ │ ├── bwd_hd64_dq_convert_bf16_group.co │ │ ├── bwd_hd64_dq_convert_fp16.co │ │ ├── bwd_hd64_dq_convert_fp16_group.co │ │ ├── bwd_hd64_fp16_a16.co │ │ ├── bwd_hd64_fp16_a32_pssk.co │ │ ├── bwd_hd64_fp16_a32_pssk_group.co │ │ ├── bwd_hd64_fp16_causal_a16.co │ │ ├── bwd_hd64_fp16_causal_a32_pssk.co │ │ ├── bwd_hd64_fp16_causal_a32_pssk_group.co │ │ ├── bwd_hd64_fp16_causal_br_a32_pssk.co │ │ ├── bwd_hd64_fp16_causal_br_a32_pssk_group.co │ │ ├── bwd_hd64_odo_bf16.co │ │ ├── bwd_hd64_odo_bf16_group.co │ │ ├── bwd_hd64_odo_fp16.co │ │ ├── bwd_hd64_odo_fp16_group.co │ │ └── codegen.py │ ├── fmha_v3_fwd │ │ ├── codegen.py │ │ ├── fwd_hd128_bf16.co │ │ ├── fwd_hd128_bf16_causal.co │ │ ├── fwd_hd128_bf16_causal_group.co │ │ ├── fwd_hd128_bf16_group.co │ │ ├── fwd_hd192_hd128_bf16.co │ │ ├── fwd_hd192_hd128_bf16_causal.co │ │ ├── fwd_hd192_hd128_bf16_causal_group.co │ │ └── fwd_hd192_hd128_bf16_group.co │ ├── fmoe │ │ ├── fmoe_fp8_blockscale_g1u1_novs_subGU_256.co │ │ ├── fmoe_fp8_blockscale_g1u1_subGU_256.co │ │ ├── gelu │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquant_g1u0_gelu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_tkw1_gelu_1tg_32x64.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_gelu_1tg_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_gelu.csv │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_gelu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquant_g1u0_gelu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_gelu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_gelu.csv │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_gelu_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_2tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256.co │ │ │ └── fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_2tg_ps_32x256.co │ │ └── silu │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_noquant_g1u0_silu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_1tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_2tg_ps_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_silu.csv │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_silu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_silu_2tg_ps_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_2tg_32x256.co │ │ │ ├── fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_2tg_ps_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_f16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_silu_2tg_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_novs_silu_2tg_ps_2tg_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_silu_2tg_32x256.co │ │ │ ├── fmoe_f16_pertokenMXfp4_g1u1_vs_silu_2tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_atm_inlv_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_noquant_g1u0_silu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x128.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x192.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x448.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x320.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_2tg_ps_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_silu.csv │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_silu_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_silu_2tg_ps_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_1tg_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_1tg_ps_32x512.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_2tg_32x256.co │ │ │ ├── fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_2tg_ps_32x256.co │ │ │ ├── fmoe_mxfp4_g1u1_vs_subGU_256.co │ │ │ └── fmoe_mxfp4_g1u1_vs_subGU_512.co │ ├── fmoe_2stages │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_112x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_144x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_160x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x64_5tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_16x64_6tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_48x64_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_80x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_96x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_112x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_128x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_144x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x64_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_160x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x64_5tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_16x64_6tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_48x64_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_64x64_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_80x64_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x64_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenFp8_g1u1_96x64_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1.csv │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_112x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_112x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_128x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_128x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_144x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_144x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_160x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_160x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x128_4tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x128_4tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x256_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x512_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_16x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x128_3tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x128_3tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x256_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x256_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_32x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x512_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_48x512_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x128_2tg_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x128_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x256_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_80x256_pf3.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_96x128_pf2.co │ │ ├── fmoe_stage1_bf16_pertokenInt8_g1u1_96x128_pf3.co │ │ └── tune.py │ ├── fp8gemm_blockscale │ │ ├── fp8gemm_bf16_blockscale.csv │ │ ├── fp8gemm_bf16_blockscale_BpreShuffle_128x128.co │ │ └── fp8gemm_bf16_blockscale_BpreShuffle_64x128.co │ ├── mla │ │ ├── mla_a16w16_qh16_m16x4_n16x1_coex0_mask1.co │ │ ├── mla_a16w16_qh16_m16x4_n16x1_coex0_mask1_ps.co │ │ ├── mla_a16w16_qh16_m32x4_n16x1_coex0_mask1.co │ │ ├── mla_a16w8_qh16_m16x4_n16x1_coex0_mask1_ps.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk0.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk0_ps.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk1.co │ │ ├── mla_a8w8_qh128_m32x4_n16x2_msk1_ps.co │ │ ├── mla_a8w8_qh16_qseqlen1_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen1_gqaratio16_ps.co │ │ ├── mla_a8w8_qh16_qseqlen2_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen2_gqaratio16_ps.co │ │ ├── mla_a8w8_qh16_qseqlen2_gqaratio16_ps_page.co │ │ ├── mla_a8w8_qh16_qseqlen4_gqaratio16.co │ │ ├── mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co │ │ ├── mla_a8w8_qh16_qseqlen4_gqaratio16_ps_page.co │ │ ├── mla_dec_stage1_bf16_a16w16_subQ128_mqa128.co │ │ ├── mla_dec_stage1_bf16_a16w16_subQ16_mqa16.co │ │ ├── mla_pfl_bf16_a16w16_causal_subQ128_mqa128.co │ │ └── mla_pfl_bf16_a16w16_causal_subQ16_mqa16.co │ └── pa │ │ ├── pa_asm.csv │ │ ├── pa_bf16_noquant_gqa16_1tg_4w.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_noquant_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen16_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen32_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen40_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen48_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen64_msk1_ps.co │ │ ├── pa_bf16_pertokenFp8_gqa16_2tg_4w.co │ │ ├── pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co │ │ ├── pa_bf16_pertokenFp8_gqa8_2tg_4w_uhp.co │ │ ├── pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenInt8_gqa16_2tg_4w.co │ │ ├── pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_bf16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_bf16_pertokenInt8_gqa8_2tg_4w.co │ │ ├── pa_fp16_noquant_gqa16_1tg_4w.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_noquant_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa16_2tg_4w.co │ │ ├── pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenFp8_gqa8_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co │ │ ├── pa_fp16_pertokenFp8_gqa8_2tg_4w_uhp.co │ │ ├── pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenInt8_gqa16_1tg_4w_mtp_msk1.co │ │ ├── pa_fp16_pertokenInt8_gqa16_2tg_4w.co │ │ ├── pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk0.co │ │ ├── pa_fp16_pertokenInt8_gqa8_1tg_4w_mtp_msk1.co │ │ └── pa_fp16_pertokenInt8_gqa8_2tg_4w.co └── readme.md ├── op_tests ├── __init__.py ├── cpp │ └── mha │ │ ├── README.md │ │ ├── benchmark_mha_bwd.cpp │ │ ├── benchmark_mha_fwd.cpp │ │ ├── build_mha.sh │ │ ├── compile.py │ │ ├── images │ │ ├── causal-bwd-perf.png │ │ ├── causal-fwd-perf.png │ │ ├── non-causal-bwd-perf.png │ │ └── non-causal-fwd-perf.png │ │ ├── smoke_test_bwd.sh │ │ ├── smoke_test_bwd_v3.sh │ │ ├── smoke_test_fwd.sh │ │ └── smoke_test_fwd_v3.sh ├── multigpu_tests │ ├── test_allgather.py │ ├── test_communication.py │ ├── test_custom_allreduce.py │ ├── test_custom_allreduce_fp8.py │ ├── test_dispatch_combine.py │ ├── test_fused_ar_rms.py │ ├── test_mori_all2all.py │ └── test_quick_all_reduce.py ├── op_benchmarks │ └── triton │ │ ├── bench_batched_gemm_a16w16.py │ │ ├── bench_batched_gemm_a16wfp4.py │ │ ├── bench_batched_gemm_a8w8.py │ │ ├── bench_batched_gemm_afp4wfp4.py │ │ ├── bench_deepgemm_attention.py │ │ ├── bench_extend_attention.py │ │ ├── bench_ff_a16w16_fused.py │ │ ├── bench_fp8_mqa_logits.py │ │ ├── bench_gemm_a16w16.py │ │ ├── bench_gemm_a16w16_gating.py │ │ ├── bench_gemm_a8w8.py │ │ ├── bench_gemm_a8w8_blockscale.py │ │ ├── bench_gemm_a8w8_per_token_scale.py │ │ ├── bench_gemm_a8wfp4.py │ │ ├── bench_gemm_afp4wfp4.py │ │ ├── bench_gemm_afp4wfp4_pre_quant_atomic.py │ │ ├── bench_gmm.py │ │ ├── bench_hstu_attn.py │ │ ├── bench_la.py │ │ ├── bench_la_paged_decode.py │ │ ├── bench_mha.py │ │ ├── bench_mla_decode.py │ │ ├── bench_mla_decode_rope.py │ │ ├── bench_moe.py │ │ ├── bench_moe_align_block_size.py │ │ ├── bench_moe_gemm_a8w4.py │ │ ├── bench_moe_gemm_a8w8.py │ │ ├── bench_moe_mx.py │ │ ├── bench_moe_routing_sigmoid_top1_fused.py │ │ ├── bench_pa_decode.py │ │ ├── bench_pa_prefill.py │ │ ├── bench_rmsnorm.py │ │ ├── bench_rope.py │ │ ├── bench_schema.yaml │ │ ├── bench_tests │ │ └── test_kernel_benchmarks.py │ │ ├── bench_topk.py │ │ └── utils │ │ ├── argparse.py │ │ ├── benchmark_utils.py │ │ └── model_configs.json ├── test_activation.py ├── test_aiter_add.py ├── test_aiter_addInp.py ├── test_aiter_sigmoid.py ├── test_batch_prefill.py ├── test_batched_gemm_a8w8.py ├── test_batched_gemm_bf16.py ├── test_concat_cache_mla.py ├── test_deepgemm.py ├── test_fused_mrope_rms.py ├── test_gemm_a16w16.py ├── test_gemm_a4w4.py ├── test_gemm_a8w8.py ├── test_gemm_a8w8_blockscale.py ├── test_gemm_a8w8_blockscale_mi350.py ├── test_indexer_k_quant_and_cache.py ├── test_kvcache.py ├── test_kvcache_blockscale.py ├── test_layernorm2d.py ├── test_layernorm2dFusedAddQuant.py ├── test_mha.py ├── test_mha_fp8.py ├── test_mha_varlen.py ├── test_mha_varlen_fp8.py ├── test_mla.py ├── test_mla_persistent.py ├── test_mla_sparse.py ├── test_moe.py ├── test_moeTopkSoftmax.py ├── test_moe_2stage.py ├── test_moe_blockscale.py ├── test_moe_dp_share_expert.py ├── test_moe_ep.py ├── test_moe_sorting.py ├── test_moe_sorting_mxfp4.py ├── test_moe_tkw1.py ├── test_moe_topk_sigmoid.py ├── test_pa.py ├── test_pa_mtp.py ├── test_pa_ps.py ├── test_pa_ragged.py ├── test_pa_ragged_experimental.py ├── test_pa_v1.py ├── test_quant.py ├── test_rmsnorm2d.py ├── test_rmsnorm2dFusedAddQuant.py ├── test_rope.py ├── test_sample.py ├── test_sampling.py ├── test_smoothquant.py ├── test_topk_per_row.py ├── test_topk_plain.py └── triton_tests │ ├── README.md │ ├── __init__.py │ ├── ff_test_utils.py │ ├── test_activation.py │ ├── test_batched_gemm_a16wfp4.py │ ├── test_batched_gemm_a8w8.py │ ├── test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py │ ├── test_batched_gemm_afp4wfp4.py │ ├── test_batched_gemm_bf16.py │ ├── test_chunked_pa_prefill.py │ ├── test_extend_attention.py │ ├── test_ff_a16w16.py │ ├── test_ff_a16w16_fused.py │ ├── test_fp8_mqa_logits.py │ ├── test_fused_add_rmsnorm_pad.py │ ├── test_fused_fp8_quant.py │ ├── test_fused_gemm_a8w8_blockscale_a16w16.py │ ├── test_fused_gemm_afp4wfp4_a16w16.py │ ├── test_fused_gemm_afp4wfp4_mul_add.py │ ├── test_fused_gemm_afp4wfp4_split_cat.py │ ├── test_fused_kv_cache.py │ ├── test_fused_mul_add.py │ ├── test_fused_mxfp4_quant.py │ ├── test_fused_qk_concat.py │ ├── test_fused_qkv_split_qk_rope.py │ ├── test_gemm_a16w16.py │ ├── test_gemm_a16w16_gated.py │ ├── test_gemm_a16w8_blockscale.py │ ├── test_gemm_a16wfp4.py │ ├── test_gemm_a8w8.py │ ├── test_gemm_a8w8_blockscale.py │ ├── test_gemm_a8w8_per_token_scale.py │ ├── test_gemm_a8wfp4.py │ ├── test_gemm_afp4wfp4.py │ ├── test_gmm.py │ ├── test_hstu_attn.py │ ├── test_la.py │ ├── test_la_paged.py │ ├── test_layernorm.py │ ├── test_mha.py │ ├── test_mla_decode_rope.py │ ├── test_moe.py │ ├── test_moe_align_block_size.py │ ├── test_moe_gemm_a8w4.py │ ├── test_moe_gemm_a8w8.py │ ├── test_moe_mx.py │ ├── test_moe_routing.py │ ├── test_moe_routing_sigmoid_top1_fused.py │ ├── test_pa_decode.py │ ├── test_pa_prefill.py │ ├── test_prefill_attention.py │ ├── test_quant.py │ ├── test_quant_mxfp4.py │ ├── test_rmsnorm.py │ ├── test_rope.py │ ├── test_softmax.py │ ├── test_topk.py │ ├── test_unified_attention.py │ ├── test_unified_attention_sparse_mla.py │ ├── triton_metadata_redirect │ ├── kernel.py │ └── test_metadata_redirect.py │ └── utils │ ├── __init__.py │ ├── hstu_attention_ref.py │ ├── mla_decode_ref.py │ ├── mla_extend_ref.py │ ├── rotary_embedding.py │ └── types.py ├── pyproject.toml ├── requirements.txt └── setup.py /.clang-format: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.clang-format -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.clang-tidy -------------------------------------------------------------------------------- /.githooks/install: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.githooks/install -------------------------------------------------------------------------------- /.githooks/pre-commit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.githooks/pre-commit -------------------------------------------------------------------------------- /.github/scripts/aiter_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/aiter_test.sh -------------------------------------------------------------------------------- /.github/scripts/build_aiter_triton.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/build_aiter_triton.sh -------------------------------------------------------------------------------- /.github/scripts/check_deps.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/check_deps.sh -------------------------------------------------------------------------------- /.github/scripts/check_signal.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/check_signal.sh -------------------------------------------------------------------------------- /.github/scripts/clean_up_rocm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/clean_up_rocm.sh -------------------------------------------------------------------------------- /.github/scripts/op_tune.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/scripts/op_tune.sh -------------------------------------------------------------------------------- /.github/workflows/aiter-release.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/aiter-release.yaml -------------------------------------------------------------------------------- /.github/workflows/aiter-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/aiter-test.yaml -------------------------------------------------------------------------------- /.github/workflows/operators-tuning.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/operators-tuning.yaml -------------------------------------------------------------------------------- /.github/workflows/pre-checks.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/pre-checks.yaml -------------------------------------------------------------------------------- /.github/workflows/sglang_downstream.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/sglang_downstream.yaml -------------------------------------------------------------------------------- /.github/workflows/test-network.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/test-network.yaml -------------------------------------------------------------------------------- /.github/workflows/triton-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/triton-test.yaml -------------------------------------------------------------------------------- /.github/workflows/vllm_benchmark.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.github/workflows/vllm_benchmark.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/.gitmodules -------------------------------------------------------------------------------- /3rdparty/ck_helper/ck/config.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/3rdparty/ck_helper/ck/config.h -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/CONTRIBUTE.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/README.md -------------------------------------------------------------------------------- /aiter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/__init__.py -------------------------------------------------------------------------------- /aiter/aot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/__init__.py -------------------------------------------------------------------------------- /aiter/aot/asm_mla_decode_fwd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/asm_mla_decode_fwd.py -------------------------------------------------------------------------------- /aiter/aot/pa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/pa.py -------------------------------------------------------------------------------- /aiter/aot/pa_ragged.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/pa_ragged.py -------------------------------------------------------------------------------- /aiter/aot/pa_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/pa_v1.py -------------------------------------------------------------------------------- /aiter/aot/test/matmul_fp16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/test/matmul_fp16.py -------------------------------------------------------------------------------- /aiter/aot/test/test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/test/test.sh -------------------------------------------------------------------------------- /aiter/aot/test/test_matmul.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/test/test_matmul.cpp -------------------------------------------------------------------------------- /aiter/aot/triton/decode_mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/triton/decode_mla.py -------------------------------------------------------------------------------- /aiter/aot/triton/norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/aot/triton/norm.py -------------------------------------------------------------------------------- /aiter/bert_padding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/bert_padding.py -------------------------------------------------------------------------------- /aiter/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/__init__.py -------------------------------------------------------------------------------- /aiter/configs/a4w4_blockscale_tuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a4w4_blockscale_tuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a4w4_blockscale_untuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a4w4_blockscale_untuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_blockscale_tuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_blockscale_tuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_blockscale_untuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_blockscale_untuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_tuned_batched_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_tuned_batched_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_tuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_tuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_untuned_batched_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_untuned_batched_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/a8w8_untuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/a8w8_untuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/asm_a8w8_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/asm_a8w8_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/bf16_tuned_batched_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/bf16_tuned_batched_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/bf16_tuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/bf16_tuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/bf16_untuned_batched_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/bf16_untuned_batched_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/bf16_untuned_gemm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/bf16_untuned_gemm.csv -------------------------------------------------------------------------------- /aiter/configs/model_configs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/model_configs/README.md -------------------------------------------------------------------------------- /aiter/configs/tuned_fmoe.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/tuned_fmoe.csv -------------------------------------------------------------------------------- /aiter/configs/untuned_fmoe.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/configs/untuned_fmoe.csv -------------------------------------------------------------------------------- /aiter/dist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/__init__.py -------------------------------------------------------------------------------- /aiter/dist/communication_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/communication_op.py -------------------------------------------------------------------------------- /aiter/dist/cuda_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/cuda_wrapper.py -------------------------------------------------------------------------------- /aiter/dist/device_communicators/all2all.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/device_communicators/all2all.py -------------------------------------------------------------------------------- /aiter/dist/device_communicators/pynccl_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/device_communicators/pynccl_wrapper.py -------------------------------------------------------------------------------- /aiter/dist/device_communicators/quick_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/device_communicators/quick_all_reduce.py -------------------------------------------------------------------------------- /aiter/dist/parallel_state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/parallel_state.py -------------------------------------------------------------------------------- /aiter/dist/shm_broadcast.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/shm_broadcast.py -------------------------------------------------------------------------------- /aiter/dist/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/dist/utils.py -------------------------------------------------------------------------------- /aiter/fused_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/fused_moe.py -------------------------------------------------------------------------------- /aiter/fused_moe_bf16_asm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/fused_moe_bf16_asm.py -------------------------------------------------------------------------------- /aiter/fused_moe_dp_shared_expert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/fused_moe_dp_shared_expert.py -------------------------------------------------------------------------------- /aiter/int4_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/int4_utils.py -------------------------------------------------------------------------------- /aiter/jit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/__init__.py -------------------------------------------------------------------------------- /aiter/jit/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/core.py -------------------------------------------------------------------------------- /aiter/jit/optCompilerConfig.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/optCompilerConfig.json -------------------------------------------------------------------------------- /aiter/jit/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/__init__.py -------------------------------------------------------------------------------- /aiter/jit/utils/_cpp_extension_versioner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/_cpp_extension_versioner.py -------------------------------------------------------------------------------- /aiter/jit/utils/chip_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/chip_info.py -------------------------------------------------------------------------------- /aiter/jit/utils/cpp_extension.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/cpp_extension.py -------------------------------------------------------------------------------- /aiter/jit/utils/file_baton.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/file_baton.py -------------------------------------------------------------------------------- /aiter/jit/utils/hipify/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/hipify/__init__.py -------------------------------------------------------------------------------- /aiter/jit/utils/hipify/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/hipify/constants.py -------------------------------------------------------------------------------- /aiter/jit/utils/hipify/cuda_to_hip_mappings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/hipify/cuda_to_hip_mappings.py -------------------------------------------------------------------------------- /aiter/jit/utils/hipify/hipify_python.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/hipify/hipify_python.py -------------------------------------------------------------------------------- /aiter/jit/utils/torch_guard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/jit/utils/torch_guard.py -------------------------------------------------------------------------------- /aiter/mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/mla.py -------------------------------------------------------------------------------- /aiter/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/__init__.py -------------------------------------------------------------------------------- /aiter/ops/activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/activation.py -------------------------------------------------------------------------------- /aiter/ops/aiter_operator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/aiter_operator.py -------------------------------------------------------------------------------- /aiter/ops/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/attention.py -------------------------------------------------------------------------------- /aiter/ops/batched_gemm_op_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/batched_gemm_op_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/batched_gemm_op_bf16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/batched_gemm_op_bf16.py -------------------------------------------------------------------------------- /aiter/ops/cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/cache.py -------------------------------------------------------------------------------- /aiter/ops/communication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/communication.py -------------------------------------------------------------------------------- /aiter/ops/custom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/custom.py -------------------------------------------------------------------------------- /aiter/ops/custom_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/custom_all_reduce.py -------------------------------------------------------------------------------- /aiter/ops/deepgemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/deepgemm.py -------------------------------------------------------------------------------- /aiter/ops/enum.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/enum.py -------------------------------------------------------------------------------- /aiter/ops/fused_mrope_rms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/fused_mrope_rms.py -------------------------------------------------------------------------------- /aiter/ops/gemm_op_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/gemm_op_a16w16.py -------------------------------------------------------------------------------- /aiter/ops/gemm_op_a4w4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/gemm_op_a4w4.py -------------------------------------------------------------------------------- /aiter/ops/gemm_op_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/gemm_op_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/gemm_op_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/gemm_op_common.py -------------------------------------------------------------------------------- /aiter/ops/gradlib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/gradlib.py -------------------------------------------------------------------------------- /aiter/ops/mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/mha.py -------------------------------------------------------------------------------- /aiter/ops/moe_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/moe_op.py -------------------------------------------------------------------------------- /aiter/ops/moe_sorting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/moe_sorting.py -------------------------------------------------------------------------------- /aiter/ops/norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/norm.py -------------------------------------------------------------------------------- /aiter/ops/pos_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/pos_encoding.py -------------------------------------------------------------------------------- /aiter/ops/quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/quant.py -------------------------------------------------------------------------------- /aiter/ops/quick_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/quick_all_reduce.py -------------------------------------------------------------------------------- /aiter/ops/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/rmsnorm.py -------------------------------------------------------------------------------- /aiter/ops/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/rope.py -------------------------------------------------------------------------------- /aiter/ops/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/sample.py -------------------------------------------------------------------------------- /aiter/ops/sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/sampling.py -------------------------------------------------------------------------------- /aiter/ops/shuffle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/shuffle.py -------------------------------------------------------------------------------- /aiter/ops/topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/topk.py -------------------------------------------------------------------------------- /aiter/ops/topk_plain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/topk_plain.py -------------------------------------------------------------------------------- /aiter/ops/trans_ragged_layout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/trans_ragged_layout.py -------------------------------------------------------------------------------- /aiter/ops/triton/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/__init__.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/activation.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/fp8_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/fp8_mqa_logits.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/fused_fp8_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/fused_fp8_quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/fused_kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/fused_kv_cache.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/fused_mul_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/fused_mul_add.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/fused_qk_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/fused_qk_concat.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gemm_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gemm_a16w16.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gemm_a16wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gemm_a16wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gemm_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gemm_a8wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gemm_a8wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/gmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/gmm.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/hstu_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/hstu_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/lean_atten.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/lean_atten.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/mha.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/mha_fused_bwd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/mha_fused_bwd.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/mla_decode_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/mla_decode_rope.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/moe_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/moe_op.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/moe_op_e2e.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/moe_op_e2e.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/moe_op_gelu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/moe_op_gelu.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/moe_op_mxfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/moe_op_mxfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/norm.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/pa_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/pa_decode.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/pa_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/pa_mqa_logits.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/pa_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/pa_prefill.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/pod_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/pod_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/quant_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/quant_moe.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/rmsnorm.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/rope.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/softmax.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/split_qkv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/split_qkv.py -------------------------------------------------------------------------------- /aiter/ops/triton/_triton_kernels/topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/_triton_kernels/topk.py -------------------------------------------------------------------------------- /aiter/ops/triton/activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/activation.py -------------------------------------------------------------------------------- /aiter/ops/triton/batched_gemm_a16wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/batched_gemm_a16wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/batched_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/batched_gemm_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/triton/batched_gemm_afp4wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/batched_gemm_afp4wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/batched_gemm_bf16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/batched_gemm_bf16.py -------------------------------------------------------------------------------- /aiter/ops/triton/chunked_pa_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/chunked_pa_prefill.py -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gemm/aot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gemm/aot/README.md -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gemm/gfx942-GEMM-A8W8.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gemm/gfx942-GEMM-A8W8.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gemm/gfx950-GEMM-A8W8.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gemm/gfx950-GEMM-A8W8.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gfx942-GMM.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gfx942-GMM.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gfx942-MHA-DEFAULT.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gfx942-MHA-DEFAULT.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gfx950-GMM.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gfx950-GMM.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/gfx950-MHA-DEFAULT.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/gfx950-MHA-DEFAULT.json -------------------------------------------------------------------------------- /aiter/ops/triton/configs/moe/gfx950-MOE-MX_FP4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/configs/moe/gfx950-MOE-MX_FP4.json -------------------------------------------------------------------------------- /aiter/ops/triton/extend_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/extend_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/ff_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/ff_a16w16.py -------------------------------------------------------------------------------- /aiter/ops/triton/ff_a16w16_fused_gated.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/ff_a16w16_fused_gated.py -------------------------------------------------------------------------------- /aiter/ops/triton/ff_a16w16_fused_ungated.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/ff_a16w16_fused_ungated.py -------------------------------------------------------------------------------- /aiter/ops/triton/fp8_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fp8_mqa_logits.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_add_rmsnorm_pad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_add_rmsnorm_pad.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_fp8_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_fp8_quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_gemm_afp4wfp4_split_cat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_gemm_afp4wfp4_split_cat.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_kv_cache.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_mul_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_mul_add.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_mxfp4_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_mxfp4_quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_qk_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_qk_concat.py -------------------------------------------------------------------------------- /aiter/ops/triton/fused_qkv_split_qk_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/fused_qkv_split_qk_rope.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16w16.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16w16_agnostic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16w16_agnostic.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16w16_atomic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16w16_atomic.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16w16_gated.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16w16_gated.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16w8_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16w8_blockscale.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a16wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a16wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a8w8_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a8w8_blockscale.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a8w8_per_token_scale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a8w8_per_token_scale.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_a8wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_a8wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_afp4wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_afp4wfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py -------------------------------------------------------------------------------- /aiter/ops/triton/gluon/gemm_a8w8_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gluon/gemm_a8w8_blockscale.py -------------------------------------------------------------------------------- /aiter/ops/triton/gluon/pa_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gluon/pa_mqa_logits.py -------------------------------------------------------------------------------- /aiter/ops/triton/gmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/gmm.py -------------------------------------------------------------------------------- /aiter/ops/triton/hstu_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/hstu_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/lean_atten.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/lean_atten.py -------------------------------------------------------------------------------- /aiter/ops/triton/lean_atten_paged.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/lean_atten_paged.py -------------------------------------------------------------------------------- /aiter/ops/triton/mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/mha.py -------------------------------------------------------------------------------- /aiter/ops/triton/mha_fused_bwd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/mha_fused_bwd.py -------------------------------------------------------------------------------- /aiter/ops/triton/mha_onekernel_bwd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/mha_onekernel_bwd.py -------------------------------------------------------------------------------- /aiter/ops/triton/mha_v3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/mha_v3.py -------------------------------------------------------------------------------- /aiter/ops/triton/mla_decode_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/mla_decode_rope.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_align_block_size.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_align_block_size.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_e2e.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_e2e.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_gelu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_gelu.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_gemm_a8w4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_gemm_a8w4.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_gemm_a8w8.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_mxfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_mxfp4.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_mxfp4_silu_fused.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_mxfp4_silu_fused.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_op_silu_fused.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_op_silu_fused.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_routing/bitmatrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_routing/bitmatrix.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_routing/routing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_routing/routing.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_routing/topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_routing/topk.py -------------------------------------------------------------------------------- /aiter/ops/triton/moe_routing_sigmoid_top1_fused.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/moe_routing_sigmoid_top1_fused.py -------------------------------------------------------------------------------- /aiter/ops/triton/norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/norm.py -------------------------------------------------------------------------------- /aiter/ops/triton/pa_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/pa_decode.py -------------------------------------------------------------------------------- /aiter/ops/triton/pa_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/pa_mqa_logits.py -------------------------------------------------------------------------------- /aiter/ops/triton/pa_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/pa_prefill.py -------------------------------------------------------------------------------- /aiter/ops/triton/pod_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/pod_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/prefill_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/prefill_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/quant.py -------------------------------------------------------------------------------- /aiter/ops/triton/quant_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/quant_moe.py -------------------------------------------------------------------------------- /aiter/ops/triton/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/rmsnorm.py -------------------------------------------------------------------------------- /aiter/ops/triton/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/rope.py -------------------------------------------------------------------------------- /aiter/ops/triton/softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/softmax.py -------------------------------------------------------------------------------- /aiter/ops/triton/split_qkv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/split_qkv.py -------------------------------------------------------------------------------- /aiter/ops/triton/topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/topk.py -------------------------------------------------------------------------------- /aiter/ops/triton/unified_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/unified_attention.py -------------------------------------------------------------------------------- /aiter/ops/triton/unified_attention_sparse_mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/unified_attention_sparse_mla.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/__init__.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/_triton/arch_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/_triton/arch_info.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/_triton/kernel_repr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/_triton/kernel_repr.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/_triton/mha_kernel_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/_triton/mha_kernel_utils.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/_triton/moe_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/_triton/moe_common.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/_triton/pid_preprocessing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/_triton/pid_preprocessing.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/common_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/common_utils.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/core.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/device_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/device_info.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/gmm_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/gmm_common.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/la_kernel_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/la_kernel_utils.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/logger.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/mha_kernel_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/mha_kernel_utils.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/moe_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/moe_common.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/moe_config_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/moe_config_utils.py -------------------------------------------------------------------------------- /aiter/ops/triton/utils/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/ops/triton/utils/types.py -------------------------------------------------------------------------------- /aiter/paged_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/paged_attn.py -------------------------------------------------------------------------------- /aiter/rotary_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/rotary_embedding.py -------------------------------------------------------------------------------- /aiter/test_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/test_common.py -------------------------------------------------------------------------------- /aiter/test_mha_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/test_mha_common.py -------------------------------------------------------------------------------- /aiter/tuned_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/tuned_gemm.py -------------------------------------------------------------------------------- /aiter/utility/base_tuner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/base_tuner.py -------------------------------------------------------------------------------- /aiter/utility/dtypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/dtypes.py -------------------------------------------------------------------------------- /aiter/utility/fp4_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/fp4_utils.py -------------------------------------------------------------------------------- /aiter/utility/mp_tuner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/mp_tuner.py -------------------------------------------------------------------------------- /aiter/utility/triton/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/triton/README.md -------------------------------------------------------------------------------- /aiter/utility/triton/triton_metadata_redirect.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter/utility/triton/triton_metadata_redirect.py -------------------------------------------------------------------------------- /aiter_logs/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter_logs/readme.md -------------------------------------------------------------------------------- /aiter_logs/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/aiter_logs/run.py -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_a8w8/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_a8w8/README.md -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.cu -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_a8w8/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_a8w8/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_bf16/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_bf16/README.md -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.cu -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py -------------------------------------------------------------------------------- /csrc/ck_batched_gemm_bf16/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_batched_gemm_bf16/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_deepgemm/deepgemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_deepgemm/deepgemm.cu -------------------------------------------------------------------------------- /csrc/ck_deepgemm/deepgemm_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_deepgemm/deepgemm_common.py -------------------------------------------------------------------------------- /csrc/ck_deepgemm/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_deepgemm/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_deepgemm/include/deepgemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_deepgemm/include/deepgemm.h -------------------------------------------------------------------------------- /csrc/ck_deepgemm/include/deepgemm_common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_deepgemm/include/deepgemm_common.cuh -------------------------------------------------------------------------------- /csrc/ck_gemm_a4w4_blockscale/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a4w4_blockscale/README.md -------------------------------------------------------------------------------- /csrc/ck_gemm_a4w4_blockscale/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a4w4_blockscale/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/README.md -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/gemm_a8w8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/gemm_a8w8.cu -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/gemm_a8w8_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/gemm_a8w8_common.py -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/gemm_a8w8_tune.cu -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/gemm_a8w8_tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/include/gemm_a8w8.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/include/gemm_a8w8.h -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8/include/gemm_a8w8_common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8/include/gemm_a8w8_common.cuh -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8_blockscale/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8_blockscale/README.md -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8_blockscale/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8_blockscale/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8_bpreshuffle/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8_bpreshuffle/README.md -------------------------------------------------------------------------------- /csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_a8w8_bpreshuffle/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_gemm_moe_2stages_codegen/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_tile_gemm_moe_2stages/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_tile_gemm_moe_2stages/gen_instances.py -------------------------------------------------------------------------------- /csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu -------------------------------------------------------------------------------- /csrc/cktile_gemm_a8w8_bpreshuffle/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cktile_gemm_a8w8_bpreshuffle/README.md -------------------------------------------------------------------------------- /csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cktile_gemm_a8w8_bpreshuffle/gen_instances.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/README.MD -------------------------------------------------------------------------------- /csrc/cpp_itfs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /csrc/cpp_itfs/lru_cache.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/lru_cache.h -------------------------------------------------------------------------------- /csrc/cpp_itfs/mha_bwd_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mha_bwd_generate.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/mha_fwd_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mha_fwd_generate.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/Makefile -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd.cpp -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd.h -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd_test.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd_test.cpp -------------------------------------------------------------------------------- /csrc/cpp_itfs/mla/asm_mla_decode_fwd_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/mla/asm_mla_decode_fwd_test.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/moe/asm_moe.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/moe/asm_moe.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/moe/asm_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/moe/asm_moe.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/moe/test_asm_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/moe/test_asm_moe.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/Makefile -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_common.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_kernels.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged.cpp -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged.h -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged_test.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged_test.cpp -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_ragged_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_ragged_test.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_test.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_v1.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_v1.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_v1.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_v1.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/pa/pa_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/pa/pa_v1.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/sampling/sampling.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/sampling/sampling.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/sampling/top_k_renorm_probs.cpp.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/sampling/top_k_renorm_probs.cpp.jinja -------------------------------------------------------------------------------- /csrc/cpp_itfs/sampling/top_k_renorm_probs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/sampling/top_k_renorm_probs.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/sampling/top_p_sampling_from_probs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/sampling/top_p_sampling_from_probs.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/sampling/vec_dtypes.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/sampling/vec_dtypes.cuh -------------------------------------------------------------------------------- /csrc/cpp_itfs/torch_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/torch_utils.py -------------------------------------------------------------------------------- /csrc/cpp_itfs/utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/utils.h -------------------------------------------------------------------------------- /csrc/cpp_itfs/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/cpp_itfs/utils.py -------------------------------------------------------------------------------- /csrc/include/activation.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/activation.h -------------------------------------------------------------------------------- /csrc/include/aiter_enum.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/aiter_enum.h -------------------------------------------------------------------------------- /csrc/include/aiter_hip_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/aiter_hip_common.h -------------------------------------------------------------------------------- /csrc/include/aiter_operator.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/aiter_operator.h -------------------------------------------------------------------------------- /csrc/include/aiter_unary.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/aiter_unary.h -------------------------------------------------------------------------------- /csrc/include/asm_a8w8_blockscale_bpreshuffle.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_a8w8_blockscale_bpreshuffle.h -------------------------------------------------------------------------------- /csrc/include/asm_flatmm_a8w8_blockscale.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_flatmm_a8w8_blockscale.h -------------------------------------------------------------------------------- /csrc/include/asm_gemm_a16w16.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_gemm_a16w16.h -------------------------------------------------------------------------------- /csrc/include/asm_gemm_a4w4.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_gemm_a4w4.h -------------------------------------------------------------------------------- /csrc/include/asm_gemm_a8w8.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_gemm_a8w8.h -------------------------------------------------------------------------------- /csrc/include/asm_mi350_a8w8_blockscale.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/asm_mi350_a8w8_blockscale.h -------------------------------------------------------------------------------- /csrc/include/attention.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention.h -------------------------------------------------------------------------------- /csrc/include/attention_asm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_asm.h -------------------------------------------------------------------------------- /csrc/include/attention_asm_mla.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_asm_mla.h -------------------------------------------------------------------------------- /csrc/include/attention_ck.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_ck.h -------------------------------------------------------------------------------- /csrc/include/attention_common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_common.cuh -------------------------------------------------------------------------------- /csrc/include/attention_dtypes.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_dtypes.h -------------------------------------------------------------------------------- /csrc/include/attention_generic.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_generic.cuh -------------------------------------------------------------------------------- /csrc/include/attention_ragged.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_ragged.h -------------------------------------------------------------------------------- /csrc/include/attention_v1.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/attention_v1.h -------------------------------------------------------------------------------- /csrc/include/binary_operator.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/binary_operator.cuh -------------------------------------------------------------------------------- /csrc/include/cache.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/cache.h -------------------------------------------------------------------------------- /csrc/include/ck_tile/vec_convert.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/ck_tile/vec_convert.h -------------------------------------------------------------------------------- /csrc/include/communication_asm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/communication_asm.h -------------------------------------------------------------------------------- /csrc/include/custom.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/custom.h -------------------------------------------------------------------------------- /csrc/include/custom_all_reduce.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/custom_all_reduce.cuh -------------------------------------------------------------------------------- /csrc/include/custom_all_reduce.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/custom_all_reduce.h -------------------------------------------------------------------------------- /csrc/include/dispatch_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/dispatch_utils.h -------------------------------------------------------------------------------- /csrc/include/dtype_bfloat16.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/dtype_bfloat16.cuh -------------------------------------------------------------------------------- /csrc/include/dtype_float16.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/dtype_float16.cuh -------------------------------------------------------------------------------- /csrc/include/dtype_float32.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/dtype_float32.cuh -------------------------------------------------------------------------------- /csrc/include/dtype_fp8.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/dtype_fp8.cuh -------------------------------------------------------------------------------- /csrc/include/fused_mrope_rms.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/fused_mrope_rms.h -------------------------------------------------------------------------------- /csrc/include/gemm_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/gemm_common.h -------------------------------------------------------------------------------- /csrc/include/hip_compat.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/hip_compat.h -------------------------------------------------------------------------------- /csrc/include/hip_float8.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/hip_float8.h -------------------------------------------------------------------------------- /csrc/include/hip_float8_impl.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/hip_float8_impl.h -------------------------------------------------------------------------------- /csrc/include/hip_reduce.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/hip_reduce.h -------------------------------------------------------------------------------- /csrc/include/mha_bwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/mha_bwd.h -------------------------------------------------------------------------------- /csrc/include/mha_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/mha_common.h -------------------------------------------------------------------------------- /csrc/include/mha_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/mha_fwd.h -------------------------------------------------------------------------------- /csrc/include/mla.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/mla.h -------------------------------------------------------------------------------- /csrc/include/moe_ck.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/moe_ck.h -------------------------------------------------------------------------------- /csrc/include/moe_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/moe_op.h -------------------------------------------------------------------------------- /csrc/include/moe_sorting.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/moe_sorting.h -------------------------------------------------------------------------------- /csrc/include/norm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/norm.h -------------------------------------------------------------------------------- /csrc/include/opus/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/opus/README.md -------------------------------------------------------------------------------- /csrc/include/opus/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/opus/logo.png -------------------------------------------------------------------------------- /csrc/include/opus/opus.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/opus/opus.hpp -------------------------------------------------------------------------------- /csrc/include/pa.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/pa.h -------------------------------------------------------------------------------- /csrc/include/pos_encoding.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/pos_encoding.h -------------------------------------------------------------------------------- /csrc/include/py_itfs_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/py_itfs_common.h -------------------------------------------------------------------------------- /csrc/include/quant.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quant.h -------------------------------------------------------------------------------- /csrc/include/quant_common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quant_common.cuh -------------------------------------------------------------------------------- /csrc/include/quant_utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quant_utils.cuh -------------------------------------------------------------------------------- /csrc/include/quick_all_reduce.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quick_all_reduce.cuh -------------------------------------------------------------------------------- /csrc/include/quick_all_reduce.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quick_all_reduce.h -------------------------------------------------------------------------------- /csrc/include/quick_all_reduce_base.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/quick_all_reduce_base.h -------------------------------------------------------------------------------- /csrc/include/rmsnorm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/rmsnorm.h -------------------------------------------------------------------------------- /csrc/include/rocm_ops.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/rocm_ops.hpp -------------------------------------------------------------------------------- /csrc/include/rope.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/rope.h -------------------------------------------------------------------------------- /csrc/include/sample.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/sample.h -------------------------------------------------------------------------------- /csrc/include/smoothquant.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/smoothquant.h -------------------------------------------------------------------------------- /csrc/include/topk_per_row.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/topk_per_row.h -------------------------------------------------------------------------------- /csrc/include/topk_plain.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/topk_plain.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_batch_prefill.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_batch_prefill.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_bwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_bwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_fwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_v3_bwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_v3_bwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_v3_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_v3_fwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_v3_varlen_bwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_v3_varlen_bwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_v3_varlen_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_v3_varlen_fwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_varlen_bwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_varlen_bwd.h -------------------------------------------------------------------------------- /csrc/include/torch/mha_varlen_fwd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/torch/mha_varlen_fwd.h -------------------------------------------------------------------------------- /csrc/include/vectorization.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/vectorization.cuh -------------------------------------------------------------------------------- /csrc/include/warp_sort.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/include/warp_sort.h -------------------------------------------------------------------------------- /csrc/kernels/activation_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/activation_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/attention.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/attention.cu -------------------------------------------------------------------------------- /csrc/kernels/attention_ragged.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/attention_ragged.cu -------------------------------------------------------------------------------- /csrc/kernels/attention_v1.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/attention_v1.cu -------------------------------------------------------------------------------- /csrc/kernels/binary_operator.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/binary_operator.cu -------------------------------------------------------------------------------- /csrc/kernels/cache_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/cache_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/custom_all_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/custom_all_reduce.cu -------------------------------------------------------------------------------- /csrc/kernels/custom_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/custom_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/fused_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/fused_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/fused_mrope_rms.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/fused_mrope_rms.cu -------------------------------------------------------------------------------- /csrc/kernels/generate_binaryop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/generate_binaryop.py -------------------------------------------------------------------------------- /csrc/kernels/mha_common.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mha_common.cu -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata.cu -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata/v1_1_device.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata/v1_1_device.cuh -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata/v1_1_host.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata/v1_1_host.cuh -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata/v1_2_device.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata/v1_2_device.cuh -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata/v1_2_pa_device.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata/v1_2_pa_device.cuh -------------------------------------------------------------------------------- /csrc/kernels/mla/metadata/v1_comm.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/metadata/v1_comm.cuh -------------------------------------------------------------------------------- /csrc/kernels/mla/reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/mla/reduce.cu -------------------------------------------------------------------------------- /csrc/kernels/moe_align_block_size_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/moe_align_block_size_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/moe_fused_gate.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/moe_fused_gate.cu -------------------------------------------------------------------------------- /csrc/kernels/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/pos_encoding_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/quant_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/quant_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/quick_all_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/quick_all_reduce.cu -------------------------------------------------------------------------------- /csrc/kernels/rmsnorm_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/rmsnorm_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/rope/general_bwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/rope/general_bwd_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/rope/general_fwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/rope/general_fwd_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/rope/pos_fwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/rope/pos_fwd_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/rope/rope_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/rope/rope_common.h -------------------------------------------------------------------------------- /csrc/kernels/sample_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/sample_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/solver/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/solver/Makefile -------------------------------------------------------------------------------- /csrc/kernels/solver/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/solver/README.md -------------------------------------------------------------------------------- /csrc/kernels/solver/lapack_sytrd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/solver/lapack_sytrd.py -------------------------------------------------------------------------------- /csrc/kernels/solver/sytrd_benchmark.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/solver/sytrd_benchmark.cu -------------------------------------------------------------------------------- /csrc/kernels/solver/sytrd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/solver/sytrd_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/topk_per_row_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/topk_per_row_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/topk_plain_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/topk_plain_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/topk_softmax_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/topk_softmax_kernels.cu -------------------------------------------------------------------------------- /csrc/kernels/topk_softmax_kernels_group.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/topk_softmax_kernels_group.cu -------------------------------------------------------------------------------- /csrc/kernels/unary_operator.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/kernels/unary_operator.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/attention_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/attention_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/mha_batch_prefill_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/mha_batch_prefill_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/mha_bwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/mha_bwd_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/mha_fwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/mha_fwd_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/mha_varlen_bwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/mha_varlen_bwd_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/mha_varlen_fwd_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/mha_varlen_fwd_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/moe_ck_2stages_kernel.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/moe_ck_2stages_kernel.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/moe_sorting_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/moe_sorting_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/norm_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/norm_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/rmsnorm_ck_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/rmsnorm_ck_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/smoothquant_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/smoothquant_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_ck/topk_sigmoid_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_ck/topk_sigmoid_kernels.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_a8w8_blockscale_bpreshuffle.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_a8w8_blockscale_bpreshuffle.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_communication.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_communication.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_flatmm_a8w8_blockscale.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_flatmm_a8w8_blockscale.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_fmoe.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_fmoe.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_gemm_a16w16.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_gemm_a16w16.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_gemm_a4w4.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_gemm_a4w4.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_gemm_a8w8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_gemm_a8w8.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_layernorm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_layernorm.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mha_bwd.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mha_bwd.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mha_fwd.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mha_fwd.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mha_varlen_bwd.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mha_varlen_bwd.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mha_varlen_fwd.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mha_varlen_fwd.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mi350_a8w8_blockscale.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mi350_a8w8_blockscale.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_mla.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_mla.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_moe_2stage.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_moe_2stage.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_pa.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_pa.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/asm_topksoftmax.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/asm_topksoftmax.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/custom.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/custom.cu -------------------------------------------------------------------------------- /csrc/py_itfs_cu/gemm_common.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/py_itfs_cu/gemm_common.cu -------------------------------------------------------------------------------- /csrc/pybind/activation_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/activation_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/aiter_enum_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/aiter_enum_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/aiter_operator_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/aiter_operator_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/aiter_unary_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/aiter_unary_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/asm_mi350_a8w8_blockscale_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/asm_mi350_a8w8_blockscale_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_asm_mla_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_asm_mla_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_ck_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_ck_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_ragged_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_ragged_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/attention_v1_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/attention_v1_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/batched_gemm_a8w8_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/batched_gemm_a8w8_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/batched_gemm_a8w8_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/batched_gemm_a8w8_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/batched_gemm_bf16_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/batched_gemm_bf16_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/batched_gemm_bf16_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/batched_gemm_bf16_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/cache_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/cache_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/custom_all_reduce_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/custom_all_reduce_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/custom_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/custom_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/deepgemm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/deepgemm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/flatmm_a8w8_blockscale_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/flatmm_a8w8_blockscale_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/fused_mrope_rms_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/fused_mrope_rms_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a16w16_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a16w16_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a4w4_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a4w4_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a4w4_blockscale_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a4w4_blockscale_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a4w4_blockscale_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a4w4_blockscale_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_blockscale_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_blockscale_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_blockscale_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_blockscale_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_bpreshuffle_cktile_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_bpreshuffle_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_bpreshuffle_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_bpreshuffle_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_bpreshuffle_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_a8w8_tune_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_a8w8_tune_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/gemm_common_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/gemm_common_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_batch_prefill_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_batch_prefill_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_bwd_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_bwd_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_bwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_bwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_fwd_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_fwd_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_fwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_fwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_varlen_bwd_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_varlen_bwd_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_varlen_bwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_varlen_bwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_varlen_fwd_asm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_varlen_fwd_asm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mha_varlen_fwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mha_varlen_fwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mla_metadata_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mla_metadata_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/mla_reduce_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/mla_reduce_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_ck_2stages_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_ck_2stages_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_ck_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_ck_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_cktile_2stages_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_cktile_2stages_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_op_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_op_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_sorting_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_sorting_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/moe_topk_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/moe_topk_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/norm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/norm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/pa_metadata_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/pa_metadata_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/pos_encoding_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/pos_encoding_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/quant_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/quant_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/quick_all_reduce_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/quick_all_reduce_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/rmsnorm_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/rmsnorm_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/rope_general_bwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/rope_general_bwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/rope_general_fwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/rope_general_fwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/rope_pos_fwd_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/rope_pos_fwd_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/sample_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/sample_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/smoothquant_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/smoothquant_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/topk_per_row_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/topk_per_row_pybind.cu -------------------------------------------------------------------------------- /csrc/pybind/topk_plain_pybind.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/pybind/topk_plain_pybind.cu -------------------------------------------------------------------------------- /csrc/rocm_ops.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/csrc/rocm_ops.cpp -------------------------------------------------------------------------------- /docs/aiter_container_nonroot_setup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/docs/aiter_container_nonroot_setup.md -------------------------------------------------------------------------------- /docs/autotuning_pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/docs/autotuning_pipeline.md -------------------------------------------------------------------------------- /docs/images/autotuning_ci_pipeline_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/docs/images/autotuning_ci_pipeline_1.jpeg -------------------------------------------------------------------------------- /docs/images/autotuning_ci_pipeline_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/docs/images/autotuning_ci_pipeline_2.jpeg -------------------------------------------------------------------------------- /gradlib/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/README.md -------------------------------------------------------------------------------- /gradlib/csrc/grad_funcs.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/csrc/grad_funcs.cu -------------------------------------------------------------------------------- /gradlib/csrc/hipbsolgemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/csrc/hipbsolgemm.cu -------------------------------------------------------------------------------- /gradlib/csrc/rocsolgemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/csrc/rocsolgemm.cu -------------------------------------------------------------------------------- /gradlib/gradlib/GemmTuner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/gradlib/GemmTuner.py -------------------------------------------------------------------------------- /gradlib/gradlib/gemm_tuner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/gradlib/gemm_tuner.py -------------------------------------------------------------------------------- /gradlib/include/hipbsolgemm.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/include/hipbsolgemm.cuh -------------------------------------------------------------------------------- /gradlib/include/rocsolgemm.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/include/rocsolgemm.cuh -------------------------------------------------------------------------------- /gradlib/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/gradlib/setup.py -------------------------------------------------------------------------------- /hsa/codegen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/codegen.py -------------------------------------------------------------------------------- /hsa/gfx942/all_reduce.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/all_reduce.co -------------------------------------------------------------------------------- /hsa/gfx942/allreduce_layernorm_N8192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/allreduce_layernorm_N8192.co -------------------------------------------------------------------------------- /hsa/gfx942/allreduce_rmsnorm_N8192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/allreduce_rmsnorm_N8192.co -------------------------------------------------------------------------------- /hsa/gfx942/allreduce_rmsnorm_qnt_N8192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/allreduce_rmsnorm_qnt_N8192.co -------------------------------------------------------------------------------- /hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv -------------------------------------------------------------------------------- /hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtna.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtne.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a16_rtz.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtna.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtne.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_bf16_a32_rtz.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a16_pddv.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a16_pddv.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a32.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_a32.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_causal_a16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_causal_a16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_causal_a32.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd128_fp16_causal_a32.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtna.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtne.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_bf16_a16_rtz.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_a16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_a16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_a32_pssk.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_a32_pssk.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_causal_a16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/bwd_hd64_fp16_causal_a16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_bwd/codegen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_bwd/codegen.py -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtna.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtne.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd128_bf16_rtz.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtna.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtne.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd128_bf16_rtz.co -------------------------------------------------------------------------------- /hsa/gfx942/fmha_v3_fwd/codegen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmha_v3_fwd/codegen.py -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_128.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_192.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_384.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_384.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_448.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_448.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_fp8_g1u1_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_128.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_192.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_384.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_384.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_448.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_448.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u0_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_128.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_192.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_384.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_384.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_448.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_448.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe/silu/fmoe_int8_g1u1_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_2stages/tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_2stages/tune.py -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_b16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_b16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_f16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_f16.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_blockscale_g1u1_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_blockscale_g1u1_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_128.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_192.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_384.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_384.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_448.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_448.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_multix_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_smf_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_smf_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_fp8_g1u1_smf_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_fp8_g1u1_smf_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int4fp8_g1u1_subGU_128_gelu.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int4fp8_g1u1_subGU_128_gelu.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int4fp8_g1u1_subGU_256_gelu.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int4fp8_g1u1_subGU_256_gelu.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int4fp8_g1u1_subGU_512_gelu.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int4fp8_g1u1_subGU_512_gelu.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u0.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u0.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u0_smf.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u0_smf.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_128.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_192.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_192.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_384.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_384.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_448.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_448.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_multix_subGU_512.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_multix_subGU_512.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_smf_subGU_256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_smf_subGU_256.co -------------------------------------------------------------------------------- /hsa/gfx942/fmoe_int8_g1u1_smf_subGU_320.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/fmoe_int8_g1u1_smf_subGU_320.co -------------------------------------------------------------------------------- /hsa/gfx942/gemm_a8w8_m128_noSplitK.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/gemm_a8w8_m128_noSplitK.co -------------------------------------------------------------------------------- /hsa/gfx942/gemm_a8w8_m128_splitK.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/gemm_a8w8_m128_splitK.co -------------------------------------------------------------------------------- /hsa/gfx942/i8gemm/i8gemm_bf16_perTokenI8.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/i8gemm/i8gemm_bf16_perTokenI8.csv -------------------------------------------------------------------------------- /hsa/gfx942/layer_norm.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/layer_norm.co -------------------------------------------------------------------------------- /hsa/gfx942/layer_norm_qnt.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/layer_norm_qnt.co -------------------------------------------------------------------------------- /hsa/gfx942/mla/mla_a8w8_qh128_m32x4_n16x2_msk1.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/mla/mla_a8w8_qh128_m32x4_n16x2_msk1.co -------------------------------------------------------------------------------- /hsa/gfx942/mla/mla_a8w8_qh16_qseqlen1_gqaratio16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/mla/mla_a8w8_qh16_qseqlen1_gqaratio16.co -------------------------------------------------------------------------------- /hsa/gfx942/mla/mla_a8w8_qh16_qseqlen2_gqaratio16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/mla/mla_a8w8_qh16_qseqlen2_gqaratio16.co -------------------------------------------------------------------------------- /hsa/gfx942/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_asm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_asm.csv -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_noquant_gqa16_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_noquant_gqa8_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa16_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w_hp.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa16_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_bf16_pertokenInt8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_noquant_gqa16_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_noquant_gqa16_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_noquant_gqa8_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa16_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w_hp.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa16_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa/pa_fp16_pertokenInt8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w16_b16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w16_b16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w16_f16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w16_f16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_2tg_g8_f8_q_fp16_tail_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_2tg_g8_f8_q_fp16_tail_bf16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_b16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_b16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_b16_2tg_g8_f8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_b16_2tg_g8_f8.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_b16_2tg_g8_i8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_b16_2tg_g8_i8.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_bf16_2tg_g8_f8_gemm1_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_bf16_2tg_g8_f8_gemm1_bf16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_bf16_2tg_g8_f8_tail_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_bf16_2tg_g8_f8_tail_bf16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_f16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_f16.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_f16_2tg_g8_f8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_f16_2tg_g8_f8.co -------------------------------------------------------------------------------- /hsa/gfx942/pa_a16w8_f16_2tg_g8_i8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/pa_a16w8_f16_2tg_g8_i8.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_12x128x6.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_12x128x6.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_12x128x8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_12x128x8.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_12x256x6.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_12x256x6.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_12x256x8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_12x256x8.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_4x128x6.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_4x128x6.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_4x128x8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_4x128x8.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_4x256x6.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_4x256x6.co -------------------------------------------------------------------------------- /hsa/gfx942/topksoftmax/topksoftmax_4x256x8.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx942/topksoftmax/topksoftmax_4x256x8.co -------------------------------------------------------------------------------- /hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256.co -------------------------------------------------------------------------------- /hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv -------------------------------------------------------------------------------- /hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv -------------------------------------------------------------------------------- /hsa/gfx950/f8_block_scale_mi350_x128.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/f8_block_scale_mi350_x128.co -------------------------------------------------------------------------------- /hsa/gfx950/f8_block_scale_mi350_x32.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/f8_block_scale_mi350_x32.co -------------------------------------------------------------------------------- /hsa/gfx950/f8_block_scale_mi350_x64.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/f8_block_scale_mi350_x64.co -------------------------------------------------------------------------------- /hsa/gfx950/f8_block_scale_mi350_x96.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/f8_block_scale_mi350_x96.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd128_dq_shuffle.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd128_dq_shuffle.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd128_odo_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd128_odo_bf16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd128_odo_fp16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd128_odo_fp16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd192_dq_shuffle.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd192_dq_shuffle.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd192_odo_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd192_odo_bf16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd192_odo_fp16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd192_odo_fp16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtna.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtna.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtne.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtne.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtz.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_bf16_a16_rtz.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_fp16_a16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_fp16_a16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_fp16_a32_pssk.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_fp16_a32_pssk.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_odo_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_odo_bf16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/bwd_hd64_odo_fp16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/bwd_hd64_odo_fp16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_bwd/codegen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_bwd/codegen.py -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_fwd/codegen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_fwd/codegen.py -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_group.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_group.co -------------------------------------------------------------------------------- /hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmha_v3_fwd/fwd_hd192_hd128_bf16.co -------------------------------------------------------------------------------- /hsa/gfx950/fmoe_2stages/tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/fmoe_2stages/tune.py -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_asm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_asm.csv -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_bf16_noquant_gqa16_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_bf16_noquant_gqa8_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_fp16_noquant_gqa16_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_fp16_noquant_gqa8_1tg_4w.co -------------------------------------------------------------------------------- /hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/gfx950/pa/pa_fp16_pertokenFp8_gqa8_2tg_4w.co -------------------------------------------------------------------------------- /hsa/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/hsa/readme.md -------------------------------------------------------------------------------- /op_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/__init__.py -------------------------------------------------------------------------------- /op_tests/cpp/mha/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/README.md -------------------------------------------------------------------------------- /op_tests/cpp/mha/benchmark_mha_bwd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/benchmark_mha_bwd.cpp -------------------------------------------------------------------------------- /op_tests/cpp/mha/benchmark_mha_fwd.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/benchmark_mha_fwd.cpp -------------------------------------------------------------------------------- /op_tests/cpp/mha/build_mha.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/build_mha.sh -------------------------------------------------------------------------------- /op_tests/cpp/mha/compile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/compile.py -------------------------------------------------------------------------------- /op_tests/cpp/mha/images/causal-bwd-perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/images/causal-bwd-perf.png -------------------------------------------------------------------------------- /op_tests/cpp/mha/images/causal-fwd-perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/images/causal-fwd-perf.png -------------------------------------------------------------------------------- /op_tests/cpp/mha/images/non-causal-bwd-perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/images/non-causal-bwd-perf.png -------------------------------------------------------------------------------- /op_tests/cpp/mha/images/non-causal-fwd-perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/images/non-causal-fwd-perf.png -------------------------------------------------------------------------------- /op_tests/cpp/mha/smoke_test_bwd.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/smoke_test_bwd.sh -------------------------------------------------------------------------------- /op_tests/cpp/mha/smoke_test_bwd_v3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/smoke_test_bwd_v3.sh -------------------------------------------------------------------------------- /op_tests/cpp/mha/smoke_test_fwd.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/smoke_test_fwd.sh -------------------------------------------------------------------------------- /op_tests/cpp/mha/smoke_test_fwd_v3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/cpp/mha/smoke_test_fwd_v3.sh -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_allgather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_allgather.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_communication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_communication.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_custom_allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_custom_allreduce.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_dispatch_combine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_dispatch_combine.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_fused_ar_rms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_fused_ar_rms.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_mori_all2all.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_mori_all2all.py -------------------------------------------------------------------------------- /op_tests/multigpu_tests/test_quick_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/multigpu_tests/test_quick_all_reduce.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_gmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_gmm.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_hstu_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_hstu_attn.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_la.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_la.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_mha.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_moe.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_moe_mx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_moe_mx.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_pa_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_pa_decode.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_rmsnorm.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_rope.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_schema.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_schema.yaml -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/bench_topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/bench_topk.py -------------------------------------------------------------------------------- /op_tests/op_benchmarks/triton/utils/argparse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/op_benchmarks/triton/utils/argparse.py -------------------------------------------------------------------------------- /op_tests/test_activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_activation.py -------------------------------------------------------------------------------- /op_tests/test_aiter_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_aiter_add.py -------------------------------------------------------------------------------- /op_tests/test_aiter_addInp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_aiter_addInp.py -------------------------------------------------------------------------------- /op_tests/test_aiter_sigmoid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_aiter_sigmoid.py -------------------------------------------------------------------------------- /op_tests/test_batch_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_batch_prefill.py -------------------------------------------------------------------------------- /op_tests/test_batched_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_batched_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/test_batched_gemm_bf16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_batched_gemm_bf16.py -------------------------------------------------------------------------------- /op_tests/test_concat_cache_mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_concat_cache_mla.py -------------------------------------------------------------------------------- /op_tests/test_deepgemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_deepgemm.py -------------------------------------------------------------------------------- /op_tests/test_fused_mrope_rms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_fused_mrope_rms.py -------------------------------------------------------------------------------- /op_tests/test_gemm_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_gemm_a16w16.py -------------------------------------------------------------------------------- /op_tests/test_gemm_a4w4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_gemm_a4w4.py -------------------------------------------------------------------------------- /op_tests/test_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/test_gemm_a8w8_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_gemm_a8w8_blockscale.py -------------------------------------------------------------------------------- /op_tests/test_gemm_a8w8_blockscale_mi350.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_gemm_a8w8_blockscale_mi350.py -------------------------------------------------------------------------------- /op_tests/test_indexer_k_quant_and_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_indexer_k_quant_and_cache.py -------------------------------------------------------------------------------- /op_tests/test_kvcache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_kvcache.py -------------------------------------------------------------------------------- /op_tests/test_kvcache_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_kvcache_blockscale.py -------------------------------------------------------------------------------- /op_tests/test_layernorm2d.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_layernorm2d.py -------------------------------------------------------------------------------- /op_tests/test_layernorm2dFusedAddQuant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_layernorm2dFusedAddQuant.py -------------------------------------------------------------------------------- /op_tests/test_mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mha.py -------------------------------------------------------------------------------- /op_tests/test_mha_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mha_fp8.py -------------------------------------------------------------------------------- /op_tests/test_mha_varlen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mha_varlen.py -------------------------------------------------------------------------------- /op_tests/test_mha_varlen_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mha_varlen_fp8.py -------------------------------------------------------------------------------- /op_tests/test_mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mla.py -------------------------------------------------------------------------------- /op_tests/test_mla_persistent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mla_persistent.py -------------------------------------------------------------------------------- /op_tests/test_mla_sparse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_mla_sparse.py -------------------------------------------------------------------------------- /op_tests/test_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe.py -------------------------------------------------------------------------------- /op_tests/test_moeTopkSoftmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moeTopkSoftmax.py -------------------------------------------------------------------------------- /op_tests/test_moe_2stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_2stage.py -------------------------------------------------------------------------------- /op_tests/test_moe_blockscale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_blockscale.py -------------------------------------------------------------------------------- /op_tests/test_moe_dp_share_expert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_dp_share_expert.py -------------------------------------------------------------------------------- /op_tests/test_moe_ep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_ep.py -------------------------------------------------------------------------------- /op_tests/test_moe_sorting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_sorting.py -------------------------------------------------------------------------------- /op_tests/test_moe_sorting_mxfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_sorting_mxfp4.py -------------------------------------------------------------------------------- /op_tests/test_moe_tkw1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_tkw1.py -------------------------------------------------------------------------------- /op_tests/test_moe_topk_sigmoid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_moe_topk_sigmoid.py -------------------------------------------------------------------------------- /op_tests/test_pa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa.py -------------------------------------------------------------------------------- /op_tests/test_pa_mtp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa_mtp.py -------------------------------------------------------------------------------- /op_tests/test_pa_ps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa_ps.py -------------------------------------------------------------------------------- /op_tests/test_pa_ragged.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa_ragged.py -------------------------------------------------------------------------------- /op_tests/test_pa_ragged_experimental.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa_ragged_experimental.py -------------------------------------------------------------------------------- /op_tests/test_pa_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_pa_v1.py -------------------------------------------------------------------------------- /op_tests/test_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_quant.py -------------------------------------------------------------------------------- /op_tests/test_rmsnorm2d.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_rmsnorm2d.py -------------------------------------------------------------------------------- /op_tests/test_rmsnorm2dFusedAddQuant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_rmsnorm2dFusedAddQuant.py -------------------------------------------------------------------------------- /op_tests/test_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_rope.py -------------------------------------------------------------------------------- /op_tests/test_sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_sample.py -------------------------------------------------------------------------------- /op_tests/test_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_sampling.py -------------------------------------------------------------------------------- /op_tests/test_smoothquant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_smoothquant.py -------------------------------------------------------------------------------- /op_tests/test_topk_per_row.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_topk_per_row.py -------------------------------------------------------------------------------- /op_tests/test_topk_plain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/test_topk_plain.py -------------------------------------------------------------------------------- /op_tests/triton_tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/README.md -------------------------------------------------------------------------------- /op_tests/triton_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/__init__.py -------------------------------------------------------------------------------- /op_tests/triton_tests/ff_test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/ff_test_utils.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_activation.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_batched_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_batched_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_batched_gemm_bf16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_batched_gemm_bf16.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_chunked_pa_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_chunked_pa_prefill.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_extend_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_extend_attention.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_ff_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_ff_a16w16.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_ff_a16w16_fused.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_ff_a16w16_fused.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fp8_mqa_logits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fp8_mqa_logits.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fused_fp8_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fused_fp8_quant.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fused_kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fused_kv_cache.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fused_mul_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fused_mul_add.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fused_mxfp4_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fused_mxfp4_quant.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_fused_qk_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_fused_qk_concat.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_a16w16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_a16w16.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_a16w16_gated.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_a16w16_gated.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_a16wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_a16wfp4.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_a8wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_a8wfp4.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gemm_afp4wfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gemm_afp4wfp4.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_gmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_gmm.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_hstu_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_hstu_attn.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_la.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_la.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_la_paged.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_la_paged.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_layernorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_layernorm.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_mha.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_mla_decode_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_mla_decode_rope.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_moe.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_moe_gemm_a8w4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_moe_gemm_a8w4.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_moe_gemm_a8w8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_moe_gemm_a8w8.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_moe_mx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_moe_mx.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_moe_routing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_moe_routing.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_pa_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_pa_decode.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_pa_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_pa_prefill.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_prefill_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_prefill_attention.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_quant.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_quant_mxfp4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_quant_mxfp4.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_rmsnorm.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_rope.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_softmax.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_topk.py -------------------------------------------------------------------------------- /op_tests/triton_tests/test_unified_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/test_unified_attention.py -------------------------------------------------------------------------------- /op_tests/triton_tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/utils/__init__.py -------------------------------------------------------------------------------- /op_tests/triton_tests/utils/mla_decode_ref.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/utils/mla_decode_ref.py -------------------------------------------------------------------------------- /op_tests/triton_tests/utils/mla_extend_ref.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/utils/mla_extend_ref.py -------------------------------------------------------------------------------- /op_tests/triton_tests/utils/rotary_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/utils/rotary_embedding.py -------------------------------------------------------------------------------- /op_tests/triton_tests/utils/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/op_tests/triton_tests/utils/types.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/aiter/HEAD/setup.py --------------------------------------------------------------------------------