├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── benchmark ├── NVIDIA-A100-SXM4-80GB │ ├── memory │ │ ├── memory.parquet │ │ ├── peak_memory_bwd_torch.bfloat16.png │ │ └── peak_memory_fwd_torch.bfloat16.png │ └── runtime │ │ ├── results.html │ │ ├── tri_attn_bwd_torch.bfloat16.csv │ │ ├── tri_attn_bwd_torch.bfloat16.png │ │ ├── tri_attn_fwd_torch.bfloat16.csv │ │ └── tri_attn_fwd_torch.bfloat16.png └── NVIDIA-GeForce-RTX-3090 │ ├── memory │ ├── memory.parquet │ ├── peak_memory_bwd_torch.bfloat16.png │ └── peak_memory_fwd_torch.bfloat16.png │ └── runtime │ ├── results.html │ ├── tri_attn_bwd_torch.bfloat16.csv │ ├── tri_attn_bwd_torch.bfloat16.png │ ├── tri_attn_fwd_torch.bfloat16.csv │ └── tri_attn_fwd_torch.bfloat16.png ├── pyproject.toml ├── pyrightconfig.json ├── scripts ├── __init__.py ├── benchmark.py ├── compile.py ├── memory.py └── run.py ├── src └── trifast │ ├── __init__.py │ ├── autotune.py │ ├── autotune_helpers.py │ ├── equiv.py │ ├── torch.py │ ├── triton.py │ ├── tune.py │ └── utils.py ├── tests ├── __init__.py ├── conftest.py ├── unit │ ├── test_trifast.py │ └── test_trifast_opcheck.py └── utils.py └── uv.lock /.github/workflows/publish.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/.github/workflows/publish.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/.gitignore -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/README.md -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/memory/memory.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/memory/memory.parquet -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/memory/peak_memory_bwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/memory/peak_memory_bwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/memory/peak_memory_fwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/memory/peak_memory_fwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/runtime/results.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/runtime/results.html -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_bwd_torch.bfloat16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_bwd_torch.bfloat16.csv -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_bwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_bwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_fwd_torch.bfloat16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_fwd_torch.bfloat16.csv -------------------------------------------------------------------------------- /benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_fwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-A100-SXM4-80GB/runtime/tri_attn_fwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/memory/memory.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/memory/memory.parquet -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/memory/peak_memory_bwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/memory/peak_memory_bwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/memory/peak_memory_fwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/memory/peak_memory_fwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/runtime/results.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/runtime/results.html -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_bwd_torch.bfloat16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_bwd_torch.bfloat16.csv -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_bwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_bwd_torch.bfloat16.png -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_fwd_torch.bfloat16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_fwd_torch.bfloat16.csv -------------------------------------------------------------------------------- /benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_fwd_torch.bfloat16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/benchmark/NVIDIA-GeForce-RTX-3090/runtime/tri_attn_fwd_torch.bfloat16.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/pyproject.toml -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/pyrightconfig.json -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/scripts/benchmark.py -------------------------------------------------------------------------------- /scripts/compile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/scripts/compile.py -------------------------------------------------------------------------------- /scripts/memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/scripts/memory.py -------------------------------------------------------------------------------- /scripts/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/scripts/run.py -------------------------------------------------------------------------------- /src/trifast/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/__init__.py -------------------------------------------------------------------------------- /src/trifast/autotune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/autotune.py -------------------------------------------------------------------------------- /src/trifast/autotune_helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/autotune_helpers.py -------------------------------------------------------------------------------- /src/trifast/equiv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/equiv.py -------------------------------------------------------------------------------- /src/trifast/torch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/torch.py -------------------------------------------------------------------------------- /src/trifast/triton.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/triton.py -------------------------------------------------------------------------------- /src/trifast/tune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/tune.py -------------------------------------------------------------------------------- /src/trifast/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/src/trifast/utils.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/test_trifast.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/tests/unit/test_trifast.py -------------------------------------------------------------------------------- /tests/unit/test_trifast_opcheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/tests/unit/test_trifast_opcheck.py -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/tests/utils.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/latkins/trifast/HEAD/uv.lock --------------------------------------------------------------------------------