├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── attention_simulator ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── examples │ └── language_modeling │ │ ├── configs │ │ ├── experiment_realnews_transformer_bf16_flash_sigmoid_multilayer_rope.yaml │ │ ├── experiment_realnews_transformer_bf16_flash_softmax_multilayer_rope.yaml │ │ ├── experiment_realnews_transformer_sigmoid_alibi.yaml │ │ ├── experiment_tiny_shakespeare_transformer_sigmoid_rope.yaml │ │ └── experiment_tiny_shakespeare_transformer_softmax_sincos.yaml │ │ └── train_autoregressive_language_model.py ├── pyproject.toml └── src │ └── attention_simulator │ ├── autoregressive_language_model.py │ ├── helpers │ ├── grapher.py │ ├── params.py │ └── utils.py │ └── layers │ ├── activations.py │ ├── attention.py │ ├── container.py │ ├── flash_sigmoid_attention.py │ ├── flash_softmax_attention.py │ ├── initialization.py │ ├── linear.py │ ├── masking.py │ ├── mlp.py │ ├── normalization.py │ ├── position_embedding.py │ └── transformer.py ├── figures ├── H100_noalibi_BWD_Full_2.7_0.06_Causal_6.19_0.06.png ├── H100_noalibi_FWD_Full_17.39_0.07_Causal_18.76_0.06.png └── train_nll_softmax_vs_sigmoid.png ├── flash_sigmoid ├── .github │ └── workflows │ │ └── publish.yml ├── .gitignore ├── .gitmodules ├── AUTHORS ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── assets │ ├── flash2_a100_fwd_bwd_benchmark.png │ ├── flash2_h100_fwd_bwd_benchmark.png │ ├── flashattention_logo.png │ ├── flashattn_banner.jpg │ ├── flashattn_banner.pdf │ ├── flashattn_memory.jpg │ ├── flashattn_speedup.jpg │ ├── flashattn_speedup_3090.jpg │ ├── flashattn_speedup_a100_d128.jpg │ ├── flashattn_speedup_t4.jpg │ ├── flashattn_speedup_t4_fwd.jpg │ ├── gpt2_training_curve.jpg │ ├── gpt2_training_efficiency.jpg │ ├── gpt3_training_curve.jpg │ └── gpt3_training_efficiency.jpg ├── benchmarks │ ├── benchmark_alibi.py │ ├── benchmark_causal.py │ └── benchmark_flash_attention.py ├── csrc │ ├── flash_sigmoid │ │ ├── flash_api.cpp │ │ └── src │ │ │ ├── alibi.h │ │ │ ├── block_info.h │ │ │ ├── dropout.h │ │ │ ├── flash.h │ │ │ ├── flash_bwd_hdim128_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim128_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim160_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim160_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim192_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim192_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim224_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim224_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim256_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim256_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim32_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim32_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim64_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim64_fp16_sm80.cu │ │ │ ├── flash_bwd_hdim96_bf16_sm80.cu │ │ │ ├── flash_bwd_hdim96_fp16_sm80.cu │ │ │ ├── flash_bwd_kernel.h │ │ │ ├── flash_bwd_launch_template.h │ │ │ ├── flash_bwd_preprocess_kernel.h │ │ │ ├── flash_fwd_hdim128_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim128_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim160_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim160_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim192_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim192_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim224_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim224_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim256_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim256_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim32_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim32_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim64_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim64_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim96_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim96_fp16_sm80.cu │ │ │ ├── flash_fwd_kernel.h │ │ │ ├── flash_fwd_launch_template.h │ │ │ ├── flash_fwd_split_hdim128_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim128_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim160_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim160_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim192_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim192_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim224_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim224_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim256_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim256_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim32_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim32_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim64_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim64_fp16_sm80.cu │ │ │ ├── flash_fwd_split_hdim96_bf16_sm80.cu │ │ │ ├── flash_fwd_split_hdim96_fp16_sm80.cu │ │ │ ├── generate_kernels.py │ │ │ ├── kernel_traits.h │ │ │ ├── mask.h │ │ │ ├── philox.cuh │ │ │ ├── rotary.h │ │ │ ├── softmax.h │ │ │ ├── static_switch.h │ │ │ └── utils.h │ ├── ft_attention │ │ ├── README.md │ │ ├── cuda_bf16_fallbacks.cuh │ │ ├── cuda_bf16_wrapper.h │ │ ├── decoder_masked_multihead_attention.cu │ │ ├── decoder_masked_multihead_attention.h │ │ ├── decoder_masked_multihead_attention_template.hpp │ │ ├── decoder_masked_multihead_attention_utils.h │ │ ├── ft_attention.cpp │ │ └── setup.py │ ├── fused_dense_lib │ │ ├── README.md │ │ ├── fused_dense.cpp │ │ ├── fused_dense_cuda.cu │ │ └── setup.py │ ├── fused_softmax │ │ ├── fused_softmax.cpp │ │ ├── scaled_masked_softmax.h │ │ ├── scaled_masked_softmax_cuda.cu │ │ ├── scaled_upper_triang_masked_softmax.h │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ ├── setup.py │ │ └── type_shim.h │ ├── layer_norm │ │ ├── README.md │ │ ├── ln.h │ │ ├── ln_api.cpp │ │ ├── ln_bwd_1024.cu │ │ ├── ln_bwd_1280.cu │ │ ├── ln_bwd_1536.cu │ │ ├── ln_bwd_2048.cu │ │ ├── ln_bwd_256.cu │ │ ├── ln_bwd_2560.cu │ │ ├── ln_bwd_3072.cu │ │ ├── ln_bwd_4096.cu │ │ ├── ln_bwd_512.cu │ │ ├── ln_bwd_5120.cu │ │ ├── ln_bwd_6144.cu │ │ ├── ln_bwd_7168.cu │ │ ├── ln_bwd_768.cu │ │ ├── ln_bwd_8192.cu │ │ ├── ln_bwd_kernels.cuh │ │ ├── ln_fwd_1024.cu │ │ ├── ln_fwd_1280.cu │ │ ├── ln_fwd_1536.cu │ │ ├── ln_fwd_2048.cu │ │ ├── ln_fwd_256.cu │ │ ├── ln_fwd_2560.cu │ │ ├── ln_fwd_3072.cu │ │ ├── ln_fwd_4096.cu │ │ ├── ln_fwd_512.cu │ │ ├── ln_fwd_5120.cu │ │ ├── ln_fwd_6144.cu │ │ ├── ln_fwd_7168.cu │ │ ├── ln_fwd_768.cu │ │ ├── ln_fwd_8192.cu │ │ ├── ln_fwd_kernels.cuh │ │ ├── ln_kernel_traits.h │ │ ├── ln_parallel_bwd_1024.cu │ │ ├── ln_parallel_bwd_1280.cu │ │ ├── ln_parallel_bwd_1536.cu │ │ ├── ln_parallel_bwd_2048.cu │ │ ├── ln_parallel_bwd_256.cu │ │ ├── ln_parallel_bwd_2560.cu │ │ ├── ln_parallel_bwd_3072.cu │ │ ├── ln_parallel_bwd_4096.cu │ │ ├── ln_parallel_bwd_512.cu │ │ ├── ln_parallel_bwd_5120.cu │ │ ├── ln_parallel_bwd_6144.cu │ │ ├── ln_parallel_bwd_7168.cu │ │ ├── ln_parallel_bwd_768.cu │ │ ├── ln_parallel_bwd_8192.cu │ │ ├── ln_parallel_fwd_1024.cu │ │ ├── ln_parallel_fwd_1280.cu │ │ ├── ln_parallel_fwd_1536.cu │ │ ├── ln_parallel_fwd_2048.cu │ │ ├── ln_parallel_fwd_256.cu │ │ ├── ln_parallel_fwd_2560.cu │ │ ├── ln_parallel_fwd_3072.cu │ │ ├── ln_parallel_fwd_4096.cu │ │ ├── ln_parallel_fwd_512.cu │ │ ├── ln_parallel_fwd_5120.cu │ │ ├── ln_parallel_fwd_6144.cu │ │ ├── ln_parallel_fwd_7168.cu │ │ ├── ln_parallel_fwd_768.cu │ │ ├── ln_parallel_fwd_8192.cu │ │ ├── ln_parallel_residual_bwd_kernels.cuh │ │ ├── ln_parallel_residual_fwd_kernels.cuh │ │ ├── ln_utils.cuh │ │ ├── setup.py │ │ └── static_switch.h │ ├── rotary │ │ ├── rotary.cpp │ │ ├── rotary_cuda.cu │ │ └── setup.py │ └── xentropy │ │ ├── README.md │ │ ├── interface.cpp │ │ ├── setup.py │ │ └── xentropy_kernel.cu ├── examples │ └── inference │ │ └── README.md ├── flash_sigmoid │ ├── __init__.py │ ├── bert_padding.py │ ├── flash_attn_interface.py │ ├── flash_attn_triton.py │ ├── flash_attn_triton_og.py │ ├── flash_blocksparse_attention.py │ ├── flash_blocksparse_attn_interface.py │ ├── fused_softmax.py │ ├── layers │ │ ├── __init__.py │ │ ├── patch_embed.py │ │ └── rotary.py │ ├── losses │ │ ├── __init__.py │ │ └── cross_entropy.py │ ├── models │ │ ├── __init__.py │ │ ├── baichuan.py │ │ ├── bert.py │ │ ├── bigcode.py │ │ ├── btlm.py │ │ ├── falcon.py │ │ ├── gpt.py │ │ ├── gpt_neox.py │ │ ├── gptj.py │ │ ├── llama.py │ │ ├── opt.py │ │ └── vit.py │ ├── modules │ │ ├── __init__.py │ │ ├── block.py │ │ ├── embedding.py │ │ ├── mha.py │ │ └── mlp.py │ ├── ops │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── fused_dense.py │ │ ├── layer_norm.py │ │ ├── rms_norm.py │ │ └── triton │ │ │ ├── __init__.py │ │ │ ├── cross_entropy.py │ │ │ ├── k_activations.py │ │ │ ├── layer_norm.py │ │ │ ├── linear.py │ │ │ ├── mlp.py │ │ │ └── rotary.py │ ├── pyproject.toml │ └── utils │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── distributed.py │ │ ├── generation.py │ │ └── pretrained.py ├── setup.py ├── tests │ ├── layers │ │ └── test_rotary.py │ ├── losses │ │ ├── test_cross_entropy.py │ │ └── test_cross_entropy_parallel.py │ ├── models │ │ ├── test_baichuan.py │ │ ├── test_bert.py │ │ ├── test_bigcode.py │ │ ├── test_btlm.py │ │ ├── test_falcon.py │ │ ├── test_gpt.py │ │ ├── test_gpt_generation_parallel.py │ │ ├── test_gpt_neox.py │ │ ├── test_gpt_parallel.py │ │ ├── test_gptj.py │ │ ├── test_llama.py │ │ ├── test_opt.py │ │ └── test_vit.py │ ├── modules │ │ ├── test_block_parallel.py │ │ ├── test_embedding_parallel.py │ │ ├── test_mha_parallel.py │ │ └── test_mlp_parallel.py │ ├── ops │ │ ├── test_dropout_layer_norm.py │ │ ├── test_fused_dense.py │ │ ├── test_fused_dense_parallel.py │ │ └── triton │ │ │ └── test_layer_norm.py │ ├── pyproject.toml │ ├── test_flash_attn.py │ └── test_rotary.py ├── training │ ├── Dockerfile │ ├── README.md │ ├── configs │ │ ├── callbacks │ │ │ ├── causality-monitor.yaml │ │ │ ├── default.yaml │ │ │ ├── ema.yaml │ │ │ ├── flop-count.yaml │ │ │ ├── gpu-monitor.yaml │ │ │ ├── model-summary.yaml │ │ │ ├── none.yaml │ │ │ ├── norm-monitor.yaml │ │ │ ├── params-log.yaml │ │ │ └── wandb.yaml │ │ ├── config.yaml │ │ ├── datamodule │ │ │ ├── openwebtext.yaml │ │ │ └── thepile.yaml │ │ ├── experiment │ │ │ ├── owt │ │ │ │ ├── base.yaml │ │ │ │ ├── gpt2l-flash.yaml │ │ │ │ ├── gpt2l-hf.yaml │ │ │ │ ├── gpt2l.yaml │ │ │ │ ├── gpt2m-flash.yaml │ │ │ │ ├── gpt2m-hf.yaml │ │ │ │ ├── gpt2m.yaml │ │ │ │ ├── gpt2s-flash.yaml │ │ │ │ ├── gpt2s-hf.yaml │ │ │ │ ├── gpt2s.yaml │ │ │ │ ├── gpt2xl-flash.yaml │ │ │ │ ├── gpt2xl-hf.yaml │ │ │ │ └── gpt2xl.yaml │ │ │ └── pile │ │ │ │ ├── base.yaml │ │ │ │ ├── gpt3-2.7B-flash-8k.yaml │ │ │ │ ├── gpt3-2.7B-flash-hdim128-rotary-8k.yaml │ │ │ │ ├── gpt3-2.7B-flash-hdim128-rotary.yaml │ │ │ │ ├── gpt3-2.7B-flash-hdim128.yaml │ │ │ │ ├── gpt3-2.7B-flash-rotary-8k.yaml │ │ │ │ ├── gpt3-2.7B-flash-rotary.yaml │ │ │ │ ├── gpt3-2.7B-flash.yaml │ │ │ │ ├── gpt3-2.7B-hf-hdim128.yaml │ │ │ │ ├── gpt3-2.7B-hf.yaml │ │ │ │ ├── gpt3l-flash-8k.yaml │ │ │ │ ├── gpt3l-flash-rotary-30B.yaml │ │ │ │ ├── gpt3l-flash-rotary-8k.yaml │ │ │ │ ├── gpt3l-flash-rotary.yaml │ │ │ │ ├── gpt3l-flash.yaml │ │ │ │ ├── gpt3l-hf.yaml │ │ │ │ ├── gpt3m-flash-8k.yaml │ │ │ │ ├── gpt3m-flash-rotary-30B.yaml │ │ │ │ ├── gpt3m-flash-rotary-8k.yaml │ │ │ │ ├── gpt3m-flash-rotary.yaml │ │ │ │ ├── gpt3m-flash.yaml │ │ │ │ ├── gpt3m-hf.yaml │ │ │ │ ├── gpt3s-flash-8k.yaml │ │ │ │ ├── gpt3s-flash-rotary-30B.yaml │ │ │ │ ├── gpt3s-flash-rotary-8k.yaml │ │ │ │ ├── gpt3s-flash-rotary.yaml │ │ │ │ ├── gpt3s-flash.yaml │ │ │ │ ├── gpt3s-hf.yaml │ │ │ │ ├── gpt3xl-flash-8k.yaml │ │ │ │ ├── gpt3xl-flash-rotary-60B.yaml │ │ │ │ ├── gpt3xl-flash-rotary-8k.yaml │ │ │ │ ├── gpt3xl-flash-rotary.yaml │ │ │ │ ├── gpt3xl-flash.yaml │ │ │ │ └── gpt3xl-hf.yaml │ │ ├── logger │ │ │ ├── comet.yaml │ │ │ ├── csv.yaml │ │ │ ├── many_loggers.yaml │ │ │ ├── mlflow.yaml │ │ │ ├── neptune.yaml │ │ │ ├── tensorboard.yaml │ │ │ └── wandb.yaml │ │ ├── metrics │ │ │ ├── acc.yaml │ │ │ ├── acc_ignore_index.yaml │ │ │ ├── acctop5.yaml │ │ │ ├── mse.yaml │ │ │ ├── num-tokens.yaml │ │ │ └── perplexity.yaml │ │ ├── mode │ │ │ ├── debug.yaml │ │ │ ├── default.yaml │ │ │ ├── exp.yaml │ │ │ ├── profile.yaml │ │ │ └── smoke.yaml │ │ ├── model │ │ │ ├── gpt2-hf.yaml │ │ │ ├── gpt2.yaml │ │ │ └── gpt2model │ │ │ │ ├── gpt2-large.yaml │ │ │ │ ├── gpt2-medium.yaml │ │ │ │ ├── gpt2-small.yaml │ │ │ │ └── gpt2-xlarge.yaml │ │ ├── optimizer │ │ │ ├── adam.yaml │ │ │ ├── adamw-apex-distributed.yaml │ │ │ ├── adamw-apex-zero.yaml │ │ │ ├── adamw-apex.yaml │ │ │ ├── adamw-zero.yaml │ │ │ ├── adamw.yaml │ │ │ ├── fusedlamb-ds.yaml │ │ │ ├── fusedlamb.yaml │ │ │ └── sgd.yaml │ │ ├── scheduler │ │ │ ├── cosine-warmup-timm.yaml │ │ │ ├── cosine-warmup.yaml │ │ │ ├── invsqrt.yaml │ │ │ ├── linear-warmup.yaml │ │ │ ├── multi-step.yaml │ │ │ ├── plateau.yaml │ │ │ ├── poly-warmup.yaml │ │ │ └── step.yaml │ │ ├── task │ │ │ └── sequence-model.yaml │ │ └── trainer │ │ │ ├── all_params.yaml │ │ │ ├── ddp.yaml │ │ │ ├── debug.yaml │ │ │ └── default.yaml │ ├── run.py │ ├── src │ │ ├── callbacks │ │ │ ├── __init__.py │ │ │ ├── causality_monitor.py │ │ │ ├── ema.py │ │ │ ├── flop_count.py │ │ │ ├── gpu_affinity.py │ │ │ ├── loss_scale_monitor.py │ │ │ ├── model_checkpoint.py │ │ │ ├── norm_monitor.py │ │ │ ├── params_log.py │ │ │ ├── speed_monitor.py │ │ │ └── wandb_callbacks.py │ │ ├── datamodules │ │ │ ├── datasets │ │ │ │ ├── detokenizer.py │ │ │ │ └── lm_dataset.py │ │ │ ├── fault_tolerant_sampler.py │ │ │ ├── imagenet.py │ │ │ ├── language_modeling_hf.py │ │ │ └── timm_mixup.py │ │ ├── distributed │ │ │ └── ddp_comm_hooks.py │ │ ├── eval.py │ │ ├── metrics │ │ │ ├── accuracy.py │ │ │ ├── num_tokens.py │ │ │ └── perplexity.py │ │ ├── models │ │ │ └── modules │ │ │ │ └── seq_common.py │ │ ├── optim │ │ │ ├── param_grouping.py │ │ │ └── timm_lr_scheduler.py │ │ ├── tasks │ │ │ └── seq.py │ │ ├── train.py │ │ └── utils │ │ │ ├── checkpoint.py │ │ │ ├── ddp_zero1.py │ │ │ ├── ddp_zero2.py │ │ │ ├── distributed.py │ │ │ ├── ema.py │ │ │ ├── flops.py │ │ │ ├── gpu_affinity.py │ │ │ └── utils.py │ └── tests │ │ └── datamodules │ │ └── test_language_modeling_hf.py └── usage.md ├── optorch ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── pyproject.toml └── src │ ├── optorch │ ├── __init__.py │ ├── adam.py │ ├── amp.py │ ├── builder.py │ ├── common.py │ ├── lars.py │ ├── schedule.py │ └── sgd.py │ └── tests │ ├── test_adam.py │ ├── test_lars.py │ └── test_sgd.py ├── pretrained └── axlearn_load_pretrained.ipynb └── setup.bash /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/.gitmodules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/README.md -------------------------------------------------------------------------------- /attention_simulator/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/.gitignore -------------------------------------------------------------------------------- /attention_simulator/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/.pre-commit-config.yaml -------------------------------------------------------------------------------- /attention_simulator/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/LICENSE -------------------------------------------------------------------------------- /attention_simulator/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/README.md -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_bf16_flash_sigmoid_multilayer_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_bf16_flash_sigmoid_multilayer_rope.yaml -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_bf16_flash_softmax_multilayer_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_bf16_flash_softmax_multilayer_rope.yaml -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_sigmoid_alibi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/configs/experiment_realnews_transformer_sigmoid_alibi.yaml -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/configs/experiment_tiny_shakespeare_transformer_sigmoid_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/configs/experiment_tiny_shakespeare_transformer_sigmoid_rope.yaml -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/configs/experiment_tiny_shakespeare_transformer_softmax_sincos.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/configs/experiment_tiny_shakespeare_transformer_softmax_sincos.yaml -------------------------------------------------------------------------------- /attention_simulator/examples/language_modeling/train_autoregressive_language_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/examples/language_modeling/train_autoregressive_language_model.py -------------------------------------------------------------------------------- /attention_simulator/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/pyproject.toml -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/autoregressive_language_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/autoregressive_language_model.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/helpers/grapher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/helpers/grapher.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/helpers/params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/helpers/params.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/helpers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/helpers/utils.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/activations.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/attention.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/container.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/container.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/flash_sigmoid_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/flash_sigmoid_attention.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/flash_softmax_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/flash_softmax_attention.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/initialization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/initialization.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/linear.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/masking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/masking.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/mlp.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/normalization.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/position_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/position_embedding.py -------------------------------------------------------------------------------- /attention_simulator/src/attention_simulator/layers/transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/attention_simulator/src/attention_simulator/layers/transformer.py -------------------------------------------------------------------------------- /figures/H100_noalibi_BWD_Full_2.7_0.06_Causal_6.19_0.06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/figures/H100_noalibi_BWD_Full_2.7_0.06_Causal_6.19_0.06.png -------------------------------------------------------------------------------- /figures/H100_noalibi_FWD_Full_17.39_0.07_Causal_18.76_0.06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/figures/H100_noalibi_FWD_Full_17.39_0.07_Causal_18.76_0.06.png -------------------------------------------------------------------------------- /figures/train_nll_softmax_vs_sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/figures/train_nll_softmax_vs_sigmoid.png -------------------------------------------------------------------------------- /flash_sigmoid/.github/workflows/publish.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/.github/workflows/publish.yml -------------------------------------------------------------------------------- /flash_sigmoid/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/.gitignore -------------------------------------------------------------------------------- /flash_sigmoid/.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/.gitmodules -------------------------------------------------------------------------------- /flash_sigmoid/AUTHORS: -------------------------------------------------------------------------------- 1 | Tri Dao, trid@cs.stanford.edu -------------------------------------------------------------------------------- /flash_sigmoid/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/LICENSE -------------------------------------------------------------------------------- /flash_sigmoid/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/MANIFEST.in -------------------------------------------------------------------------------- /flash_sigmoid/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/Makefile -------------------------------------------------------------------------------- /flash_sigmoid/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/README.md -------------------------------------------------------------------------------- /flash_sigmoid/assets/flash2_a100_fwd_bwd_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flash2_a100_fwd_bwd_benchmark.png -------------------------------------------------------------------------------- /flash_sigmoid/assets/flash2_h100_fwd_bwd_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flash2_h100_fwd_bwd_benchmark.png -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattention_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattention_logo.png -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_banner.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_banner.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_banner.pdf -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_memory.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_memory.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_speedup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_speedup.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_speedup_3090.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_speedup_3090.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_speedup_a100_d128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_speedup_a100_d128.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_speedup_t4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_speedup_t4.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/flashattn_speedup_t4_fwd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/flashattn_speedup_t4_fwd.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/gpt2_training_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/gpt2_training_curve.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/gpt2_training_efficiency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/gpt2_training_efficiency.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/gpt3_training_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/gpt3_training_curve.jpg -------------------------------------------------------------------------------- /flash_sigmoid/assets/gpt3_training_efficiency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/assets/gpt3_training_efficiency.jpg -------------------------------------------------------------------------------- /flash_sigmoid/benchmarks/benchmark_alibi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/benchmarks/benchmark_alibi.py -------------------------------------------------------------------------------- /flash_sigmoid/benchmarks/benchmark_causal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/benchmarks/benchmark_causal.py -------------------------------------------------------------------------------- /flash_sigmoid/benchmarks/benchmark_flash_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/benchmarks/benchmark_flash_attention.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/flash_api.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/flash_api.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/alibi.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/alibi.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/block_info.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/block_info.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/dropout.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/dropout.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim128_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim128_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim128_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim160_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim160_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim160_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim192_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim192_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim192_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim224_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim224_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim224_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim256_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim256_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim256_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim32_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim32_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim32_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim64_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim64_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim64_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim96_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim96_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_hdim96_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_kernel.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_launch_template.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_launch_template.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_preprocess_kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_bwd_preprocess_kernel.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim128_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim128_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim128_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim160_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim160_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim160_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim192_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim192_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim192_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim224_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim224_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim224_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim256_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim256_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim256_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim32_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim32_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim32_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim64_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim64_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim64_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim96_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim96_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_hdim96_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_kernel.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_launch_template.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_launch_template.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim128_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim128_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim128_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim160_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim160_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim160_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim192_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim192_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim192_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim224_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim224_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim224_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim256_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim256_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim256_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim32_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim32_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim32_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim64_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim64_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim64_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim96_bf16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim96_bf16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/flash_fwd_split_hdim96_fp16_sm80.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/generate_kernels.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/generate_kernels.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/kernel_traits.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/kernel_traits.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/mask.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/mask.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/philox.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/philox.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/rotary.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/rotary.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/softmax.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/softmax.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/static_switch.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/static_switch.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/flash_sigmoid/src/utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/flash_sigmoid/src/utils.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/README.md -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/cuda_bf16_fallbacks.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/cuda_bf16_fallbacks.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/cuda_bf16_wrapper.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention_template.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention_template.hpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/decoder_masked_multihead_attention_utils.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/ft_attention.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/ft_attention.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/ft_attention/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/ft_attention/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_dense_lib/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_dense_lib/README.md -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_dense_lib/fused_dense.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_dense_lib/fused_dense.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_dense_lib/fused_dense_cuda.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_dense_lib/fused_dense_cuda.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_dense_lib/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_dense_lib/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/fused_softmax.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/fused_softmax.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/scaled_masked_softmax.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/scaled_masked_softmax.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/scaled_masked_softmax_cuda.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/scaled_upper_triang_masked_softmax.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/scaled_upper_triang_masked_softmax.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/scaled_upper_triang_masked_softmax_cuda.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/fused_softmax/type_shim.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/fused_softmax/type_shim.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/README.md -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_api.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_api.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_1024.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_1024.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_1280.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_1280.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_1536.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_1536.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_2048.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_2048.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_256.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_256.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_2560.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_2560.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_3072.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_3072.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_4096.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_4096.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_512.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_512.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_5120.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_5120.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_6144.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_6144.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_7168.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_7168.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_768.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_768.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_8192.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_8192.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_bwd_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_bwd_kernels.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_1024.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_1024.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_1280.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_1280.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_1536.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_1536.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_2048.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_2048.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_256.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_256.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_2560.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_2560.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_3072.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_3072.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_4096.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_4096.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_512.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_512.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_5120.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_5120.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_6144.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_6144.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_7168.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_7168.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_768.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_768.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_8192.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_8192.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_fwd_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_fwd_kernels.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_kernel_traits.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_kernel_traits.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1024.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1024.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1280.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1280.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1536.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_1536.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_2048.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_2048.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_256.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_256.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_2560.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_2560.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_3072.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_3072.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_4096.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_4096.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_512.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_512.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_5120.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_5120.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_6144.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_6144.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_7168.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_7168.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_768.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_768.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_8192.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_bwd_8192.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1024.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1024.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1280.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1280.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1536.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_1536.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_2048.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_2048.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_256.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_256.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_2560.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_2560.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_3072.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_3072.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_4096.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_4096.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_512.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_512.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_5120.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_5120.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_6144.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_6144.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_7168.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_7168.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_768.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_768.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_8192.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_fwd_8192.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_residual_bwd_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_residual_bwd_kernels.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_parallel_residual_fwd_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_parallel_residual_fwd_kernels.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/ln_utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/ln_utils.cuh -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/layer_norm/static_switch.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/layer_norm/static_switch.h -------------------------------------------------------------------------------- /flash_sigmoid/csrc/rotary/rotary.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/rotary/rotary.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/rotary/rotary_cuda.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/rotary/rotary_cuda.cu -------------------------------------------------------------------------------- /flash_sigmoid/csrc/rotary/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/rotary/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/xentropy/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/xentropy/README.md -------------------------------------------------------------------------------- /flash_sigmoid/csrc/xentropy/interface.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/xentropy/interface.cpp -------------------------------------------------------------------------------- /flash_sigmoid/csrc/xentropy/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/xentropy/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/csrc/xentropy/xentropy_kernel.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/csrc/xentropy/xentropy_kernel.cu -------------------------------------------------------------------------------- /flash_sigmoid/examples/inference/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/examples/inference/README.md -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/__init__.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/bert_padding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/bert_padding.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/flash_attn_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/flash_attn_interface.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/flash_attn_triton.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/flash_attn_triton.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/flash_attn_triton_og.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/flash_attn_triton_og.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/flash_blocksparse_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/flash_blocksparse_attention.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/flash_blocksparse_attn_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/flash_blocksparse_attn_interface.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/fused_softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/fused_softmax.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/layers/patch_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/layers/patch_embed.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/layers/rotary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/layers/rotary.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/losses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/losses/cross_entropy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/losses/cross_entropy.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/baichuan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/baichuan.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/bert.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/bigcode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/bigcode.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/btlm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/btlm.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/falcon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/falcon.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/gpt.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/gpt_neox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/gpt_neox.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/gptj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/gptj.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/llama.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/opt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/opt.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/models/vit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/models/vit.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/modules/block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/modules/block.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/modules/embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/modules/embedding.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/modules/mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/modules/mha.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/modules/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/modules/mlp.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/activations.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/fused_dense.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/fused_dense.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/layer_norm.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/rms_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/rms_norm.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/cross_entropy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/cross_entropy.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/k_activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/k_activations.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/layer_norm.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/linear.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/mlp.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/ops/triton/rotary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/ops/triton/rotary.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | target-version = ['py38'] -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/utils/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/utils/benchmark.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/utils/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/utils/distributed.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/utils/generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/utils/generation.py -------------------------------------------------------------------------------- /flash_sigmoid/flash_sigmoid/utils/pretrained.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/flash_sigmoid/utils/pretrained.py -------------------------------------------------------------------------------- /flash_sigmoid/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/setup.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/layers/test_rotary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/layers/test_rotary.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/losses/test_cross_entropy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/losses/test_cross_entropy.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/losses/test_cross_entropy_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/losses/test_cross_entropy_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_baichuan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_baichuan.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_bert.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_bigcode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_bigcode.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_btlm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_btlm.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_falcon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_falcon.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_gpt.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_gpt_generation_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_gpt_generation_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_gpt_neox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_gpt_neox.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_gpt_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_gpt_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_gptj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_gptj.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_llama.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_opt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_opt.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/models/test_vit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/models/test_vit.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/modules/test_block_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/modules/test_block_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/modules/test_embedding_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/modules/test_embedding_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/modules/test_mha_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/modules/test_mha_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/modules/test_mlp_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/modules/test_mlp_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/ops/test_dropout_layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/ops/test_dropout_layer_norm.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/ops/test_fused_dense.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/ops/test_fused_dense.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/ops/test_fused_dense_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/ops/test_fused_dense_parallel.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/ops/triton/test_layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/ops/triton/test_layer_norm.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | target-version = ['py38'] -------------------------------------------------------------------------------- /flash_sigmoid/tests/test_flash_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/test_flash_attn.py -------------------------------------------------------------------------------- /flash_sigmoid/tests/test_rotary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/tests/test_rotary.py -------------------------------------------------------------------------------- /flash_sigmoid/training/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/Dockerfile -------------------------------------------------------------------------------- /flash_sigmoid/training/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/README.md -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/causality-monitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/causality-monitor.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/default.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/ema.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/ema.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/flop-count.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/flop-count.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/gpu-monitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/gpu-monitor.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/model-summary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/model-summary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/norm-monitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/norm-monitor.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/params-log.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/params-log.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/callbacks/wandb.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/callbacks/wandb.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/config.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/datamodule/openwebtext.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/datamodule/openwebtext.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/datamodule/thepile.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/datamodule/thepile.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/base.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2l-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2l-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2l-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2l-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2l.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2l.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2m-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2m-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2m-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2m-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2m.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2m.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2s-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2s-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2s-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2s-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2s.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2s.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2xl-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2xl-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2xl-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2xl-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/owt/gpt2xl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/owt/gpt2xl.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/base.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-hdim128.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-hf-hdim128.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-hf-hdim128.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3-2.7B-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary-30B.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary-30B.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-flash-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3l-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3l-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary-30B.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary-30B.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-flash-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3m-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3m-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary-30B.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary-30B.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-flash-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3s-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3s-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary-60B.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary-60B.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary-8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary-8k.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash-rotary.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-flash.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/experiment/pile/gpt3xl-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/experiment/pile/gpt3xl-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/comet.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/comet.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/csv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/csv.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/many_loggers.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/mlflow.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/neptune.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/tensorboard.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/logger/wandb.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/acc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/acc.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/acc_ignore_index.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/acc_ignore_index.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/acctop5.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/acctop5.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/mse.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/mse.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/num-tokens.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/num-tokens.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/metrics/perplexity.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/metrics/perplexity.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/mode/debug.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/mode/debug.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/mode/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/mode/default.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/mode/exp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/mode/exp.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/mode/profile.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/mode/profile.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/mode/smoke.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/mode/smoke.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2-hf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2-hf.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2model/gpt2-large.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2model/gpt2-large.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2model/gpt2-medium.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2model/gpt2-medium.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2model/gpt2-small.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2model/gpt2-small.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/model/gpt2model/gpt2-xlarge.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/model/gpt2model/gpt2-xlarge.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adam.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adam.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adamw-apex-distributed.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adamw-apex-distributed.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adamw-apex-zero.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adamw-apex-zero.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adamw-apex.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adamw-apex.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adamw-zero.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adamw-zero.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/adamw.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/adamw.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/fusedlamb-ds.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/fusedlamb-ds.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/fusedlamb.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/fusedlamb.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/optimizer/sgd.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/optimizer/sgd.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/cosine-warmup-timm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/cosine-warmup-timm.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/cosine-warmup.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/cosine-warmup.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/invsqrt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/invsqrt.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/linear-warmup.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/linear-warmup.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/multi-step.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/multi-step.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/plateau.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/plateau.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/poly-warmup.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/poly-warmup.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/scheduler/step.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/scheduler/step.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/task/sequence-model.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.tasks.seq.SequenceModel 2 | -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/trainer/all_params.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/trainer/all_params.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/trainer/ddp.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/trainer/debug.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/trainer/debug.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/configs/trainer/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/configs/trainer/default.yaml -------------------------------------------------------------------------------- /flash_sigmoid/training/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/run.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/causality_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/causality_monitor.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/ema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/ema.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/flop_count.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/flop_count.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/gpu_affinity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/gpu_affinity.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/loss_scale_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/loss_scale_monitor.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/model_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/model_checkpoint.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/norm_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/norm_monitor.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/params_log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/params_log.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/speed_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/speed_monitor.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/callbacks/wandb_callbacks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/callbacks/wandb_callbacks.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/datasets/detokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/datasets/detokenizer.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/datasets/lm_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/datasets/lm_dataset.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/fault_tolerant_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/fault_tolerant_sampler.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/imagenet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/imagenet.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/language_modeling_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/language_modeling_hf.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/datamodules/timm_mixup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/datamodules/timm_mixup.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/distributed/ddp_comm_hooks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/distributed/ddp_comm_hooks.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/eval.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/metrics/accuracy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/metrics/accuracy.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/metrics/num_tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/metrics/num_tokens.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/metrics/perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/metrics/perplexity.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/models/modules/seq_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/models/modules/seq_common.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/optim/param_grouping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/optim/param_grouping.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/optim/timm_lr_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/optim/timm_lr_scheduler.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/tasks/seq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/tasks/seq.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/train.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/checkpoint.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/ddp_zero1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/ddp_zero1.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/ddp_zero2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/ddp_zero2.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/distributed.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/ema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/ema.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/flops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/flops.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/gpu_affinity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/gpu_affinity.py -------------------------------------------------------------------------------- /flash_sigmoid/training/src/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/src/utils/utils.py -------------------------------------------------------------------------------- /flash_sigmoid/training/tests/datamodules/test_language_modeling_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/training/tests/datamodules/test_language_modeling_hf.py -------------------------------------------------------------------------------- /flash_sigmoid/usage.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/flash_sigmoid/usage.md -------------------------------------------------------------------------------- /optorch/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/.gitignore -------------------------------------------------------------------------------- /optorch/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/.pre-commit-config.yaml -------------------------------------------------------------------------------- /optorch/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/LICENSE -------------------------------------------------------------------------------- /optorch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/README.md -------------------------------------------------------------------------------- /optorch/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/pyproject.toml -------------------------------------------------------------------------------- /optorch/src/optorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/__init__.py -------------------------------------------------------------------------------- /optorch/src/optorch/adam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/adam.py -------------------------------------------------------------------------------- /optorch/src/optorch/amp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/amp.py -------------------------------------------------------------------------------- /optorch/src/optorch/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/builder.py -------------------------------------------------------------------------------- /optorch/src/optorch/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/common.py -------------------------------------------------------------------------------- /optorch/src/optorch/lars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/lars.py -------------------------------------------------------------------------------- /optorch/src/optorch/schedule.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/schedule.py -------------------------------------------------------------------------------- /optorch/src/optorch/sgd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/optorch/sgd.py -------------------------------------------------------------------------------- /optorch/src/tests/test_adam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/tests/test_adam.py -------------------------------------------------------------------------------- /optorch/src/tests/test_lars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/tests/test_lars.py -------------------------------------------------------------------------------- /optorch/src/tests/test_sgd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/optorch/src/tests/test_sgd.py -------------------------------------------------------------------------------- /pretrained/axlearn_load_pretrained.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/pretrained/axlearn_load_pretrained.ipynb -------------------------------------------------------------------------------- /setup.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-sigmoid-attention/HEAD/setup.bash --------------------------------------------------------------------------------