├── .github └── workflows │ ├── publish-pypi.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── configs ├── 10B │ └── H100_simple.toml ├── 150M │ ├── 3090.toml │ ├── A40.toml │ ├── H100-fast.toml │ └── H100.toml ├── 150M_short │ ├── 3090.toml │ ├── A40.toml │ └── H100.toml ├── 1B │ └── H100.toml ├── 70M │ └── H100.toml ├── 7B │ └── H100.toml ├── debug │ └── normal.toml └── test.toml ├── helo.py ├── install.sh ├── pyproject.toml ├── scripts └── subset_data.py ├── src ├── muon │ ├── LICENSE │ ├── README.md │ ├── muon_fsdp2.py │ └── pyproject.toml └── zeroband │ ├── __init__.py │ ├── data.py │ ├── logger.py │ ├── lr_scheduler.py │ ├── models │ ├── __init__.py │ ├── llama │ │ ├── __init__.py │ │ └── model.py │ └── norms.py │ ├── utils.py │ └── world_info.py ├── train_ddp.py ├── train_fsdp.py └── uv.lock /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/.github/workflows/publish-pypi.yml -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/.github/workflows/ruff.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/README.md -------------------------------------------------------------------------------- /configs/10B/H100_simple.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/10B/H100_simple.toml -------------------------------------------------------------------------------- /configs/150M/3090.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M/3090.toml -------------------------------------------------------------------------------- /configs/150M/A40.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M/A40.toml -------------------------------------------------------------------------------- /configs/150M/H100-fast.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M/H100-fast.toml -------------------------------------------------------------------------------- /configs/150M/H100.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M/H100.toml -------------------------------------------------------------------------------- /configs/150M_short/3090.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M_short/3090.toml -------------------------------------------------------------------------------- /configs/150M_short/A40.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M_short/A40.toml -------------------------------------------------------------------------------- /configs/150M_short/H100.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/150M_short/H100.toml -------------------------------------------------------------------------------- /configs/1B/H100.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/1B/H100.toml -------------------------------------------------------------------------------- /configs/70M/H100.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/70M/H100.toml -------------------------------------------------------------------------------- /configs/7B/H100.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/7B/H100.toml -------------------------------------------------------------------------------- /configs/debug/normal.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/debug/normal.toml -------------------------------------------------------------------------------- /configs/test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/configs/test.toml -------------------------------------------------------------------------------- /helo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/helo.py -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/install.sh -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/subset_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/scripts/subset_data.py -------------------------------------------------------------------------------- /src/muon/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/muon/LICENSE -------------------------------------------------------------------------------- /src/muon/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/muon/README.md -------------------------------------------------------------------------------- /src/muon/muon_fsdp2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/muon/muon_fsdp2.py -------------------------------------------------------------------------------- /src/muon/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/muon/pyproject.toml -------------------------------------------------------------------------------- /src/zeroband/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zeroband/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/data.py -------------------------------------------------------------------------------- /src/zeroband/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/logger.py -------------------------------------------------------------------------------- /src/zeroband/lr_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/lr_scheduler.py -------------------------------------------------------------------------------- /src/zeroband/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zeroband/models/llama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/models/llama/__init__.py -------------------------------------------------------------------------------- /src/zeroband/models/llama/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/models/llama/model.py -------------------------------------------------------------------------------- /src/zeroband/models/norms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/models/norms.py -------------------------------------------------------------------------------- /src/zeroband/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/utils.py -------------------------------------------------------------------------------- /src/zeroband/world_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/src/zeroband/world_info.py -------------------------------------------------------------------------------- /train_ddp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/train_ddp.py -------------------------------------------------------------------------------- /train_fsdp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/train_fsdp.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samsja/muon_fsdp_2/HEAD/uv.lock --------------------------------------------------------------------------------