├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── finetuning ├── README.md ├── data.py ├── factorized.py ├── finetune.ipynb ├── finetune.py ├── gemma.py ├── optimizer.py ├── requirements.txt ├── rope.py ├── runs │ ├── gemma4bpt_math_adafactor.yaml │ ├── gemma4bpt_math_adam_bs1.yaml │ ├── gemma4bpt_math_adam_bs16.yaml │ └── gemma4bpt_math_lora.yaml ├── sampler.py └── utils.py ├── plots ├── adam_2d.png ├── finetune_bar.png └── gpt3xl_sgd.png ├── pretraining ├── README.md ├── configs │ ├── base.yaml │ ├── dataset │ │ ├── c4_t5all.yaml │ │ ├── fw_gpt2.yaml │ │ └── fwedu_gpt2.yaml │ ├── hparams │ │ └── lm11m_fwedu_adam.yaml │ ├── model │ │ ├── gpt2s.yaml │ │ ├── gpt3xl.yaml │ │ ├── lm11m.yaml │ │ └── lm19m.yaml │ └── resolver_setup.py ├── data.py ├── download_fineweb.py ├── factorized.py ├── main.py ├── model.py ├── optimizer.py ├── requirements.txt ├── rope.py ├── runs │ ├── gpt2s │ │ ├── gpt2s_adam_2d.yaml │ │ └── optimizer_comp │ │ │ ├── final_configs.sh │ │ │ ├── gpt2s_adafactor.yaml │ │ │ ├── gpt2s_adam_bs1_b2.yaml │ │ │ ├── gpt2s_adam_bs1_t2.yaml │ │ │ ├── gpt2s_adam_bs512.yaml │ │ │ └── gpt2s_sgd.yaml │ ├── gpt3xl │ │ └── gpt3xl_optimizer_comp.sh │ ├── lm19m │ │ ├── fig10_fixed_b2.yaml │ │ └── fig10_fixed_t2.yaml │ └── lm30m │ │ ├── hparam_scaling │ │ ├── lm30m_adam_b1.yaml │ │ ├── lm30m_adam_b2.yaml │ │ ├── lm30m_adam_lr.yaml │ │ └── lm30m_adam_t2.yaml │ │ ├── lm30m_adam_sensitivity.yaml │ │ └── optimizer_comp │ │ ├── lm30m_adafactor_grid.yaml │ │ ├── lm30m_adam_grid.yaml │ │ ├── lm30m_muon_grid.yaml │ │ └── lm30m_sgd_grid.yaml ├── train.ipynb ├── train.py └── utils.py └── utils ├── adam_1d.ipynb ├── adam_2d.ipynb ├── fig10.ipynb ├── finetuning.ipynb ├── gpt.ipynb ├── memory_measure.ipynb ├── memory_plot.ipynb ├── optimizer_comparison.ipynb ├── toy.ipynb └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-documentation 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/README.md -------------------------------------------------------------------------------- /finetuning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/README.md -------------------------------------------------------------------------------- /finetuning/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/data.py -------------------------------------------------------------------------------- /finetuning/factorized.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/factorized.py -------------------------------------------------------------------------------- /finetuning/finetune.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/finetune.ipynb -------------------------------------------------------------------------------- /finetuning/finetune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/finetune.py -------------------------------------------------------------------------------- /finetuning/gemma.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/gemma.py -------------------------------------------------------------------------------- /finetuning/optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/optimizer.py -------------------------------------------------------------------------------- /finetuning/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/requirements.txt -------------------------------------------------------------------------------- /finetuning/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/rope.py -------------------------------------------------------------------------------- /finetuning/runs/gemma4bpt_math_adafactor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/runs/gemma4bpt_math_adafactor.yaml -------------------------------------------------------------------------------- /finetuning/runs/gemma4bpt_math_adam_bs1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/runs/gemma4bpt_math_adam_bs1.yaml -------------------------------------------------------------------------------- /finetuning/runs/gemma4bpt_math_adam_bs16.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/runs/gemma4bpt_math_adam_bs16.yaml -------------------------------------------------------------------------------- /finetuning/runs/gemma4bpt_math_lora.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/runs/gemma4bpt_math_lora.yaml -------------------------------------------------------------------------------- /finetuning/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/sampler.py -------------------------------------------------------------------------------- /finetuning/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/finetuning/utils.py -------------------------------------------------------------------------------- /plots/adam_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/plots/adam_2d.png -------------------------------------------------------------------------------- /plots/finetune_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/plots/finetune_bar.png -------------------------------------------------------------------------------- /plots/gpt3xl_sgd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/plots/gpt3xl_sgd.png -------------------------------------------------------------------------------- /pretraining/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/README.md -------------------------------------------------------------------------------- /pretraining/configs/base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/base.yaml -------------------------------------------------------------------------------- /pretraining/configs/dataset/c4_t5all.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/dataset/c4_t5all.yaml -------------------------------------------------------------------------------- /pretraining/configs/dataset/fw_gpt2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | ds_path: '~/datasets/fineweb_gpt2.bin' 4 | 5 | model: 6 | V: 50257 7 | -------------------------------------------------------------------------------- /pretraining/configs/dataset/fwedu_gpt2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | ds_path: '~/datasets/finewebedu_gpt2.bin' 4 | 5 | model: 6 | V: 50257 7 | -------------------------------------------------------------------------------- /pretraining/configs/hparams/lm11m_fwedu_adam.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/hparams/lm11m_fwedu_adam.yaml -------------------------------------------------------------------------------- /pretraining/configs/model/gpt2s.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/model/gpt2s.yaml -------------------------------------------------------------------------------- /pretraining/configs/model/gpt3xl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/model/gpt3xl.yaml -------------------------------------------------------------------------------- /pretraining/configs/model/lm11m.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/model/lm11m.yaml -------------------------------------------------------------------------------- /pretraining/configs/model/lm19m.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/model/lm19m.yaml -------------------------------------------------------------------------------- /pretraining/configs/resolver_setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/configs/resolver_setup.py -------------------------------------------------------------------------------- /pretraining/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/data.py -------------------------------------------------------------------------------- /pretraining/download_fineweb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/download_fineweb.py -------------------------------------------------------------------------------- /pretraining/factorized.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/factorized.py -------------------------------------------------------------------------------- /pretraining/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/main.py -------------------------------------------------------------------------------- /pretraining/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/model.py -------------------------------------------------------------------------------- /pretraining/optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/optimizer.py -------------------------------------------------------------------------------- /pretraining/requirements.txt: -------------------------------------------------------------------------------- 1 | jax 2 | flax==0.12.0 3 | optax 4 | numpy 5 | tqdm 6 | hydra-core 7 | wandb -------------------------------------------------------------------------------- /pretraining/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/rope.py -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/gpt2s_adam_2d.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/gpt2s_adam_2d.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/final_configs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/final_configs.sh -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/gpt2s_adafactor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/gpt2s_adafactor.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs1_b2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs1_b2.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs1_t2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs1_t2.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs512.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/gpt2s_adam_bs512.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt2s/optimizer_comp/gpt2s_sgd.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt2s/optimizer_comp/gpt2s_sgd.yaml -------------------------------------------------------------------------------- /pretraining/runs/gpt3xl/gpt3xl_optimizer_comp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/gpt3xl/gpt3xl_optimizer_comp.sh -------------------------------------------------------------------------------- /pretraining/runs/lm19m/fig10_fixed_b2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm19m/fig10_fixed_b2.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm19m/fig10_fixed_t2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm19m/fig10_fixed_t2.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/hparam_scaling/lm30m_adam_b1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/hparam_scaling/lm30m_adam_b1.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/hparam_scaling/lm30m_adam_b2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/hparam_scaling/lm30m_adam_b2.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/hparam_scaling/lm30m_adam_lr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/hparam_scaling/lm30m_adam_lr.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/hparam_scaling/lm30m_adam_t2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/hparam_scaling/lm30m_adam_t2.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/lm30m_adam_sensitivity.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/lm30m_adam_sensitivity.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/optimizer_comp/lm30m_adafactor_grid.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/optimizer_comp/lm30m_adafactor_grid.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/optimizer_comp/lm30m_adam_grid.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/optimizer_comp/lm30m_adam_grid.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/optimizer_comp/lm30m_muon_grid.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/optimizer_comp/lm30m_muon_grid.yaml -------------------------------------------------------------------------------- /pretraining/runs/lm30m/optimizer_comp/lm30m_sgd_grid.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/runs/lm30m/optimizer_comp/lm30m_sgd_grid.yaml -------------------------------------------------------------------------------- /pretraining/train.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/train.ipynb -------------------------------------------------------------------------------- /pretraining/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/train.py -------------------------------------------------------------------------------- /pretraining/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/pretraining/utils.py -------------------------------------------------------------------------------- /utils/adam_1d.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/adam_1d.ipynb -------------------------------------------------------------------------------- /utils/adam_2d.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/adam_2d.ipynb -------------------------------------------------------------------------------- /utils/fig10.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/fig10.ipynb -------------------------------------------------------------------------------- /utils/finetuning.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/finetuning.ipynb -------------------------------------------------------------------------------- /utils/gpt.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/gpt.ipynb -------------------------------------------------------------------------------- /utils/memory_measure.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/memory_measure.ipynb -------------------------------------------------------------------------------- /utils/memory_plot.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/memory_plot.ipynb -------------------------------------------------------------------------------- /utils/optimizer_comparison.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/optimizer_comparison.ipynb -------------------------------------------------------------------------------- /utils/toy.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/toy.ipynb -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martin-marek/batch-size/HEAD/utils/utils.py --------------------------------------------------------------------------------