├── .gitignore ├── README.md ├── bert ├── .python-version ├── LICENSE.txt ├── README.md ├── conda_env.yaml ├── cramming │ ├── __init__.py │ ├── architectures │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── components.py │ │ ├── construction.py │ │ ├── embeddings.py │ │ ├── fixed_cramlm.py │ │ ├── funnel_transformers.py │ │ ├── fused_layers.py │ │ ├── gpt2.py │ │ ├── huggingface_interface.py │ │ ├── losses.py │ │ ├── recurrent_transformers.py │ │ ├── sanity_check.py │ │ ├── scriptable_bert.py │ │ └── t5.py │ ├── backend │ │ ├── __init__.py │ │ ├── deepspeed_integration.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ ├── adahessian.py │ │ │ ├── adamw_scale.py │ │ │ ├── lion_pytorch.py │ │ │ ├── optimizer_modifiers.py │ │ │ ├── progressive_batching.py │ │ │ ├── schedulers.py │ │ │ ├── shampoo │ │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ │ ├── CONTRIBUTING.md │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── matrix_functions.py │ │ │ │ ├── shampoo.py │ │ │ │ └── shampoo_utils.py │ │ │ └── sophiag.py │ │ ├── prepare_backend.py │ │ ├── torch_default.py │ │ └── utils.py │ ├── config │ │ ├── __init__.py │ │ ├── arch │ │ │ ├── __init__.py │ │ │ ├── bert-base.yaml │ │ │ ├── bert-c2.yaml │ │ │ ├── bert-c3.yaml │ │ │ ├── bert-c4.yaml │ │ │ ├── bert-c5.yaml │ │ │ ├── bert-i4.yaml │ │ │ ├── bert-large-izsak.yaml │ │ │ ├── bert-original.yaml │ │ │ ├── bert-tiny.yaml │ │ │ ├── funnel-c2.yaml │ │ │ ├── hf-bert-base.yaml │ │ │ ├── hf-bert-tiny.yaml │ │ │ ├── recurrent-c2.yaml │ │ │ └── sanitycheck.yaml │ │ ├── cfg_eval.yaml │ │ ├── cfg_eval_pt.yaml │ │ ├── cfg_pretrain.yaml │ │ ├── cfg_save_losses.yaml │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── bert-default.yaml │ │ │ ├── bookcorpus-wikipedia.yaml │ │ │ ├── c4-subset-processed.yaml │ │ │ ├── c4-subset-random.yaml │ │ │ ├── c4-subset.yaml │ │ │ ├── minipile.yaml │ │ │ ├── sanity-check-1.yaml │ │ │ ├── sanity-check-2.yaml │ │ │ ├── sources │ │ │ │ ├── ag_news.yaml │ │ │ │ ├── bookcorpus.yaml │ │ │ │ ├── c4.yaml │ │ │ │ ├── c4_non_streaming.yaml │ │ │ │ ├── fake.yaml │ │ │ │ ├── minipile.yaml │ │ │ │ ├── the_pile.yaml │ │ │ │ ├── the_pileCC.yaml │ │ │ │ ├── the_pile_natural.yaml │ │ │ │ └── wikipedia.yaml │ │ │ ├── the-pile-natural.yaml │ │ │ └── the-pile.yaml │ │ ├── eval │ │ │ ├── GLUE.yaml │ │ │ ├── GLUE_sane.yaml │ │ │ ├── GLUEmosbach.yaml │ │ │ ├── SuperGLUE.yaml │ │ │ ├── __init__.py │ │ │ ├── boolq.yaml │ │ │ ├── mnli.yaml │ │ │ ├── optim │ │ │ │ └── adam.yaml │ │ │ ├── save_losses_rho_loss.yaml │ │ │ └── tasks │ │ │ │ ├── boolq.yaml │ │ │ │ ├── cb.yaml │ │ │ │ ├── cola.yaml │ │ │ │ ├── copa.yaml │ │ │ │ ├── mnli.yaml │ │ │ │ ├── mrpc.yaml │ │ │ │ ├── multirc.yaml │ │ │ │ ├── qnli.yaml │ │ │ │ ├── qqp.yaml │ │ │ │ ├── record.yaml │ │ │ │ ├── rte.yaml │ │ │ │ ├── rte_superglue.yaml │ │ │ │ ├── sst2.yaml │ │ │ │ ├── stsb.yaml │ │ │ │ ├── wic.yaml │ │ │ │ ├── wnli.yaml │ │ │ │ └── wsc.yaml │ │ ├── hydra │ │ │ ├── __init__.py │ │ │ └── job_logging │ │ │ │ └── custom.yaml │ │ ├── impl │ │ │ ├── __init__.py │ │ │ ├── _default.yaml │ │ │ ├── data_structure │ │ │ │ ├── LMDB.yaml │ │ │ │ ├── RAM.yaml │ │ │ │ ├── from-disk.yaml │ │ │ │ └── none.yaml │ │ │ ├── deepspeed-hf.yaml │ │ │ ├── deepspeed.yaml │ │ │ ├── onnx.yaml │ │ │ ├── save_losses_rho_loss.yaml │ │ │ └── torch-default.yaml │ │ ├── piotr │ │ │ ├── default.yaml │ │ │ └── task │ │ │ │ ├── ft.yaml │ │ │ │ └── pt.yaml │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── bert-base.yaml │ │ │ ├── bert-izsak.yaml │ │ │ ├── bert-o1.yaml │ │ │ ├── bert-o2.yaml │ │ │ ├── bert-o3.yaml │ │ │ ├── 
bert-original.yaml │ │ │ ├── optim │ │ │ │ ├── adafactor.yaml │ │ │ │ ├── adahessian.yaml │ │ │ │ ├── adam.yaml │ │ │ │ ├── adam_classic.yaml │ │ │ │ ├── lion.yaml │ │ │ │ ├── radam.yaml │ │ │ │ ├── sgd.yaml │ │ │ │ ├── shampoo.yaml │ │ │ │ └── sophiag.yaml │ │ │ └── optim_mod │ │ │ │ ├── disabled.yaml │ │ │ │ ├── larc.yaml │ │ │ │ ├── lars.yaml │ │ │ │ ├── progressive.yaml │ │ │ │ └── sam.yaml │ │ └── wandb │ │ │ ├── default.yaml │ │ │ └── none.yaml │ ├── data │ │ ├── __init__.py │ │ ├── cached_datasets.py │ │ ├── curriculum_sorting.py │ │ ├── deduplicate.py │ │ ├── downstream_task_preparation.py │ │ ├── generation_gibbs.py │ │ ├── lmdb_datasets.py │ │ ├── pretraining_preparation.py │ │ ├── tokenizer_preparation.py │ │ └── utils.py │ └── utils.py ├── efficient_training │ ├── __init__.py │ ├── extract_il_losses.py │ ├── layer_drop.py │ ├── stacking.py │ └── test_layer_drop.py ├── eval.py ├── poetry.lock ├── pretrain_bert.py ├── pretrain_bert_rho_loss.py ├── pretrain_bert_sb.py ├── pretrain_bert_sophia.py ├── pyproject.toml ├── rst │ ├── __init__.py │ ├── get_RSTs_from_wandb.py │ └── saved_rsts.py └── validate_bert.py └── t5 ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── lscpu.txt └── nvidia_smi.txt ├── requirements.txt └── t5 ├── __init__.py ├── configs ├── default.yaml ├── local_env │ └── default.yaml └── task │ ├── debug.yaml │ ├── ft.yaml │ └── pt.yaml ├── models ├── __init__.py ├── progressive.py └── t5.py ├── train.py └── utils ├── __init__.py ├── copied.py ├── data.py ├── general.py ├── lion.py ├── logging.py ├── optim.py ├── sni_dataset.py ├── sophia.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # I think for ML projects including the precise python version is useful for 86 | # reproducibility, but maybe I am wrong - Oscar 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # JetBrains 134 | .idea/ 135 | .vscode 136 | 137 | # Experiment outputs 138 | 139 | saved_models 140 | outputs 141 | 142 | tmp.py 143 | *.DS_Store 144 | *.ipynb_checkpoints 145 | 146 | # Ignore generated figures 147 | plots/*.png 148 | plots/*.pdf 149 | 150 | deprecated/jeans_scripts/jean.sh 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # No Train No Gain 2 | 3 | Code for the paper 4 | "[No Train No Gain: Revisiting Efficient Training Algorithms For Transformer-based Language Models](https://arxiv.org/abs/2307.06440)"; 5 | Jean Kaddour, Oscar Key, Piotr Nawrot, Pasquale Minervini, Matt J. Kusner. 6 | 7 | 8 | ## Running the code 9 | See the README for the: 10 | - [BERT experiments](bert/README.md) 11 | - [T5 experiments](t5/README.md) 12 | 13 | ## Citation and license 14 | We use two excellent open source codebases to implement our experiments: 15 | - The BERT experiments are forked from [Cramming](https://github.com/JonasGeiping/cramming) 16 | - The T5 experiments are forked from [NanoT5](https://github.com/PiotrNawrot/nanoT5) 17 | 18 | If you find this repository useful, please consider citing both our work and these original codebases. 19 | 20 | To cite our work, we suggest the following BibTeX: 21 | ``` 22 | @misc{kaddourNoTrainNo2023, 23 | title = {No {Train} {No} {Gain}: {Revisiting} {Efficient} {Training} {Algorithms} {For} {Transformer}-based {Language} {Models}}, 24 | url = {http://arxiv.org/abs/2307.06440}, 25 | doi = {10.48550/arXiv.2307.06440}, 26 | urldate = {2023-07-17}, 27 | publisher = {arXiv}, 28 | author = {Kaddour, Jean and Key, Oscar and Nawrot, Piotr and Minervini, Pasquale and Kusner, Matt J.}, 29 | month = jul, 30 | year = {2023}, 31 | note = {arXiv:2307.06440 [cs]}, 32 | } 33 | ``` 34 | 35 | We provide separate licenses for the [BERT experiments](bert/LICENSE.txt) and the [T5 experiments](t5/LICENSE). 36 | 37 | ## Contact 38 | Feel free to open an issue, or email us, with any questions. 39 | -------------------------------------------------------------------------------- /bert/.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /bert/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The code in this folder is based off the Cramming repository (https://github.com/JonasGeiping/cramming), which is Copyright 2022 Jonas Geiping and released under the MIT license (included below).
2 | 3 | This modified version of the Cramming code is Copyright 2023 Jean Kaddour and Oscar Key and also released under the MIT license. 4 | 5 | ----------- 6 | 7 | MIT License 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 14 | -------------------------------------------------------------------------------- /bert/README.md: -------------------------------------------------------------------------------- 1 | # BERT experiments 2 | The BERT experiments are based off the excellent [Cramming](https://github.com/JonasGeiping/cramming) repository, see [LICENSE.txt](LICENSE.txt). 3 | 4 | ## Environment setup 5 | The project has the following dependencies: 6 | - CUDA toolkit + nvcc 11.7 (required to install [FlashAttention](https://github.com/Dao-AILab/flash-attention)) 7 | - Python 3.10 8 | - [Poetry](https://python-poetry.org/) 9 | 10 | One way to install the dependencies is using Conda and the provided environment file: 11 | - `conda env update -f conda_env.yaml` 12 | - `conda activate ntng_bert` 13 | - `export CUDA_HOME=$CONDA_PREFIX` (this is required so PyTorch finds the correct nvcc version when building FlashAttention) 14 | 15 | Create and activate the Poetry environment: 16 | - Install: `poetry install` 17 | - Activate: `poetry shell` 18 | - Manually install FlashAttention: `pip install --no-build-isolation flash-attn==1.0.9` 19 | 20 | ## Modules 21 | ### Entry points 22 | * `pretrain_bert.py` 23 | * implements the baseline, layer stacking, layer dropping, and Lion 24 | * `pretrain_bert_sb.py` 25 | * modified copy of `pretrain_bert.py` that includes selective backpropagation 26 | * `pretrain_bert_rho_loss.py` 27 | * modified copy of `pretrain_bert.py` that includes RHO-Loss 28 | * requires the irreducible losses to be extracted first (see command below) 29 | * `pretrain_bert_sophia.py` 30 | * modified copy of `pretrain_bert.py` that includes Sophia-G 31 | * `eval.py` 32 | * implements fine-tuning and evaluating a pretrained model 33 | * `validate_bert.py` 34 | * implements validating a pretrained checkpoint on the validation set 35 | 36 | ### Other 37 | * `efficient_training` contains additional code for some of the efficient training methods 38 | * we recommend starting at the entry point scripts to understand how to use the code 39 | * `rst` includes helper code for tracking the Reference System Time (RST) metric 40 | 41 | ## Experiment commands 42 | ### Pre-train 43 | First, download the randomized subset of the C4 dataset from [our archive](https://doi.org/10.5281/zenodo.8279728): 44 | * `wget 
https://zenodo.org/record/8279728/files/c4-subset-random.tar.bz2` 45 | * `mkdir -p outputs/data` 46 | * `tar xvf c4-subset-random.tar.bz2 -C outputs/data/` 47 | 48 | If you would like to use [Weights & Biases](https://wandb.ai/site), configure this in `cramming/config/wandb/default.yaml`. 49 | 50 | #### Dynamic architectures 51 | * Baseline (FP16): 52 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed}` 53 | 54 | * Layer stacking: 55 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} train.stacking.enabled=True` 56 | 57 | * Layer dropping: 58 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} arch.layer_drop.enabled=True` 59 | 60 | #### Batch selection 61 | By default the dataset is the randomized subset of C4, but you can also set `data=minipile` or `data=bookcorpus-wikipedia`. 62 | Minipile and BCWK will be downloaded automatically from Hugging Face at the start of training. 63 | 64 | * Selective backprop: 65 | `python pretrain_bert_sb.py name={name} budget={budget in hours} seed={seed} train.validation_set.fraction=0.2 impl.validate_every_hours=3` 66 | * To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.track_forward_pass_only=false`. 67 | 68 | ##### RHO-loss 69 | To acquire the irreducible losses you can either: 70 | * Download ours: 71 | * `wget https://zenodo.org/record/8279728/files/il_losses_[dataset].tar` where `dataset` is `c4`, `bcwk`, or `mp` 72 | * `mkdir -p outputs/il_losses` 73 | * `tar xvf il_losses_[dataset].tar -C outputs/il_losses` 74 | * Train your own irreducible loss model and extract the losses: 75 | * `python pretrain_bert.py name=il_model budget={budget in hours} train.validation_set.il_model=True train.validation_set.fraction=0.2` 76 | * `python efficient_training/extract_il_losses.py name=il_model` 77 | 78 | Pre-train: `python pretrain_bert_rho_loss.py name={name} budget={budget in hours} seed={seed} data={dataset} train.validation_set.fraction=0.2 impl.validate_every_hours=3 train.rho_loss.il_losses_path={path to irreducible losses for dataset} train.rho_loss.mega_batch_size=3072` 79 | 80 | To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.track_forward_pass_only=false`. 81 | 82 | #### Efficient optimizers 83 | We found Sophia was unstable when using FP16, so for this set of experiments we use BF16. Both optimizers are implemented in `cramming/backend/optimizers/` (`lion_pytorch.py` and `sophiag.py`); a standalone sketch of the Lion update follows the commands below. 84 | 85 | * Baseline (BF16): 86 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16` 87 | 88 | * Lion: `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16 train/optim=lion train.optim.lr={learning rate} train.optim.weight_decay={weight decay}` 89 | 90 | * Sophia: `python pretrain_bert_sophia.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16 train/optim=sophiag train.optim.rho={Sophia rho} train.optim.lr={learning rate} train.optim.weight_decay={weight decay}` 91 | * To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.sophia.free_updates=True`.
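As a rough standalone illustration of the Lion update implemented in `cramming/backend/optimizers/lion_pytorch.py` (the toy model, batch, and hyperparameter values below are made up, and the snippet assumes it is run from the `bert/` directory so that `cramming` is importable):

```python
import torch

from cramming.backend.optimizers.lion_pytorch import Lion

# Toy model and batch, purely for illustration.
model = torch.nn.Linear(16, 2)
optimizer = Lion(model.parameters(), lr=1e-4, betas=(0.9, 0.99), weight_decay=0.1)

inputs = torch.randn(8, 16)
targets = torch.randint(0, 2, (8,))

loss = torch.nn.functional.cross_entropy(model(inputs), targets)
loss.backward()
optimizer.step()  # sign of the interpolated momentum, plus decoupled weight decay
optimizer.zero_grad()
```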
92 | 93 | 94 | 95 | ### Fine tune & evaluate 96 | Fine tune and evaluate a checkpoint using GLUE: 97 | 98 | `python eval.py name={pretrain name} eval=GLUE_sane impl.microbatch_size=16 impl.shuffle_in_dataloader=true seed=0 [impl.mixed_precision_target_dtype=bfloat16 if the checkpoint was trained using BF16 rather than FP16]` 99 | 100 | Fine tune and evaluate a checkpoint using SuperGLUE: 101 | 102 | `python eval.py name={pretrain name} eval=SuperGLUE impl.microbatch_size=16 seed=0 [impl.mixed_precision_target_dtype=bfloat16 if the checkpoint was trained using BF16 rather than FP16]` 103 | -------------------------------------------------------------------------------- /bert/conda_env.yaml: -------------------------------------------------------------------------------- 1 | name: ntng_bert 2 | dependencies: 3 | - python=3.10 4 | - cuda=11.7 5 | - cuda-nvcc=11.7 6 | - cuda-nvvp=11.7 7 | - gxx=11.4.0 8 | - pip=23.2 9 | - pip: 10 | - poetry==1.5.1 11 | channels: 12 | - nvidia 13 | - pytorch 14 | - conda-forge 15 | -------------------------------------------------------------------------------- /bert/cramming/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize cramming""" 2 | 3 | from cramming.architectures import construct_model 4 | from cramming.backend import load_backend 5 | from cramming.data import load_pretraining_corpus, prepare_task_dataloaders 6 | 7 | __all__ = [ 8 | "construct_model", 9 | "load_backend", 10 | "load_pretraining_corpus", 11 | "prepare_task_dataloaders", 12 | ] 13 | 14 | 15 | import hydra 16 | 17 | """Construct interfaces to some cfg folders for use in packaged installations:""" 18 | 19 | 20 | def get_config(overrides=[]): 21 | """Return default hydra config.""" 22 | with hydra.initialize(config_path="config"): 23 | cfg = hydra.compose(config_name="cfg", overrides=overrides) 24 | print(f"Loading default config {cfg.name}.") 25 | return cfg 26 | 27 | 28 | def get_model_config(arch="hf-bert-tiny", overrides=[]): 29 | """Return default hydra config for a given architecture.""" 30 | with hydra.initialize(config_path="config/arch"): 31 | cfg = hydra.compose(config_name=arch, overrides=overrides) 32 | print(f"Loading model configuration {cfg.architecture}.") 33 | return cfg 34 | 35 | 36 | def get_backend_config(backend="torch-default", overrides=[]): 37 | """Return default hydra config for a given backend.""" 38 | with hydra.initialize(config_path="config/impl"): 39 | cfg = hydra.compose(config_name=backend, overrides=overrides) 40 | print(f"Loading backend {cfg.name}.") 41 | return cfg 42 | -------------------------------------------------------------------------------- /bert/cramming/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | """This module handles all questions of model architecture.""" 2 | 3 | from .construction import construct_model 4 | 5 | __all__ = ["construct_model"] 6 | -------------------------------------------------------------------------------- /bert/cramming/architectures/construction.py: -------------------------------------------------------------------------------- 1 | """Interface to construct models.""" 2 | 3 | import logging 4 | 5 | from cramming.utils import is_main_process 6 | 7 | from .fixed_cramlm import construct_fixed_cramlm 8 | from .funnel_transformers import construct_scriptable_funnel 9 | from .huggingface_interface import construct_huggingface_model 10 | from .recurrent_transformers import construct_scriptable_recurrent 11 | from
.sanity_check import SanityCheckforPreTraining 12 | from .scriptable_bert import construct_scriptable_bert 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def construct_model(cfg_arch, vocab_size, downstream_classes=None): 18 | model = None 19 | if cfg_arch.architectures is not None: 20 | # attempt to solve locally 21 | if "ScriptableMaskedLM" in cfg_arch.architectures: 22 | model = construct_scriptable_bert(cfg_arch, vocab_size, downstream_classes) 23 | elif "ScriptableFunnelLM" in cfg_arch.architectures: 24 | model = construct_scriptable_funnel(cfg_arch, vocab_size, downstream_classes) 25 | elif "ScriptableRecurrentLM" in cfg_arch.architectures: 26 | model = construct_scriptable_recurrent(cfg_arch, vocab_size, downstream_classes) 27 | elif "SanityCheckLM" in cfg_arch.architectures: 28 | model = SanityCheckforPreTraining(cfg_arch.width, vocab_size) 29 | elif "FusedCraMLM" in cfg_arch.architectures: 30 | model = construct_fixed_cramlm(cfg_arch, vocab_size, downstream_classes) 31 | 32 | if model is not None: # Return local model arch 33 | num_params = sum([p.numel() for p in model.parameters()]) 34 | if is_main_process(): 35 | log.info(f"Model with architecture {cfg_arch.architectures[0]} loaded with {num_params:,} parameters.") 36 | return model 37 | 38 | try: # else try on HF 39 | model = construct_huggingface_model(cfg_arch, vocab_size, downstream_classes) 40 | num_params = sum([p.numel() for p in model.parameters()]) 41 | if is_main_process(): 42 | log.info(f"Model with config {cfg_arch} loaded with {num_params:,} parameters.") 43 | return model 44 | except Exception as e: 45 | raise ValueError(f"Invalid model architecture {cfg_arch.architectures} given. Error: {e}") 46 | -------------------------------------------------------------------------------- /bert/cramming/architectures/fused_layers.py: -------------------------------------------------------------------------------- 1 | """Pre-Norm / Post-norm / sandwich fused layers with dropout.""" 2 | 3 | from functools import partial 4 | 5 | import torch 6 | from torch.nn.functional import dropout 7 | 8 | 9 | def get_layer_fn(type="pre", prob=0.1, scripting=True, dn=False, drop=False): 10 | if not dn and not drop: 11 | base_train, base_eval = simplified_layer_training, simplified_layer_eval 12 | else: 13 | base_train, base_eval = scaled_layer_training, scaled_layer_eval 14 | if type in ["pre", "post"]: 15 | if scripting: 16 | fn_train, fn_eval = torch.jit.script(base_train), torch.jit.script(base_eval) 17 | else: 18 | fn_train, fn_eval = base_train, base_eval 19 | return partial(fn_train, prob=prob), partial(fn_eval, prob=prob) 20 | elif type == "sandwich": 21 | return torch.jit.script(sandwich_layer_structure) if scripting else sandwich_layer_structure 22 | else: 23 | raise ValueError("Invalid layer type.") 24 | 25 | 26 | def layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 27 | return states * alpha + residual_scale * dropout(outputs, p=prob, training=training) 28 | 29 | 30 | def scaled_layer_training(states, outputs, alpha, residual_scale, prob: float = 0.1): 31 | return layer_structure(states, outputs, alpha, residual_scale, prob, training=True) 32 | 33 | 34 | def scaled_layer_eval(states, outputs, alpha, residual_scale, prob: float = 0.1): 35 | return layer_structure(states, outputs, alpha, residual_scale, prob, training=False) 36 | 37 | 38 | def sandwich_layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 39 | states = states 
* alpha + residual_scale * outputs 40 | return states 41 | 42 | 43 | def simplified_layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 44 | return states + dropout(outputs, p=prob, training=training) 45 | 46 | 47 | def simplified_layer_training(states, outputs, alpha, residual_scale, prob: float = 0.1): 48 | return simplified_layer_structure(states, outputs, alpha, residual_scale, prob, training=True) 49 | 50 | 51 | def simplified_layer_eval(states, outputs, alpha, residual_scale, prob: float = 0.1): 52 | return simplified_layer_structure(states, outputs, alpha, residual_scale, prob, training=False) 53 | -------------------------------------------------------------------------------- /bert/cramming/architectures/huggingface_interface.py: -------------------------------------------------------------------------------- 1 | """BERT variations based on the huggingface implementation.""" 2 | 3 | import transformers 4 | from omegaconf import OmegaConf 5 | 6 | 7 | def construct_huggingface_model(cfg_arch, vocab_size, downstream_classes=None): 8 | """construct model from given configuration. Only works if this arch exists on the hub.""" 9 | if downstream_classes is None: 10 | if isinstance(cfg_arch, transformers.PretrainedConfig): 11 | configuration = cfg_arch 12 | else: 13 | configuration = transformers.BertConfig(**cfg_arch) 14 | configuration.vocab_size = vocab_size 15 | model = transformers.AutoModelForMaskedLM.from_config(configuration) 16 | model.vocab_size = model.config.vocab_size 17 | else: 18 | if isinstance(cfg_arch, transformers.PretrainedConfig): 19 | configuration = cfg_arch 20 | configuration.num_labels = downstream_classes 21 | else: 22 | configuration = OmegaConf.to_container(cfg_arch) 23 | configuration = transformers.BertConfig(**configuration, num_labels=downstream_classes) 24 | configuration.vocab_size = vocab_size 25 | model = transformers.AutoModelForSequenceClassification.from_config(configuration) 26 | model.vocab_size = vocab_size 27 | return model 28 | -------------------------------------------------------------------------------- /bert/cramming/architectures/sanity_check.py: -------------------------------------------------------------------------------- 1 | """Sanity Check architecture.""" 2 | from typing import Optional 3 | 4 | import torch 5 | 6 | 7 | class SanityCheckforPreTraining(torch.nn.Module): 8 | """Make big go fast.""" 9 | 10 | def __init__(self, width, vocab_size): 11 | super().__init__() 12 | self.word_embedding = torch.nn.Embedding(vocab_size, width, padding_idx=0) 13 | self.transform = torch.nn.Linear(width, width, bias=False) 14 | 15 | def forward(self, input_ids, attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None): 16 | embeds = self.word_embedding(input_ids) 17 | outputs = self.transform(embeds) 18 | loss = outputs.mean() 19 | return dict(outputs=outputs, loss=loss) 20 | -------------------------------------------------------------------------------- /bert/cramming/backend/__init__.py: -------------------------------------------------------------------------------- 1 | """This module implements interfaces to the various backends.""" 2 | 3 | from .prepare_backend import load_backend 4 | from .utils import prepare_pretraining_dataloader 5 | 6 | __all__ = ["load_backend"] 7 | -------------------------------------------------------------------------------- /bert/cramming/backend/deepspeed_integration.py: 
-------------------------------------------------------------------------------- 1 | """(Hopefully) seamless integration of deepspeed.""" 2 | import json 3 | import logging 4 | import os 5 | from functools import partial 6 | 7 | import torch 8 | from omegaconf import OmegaConf 9 | 10 | from .optimizers import get_schedule_fn 11 | from .utils import ( 12 | group_parameters, 13 | prepare_pretraining_dataloader, 14 | torchdynamo_compile_method, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | _default_setup = dict(device=torch.device("cpu"), dtype=torch.float) 19 | 20 | 21 | """Todo: 22 | * integrate batch size ramping via 23 | https://deepspeed.readthedocs.io/en/latest/pipeline.html#deepspeed.runtime.pipe.engine.PipelineEngine.set_train_batch_size 24 | """ 25 | 26 | 27 | def initialize_deepspeed(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=_default_setup): 28 | """Initialize deepspeed. Module is imported lazily here.""" 29 | import deepspeed 30 | 31 | if cfg_impl.jit == "trace": 32 | # This variant is very experimental... 33 | input_setup = dict(dtype=torch.long, device=setup["device"]) 34 | templates = torch.randint(0, model.vocab_size, (*cfg_impl.trace_shape,), **input_setup) 35 | labels = torch.randint(0, model.vocab_size, (*cfg_impl.trace_shape,), **input_setup) 36 | 37 | model.to(**setup) 38 | model.kwargs_forward = model.forward 39 | model.forward = lambda input_ids, labels: model.kwargs_forward(input_ids=input_ids, labels=labels) 40 | model = torch.jit.trace(model, (templates, labels), strict=False) 41 | elif cfg_impl.jit == "script": 42 | # This does not work for huggingface models 43 | model = torch.jit.script(model) 44 | 45 | model_engine, optimizer, dataloader, scheduler = deepspeed.initialize( 46 | config=OmegaConf.to_container(cfg_impl, resolve=True), 47 | model=model, 48 | model_parameters=group_parameters(model, cfg_train), 49 | lr_scheduler=get_schedule_fn(cfg_train), 50 | # training_data=dataset, # handle this natively 51 | # collate_fn=collate_fn, 52 | ) 53 | # Monkey-patch checkpointing 54 | model_engine.save_training_checkpoint = partial(save_training_checkpoint, self=model_engine) 55 | model_engine.save_final_model = partial(save_final_model, model_engine) 56 | # And more methods 57 | model_engine.gradinit = partial(gradinit, self=model_engine) 58 | model_engine.to_device = lambda batch: to_device(self=model_engine, batch=batch, keys=["input_ids", "labels"]) 59 | 60 | model_engine.setup = setup 61 | model_engine.record_batch_size = lambda: cfg_train.batch_size 62 | model_engine.record_tokens_per_step = lambda: tokenizer.model_max_length * cfg_impl.microbatch_size 63 | 64 | def step(self, batch): 65 | loss = self.forward(**batch)["loss"] 66 | self.backward(loss) 67 | self.optimizer_step() 68 | return loss.detach() 69 | 70 | model_engine.step = lambda batch: torchdynamo_compile_method(step, cfg_impl.optimizer_context)(self=model_engine, batch=batch) 71 | 72 | if dataset is not None: 73 | dataloader = prepare_pretraining_dataloader(dataset, tokenizer, cfg_train, cfg_impl) 74 | validation_dataloader = ( 75 | prepare_pretraining_dataloader(validation_set, tokenizer, cfg_train, cfg_impl, is_validation=True) 76 | if validation_set is not None 77 | else None 78 | ) 79 | else: 80 | dataloader = None 81 | validation_dataloader = None 82 | # dataloader = deepspeed.RepeatingLoader(dataloader) 83 | return model_engine, optimizer, scheduler, dataloader, validation_dataloader 84 | 85 | 86 | def save_training_checkpoint(self, identifier, 
directory="checkpoints", state=None): 87 | """Path, identifier and additional client state. This checkpoint can be used to resume training. 88 | The default behavior is to save this checkpoint relative to the training working directory. 89 | """ 90 | self.save_checkpoint(directory, identifier, client_state=state) 91 | 92 | 93 | def save_final_model(self, base_directory, identifier, tokenizer, cfg_arch, dryrun=False): 94 | """This checkpoint can be used for downstream tasks. 95 | The default behavior is to save this checkpoint to a checkpoints folder under base_directory/name/checkpoints""" 96 | try: 97 | identifier_str = f"{identifier:2.4f}" 98 | except ValueError: 99 | identifier_str = str(identifier) 100 | full_path = os.path.join(base_directory, "checkpoints", identifier_str) 101 | os.makedirs(full_path, exist_ok=True) 102 | # This saves tokenizer_config.json, tokenizer.json and special_tokens_map.json to this folder 103 | if not dryrun: 104 | tokenizer.save_pretrained(full_path) 105 | # Save model.pth, model_config.json 106 | self.save_checkpoint(full_path, "model") 107 | with open(os.path.join(full_path, "model_config.json"), "w") as file: 108 | json.dump(OmegaConf.to_container(cfg_arch, resolve=True), file) 109 | 110 | 111 | def gradinit(self, dataloader, config): 112 | raise ValueError("GradInit not implemented for deepspeed.") 113 | 114 | 115 | def to_device(self, batch, keys=["input_ids", "labels"]): 116 | """Move batch of data into device memory.""" 117 | return { 118 | k: v.to(device=self.setup["device"], dtype=torch.long, non_blocking=True) 119 | for k, v in batch.items() 120 | if k in keys # Add more keywords here if needed 121 | } 122 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .adahessian import Adahessian 2 | from .lion_pytorch import Lion 3 | from .optimizer_modifiers import LARS, SAM 4 | from .progressive_batching import ProgressiveBatching 5 | from .schedulers import get_schedule_fn 6 | from .shampoo import Shampoo 7 | from .sophiag import SophiaG 8 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/adamw_scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Iterable, Tuple 3 | 4 | import torch 5 | from torch import nn 6 | from torch.optim import Optimizer 7 | 8 | 9 | class AdamWScale(Optimizer): 10 | """ 11 | This AdamW implementation is copied from Huggingface. 12 | We modified it with Adagrad scaling by rms of a weight tensor 13 | 14 | Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay 15 | Regularization](https://arxiv.org/abs/1711.05101). 16 | 17 | Parameters: 18 | params (`Iterable[nn.parameter.Parameter]`): 19 | Iterable of parameters to optimize or dictionaries defining parameter groups. 20 | lr (`float`, *optional*, defaults to 1e-3): 21 | The learning rate to use. 22 | betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)): 23 | Adam's betas parameters (b1, b2). 24 | eps (`float`, *optional*, defaults to 1e-6): 25 | Adam's epsilon for numerical stability. 26 | weight_decay (`float`, *optional*, defaults to 0): 27 | Decoupled weight decay to apply. 
28 | correct_bias (`bool`, *optional*, defaults to `True`): 29 | Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`). 30 | no_deprecation_warning (`bool`, *optional*, defaults to `False`): 31 | A flag used to disable the deprecation warning (set to `True` to disable the warning). 32 | """ 33 | 34 | def __init__( 35 | self, 36 | params: Iterable[nn.parameter.Parameter], 37 | lr: float = 1e-3, 38 | betas: Tuple[float, float] = (0.9, 0.999), 39 | eps: float = 1e-6, 40 | weight_decay: float = 0.0, 41 | correct_bias: bool = True, 42 | ): 43 | if lr < 0.0: 44 | raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") 45 | if not 0.0 <= betas[0] < 1.0: 46 | raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)") 47 | if not 0.0 <= betas[1] < 1.0: 48 | raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)") 49 | if not 0.0 <= eps: 50 | raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0") 51 | defaults = dict( 52 | lr=lr, 53 | betas=betas, 54 | eps=eps, 55 | weight_decay=weight_decay, 56 | correct_bias=correct_bias, 57 | ) 58 | super().__init__(params, defaults) 59 | 60 | @staticmethod 61 | def _rms(tensor): 62 | return tensor.norm(2) / (tensor.numel() ** 0.5) 63 | 64 | def step(self, closure=None): 65 | """ 66 | Performs a single optimization step. 67 | 68 | Arguments: 69 | closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. 70 | """ 71 | loss = None 72 | if closure is not None: 73 | loss = closure() 74 | 75 | for group in self.param_groups: 76 | for p in group["params"]: 77 | if p.grad is None: 78 | continue 79 | grad = p.grad.data 80 | if grad.is_sparse: 81 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 82 | 83 | state = self.state[p] 84 | beta1, beta2 = group["betas"] 85 | 86 | # State initialization 87 | if len(state) == 0: 88 | state["step"] = 0 89 | # Exponential moving average of gradient values 90 | state["exp_avg"] = torch.zeros_like(p.data) 91 | # Exponential moving average of squared gradient values 92 | state["exp_avg_sq"] = torch.zeros_like(p.data) 93 | 94 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 95 | 96 | state["step"] += 1 97 | 98 | # Decay the first and second moment running average coefficient 99 | # In-place operations to update the averages at the same time 100 | exp_avg.mul_(beta1) 101 | exp_avg.add_(grad, alpha=(1.0 - beta1)) 102 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 103 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 104 | 105 | step_size = group["lr"] 106 | if group["correct_bias"]: # No bias correction for Bert 107 | bias_correction1 = 1.0 - beta1 ** state["step"] 108 | bias_correction2 = 1.0 - beta2 ** state["step"] 109 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 110 | 111 | # /Adapt Step from Adagrad 112 | step_size = step_size * max(1e-3, self._rms(p.data)) 113 | # /Adapt Step from Adagrad 114 | 115 | p.data.addcdiv_(exp_avg, denom, value=-step_size) 116 | 117 | # Just adding the square of the weights to the loss function is *not* 118 | # the correct way of using L2 regularization/weight decay with Adam, 119 | # since that will interact with the m and v parameters in strange ways. 120 | # 121 | # Instead we want to decay the weights in a manner that doesn't interact 122 | # with the m/v parameters. 
This is equivalent to adding the square 123 | # of the weights to the loss with plain (non-momentum) SGD. 124 | # Add weight decay at the end (fixed version) 125 | if group["weight_decay"] > 0.0: 126 | p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"])) 127 | 128 | return loss 129 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/lion_pytorch.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional, Tuple 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | # functions 7 | 8 | 9 | def exists(val): 10 | return val is not None 11 | 12 | 13 | # update functions 14 | 15 | 16 | def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2): 17 | # stepweight decay 18 | 19 | p.data.mul_(1 - lr * wd) 20 | 21 | # weight update 22 | 23 | update = exp_avg.clone().mul_(beta1).add(grad, alpha=1 - beta1).sign_() 24 | p.add_(update, alpha=-lr) 25 | 26 | # decay the momentum running average coefficient 27 | 28 | exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2) 29 | 30 | 31 | # class 32 | 33 | 34 | class Lion(Optimizer): 35 | def __init__( 36 | self, params, lr: float = 1e-4, betas: Tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, use_triton: bool = False 37 | ): 38 | assert lr > 0.0 39 | assert all([0.0 <= beta <= 1.0 for beta in betas]) 40 | 41 | defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) 42 | 43 | super().__init__(params, defaults) 44 | 45 | self.update_fn = update_fn 46 | 47 | if use_triton: 48 | from lion_pytorch.triton import update_fn as triton_update_fn 49 | 50 | self.update_fn = triton_update_fn 51 | 52 | @torch.no_grad() 53 | def step(self, closure: Optional[Callable] = None): 54 | 55 | loss = None 56 | if exists(closure): 57 | with torch.enable_grad(): 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | for p in filter(lambda p: exists(p.grad), group["params"]): 62 | 63 | grad, lr, wd, beta1, beta2, state = p.grad, group["lr"], group["weight_decay"], *group["betas"], self.state[p] 64 | 65 | # init state - exponential moving average of gradient values 66 | 67 | if len(state) == 0: 68 | state["exp_avg"] = torch.zeros_like(p) 69 | 70 | exp_avg = state["exp_avg"] 71 | 72 | self.update_fn(p, grad, exp_avg, lr, wd, beta1, beta2) 73 | 74 | return loss 75 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Optimizers 2 | We want to make contributing to this project as easy and transparent as 3 | possible. Our goal is to provide a repo that promotes optimizer research 4 | and development separate from the official PyTorch library. Please only 5 | create pull requests for improving existing optimizers in the repo; new 6 | optimizers should be created in a separate public repo. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests for existing optimizers. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * Please maintain a consistent style with the rest of the code 36 | 37 | ## License 38 | By contributing to Optimizers, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Optimizers software 4 | 5 | Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Meta nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/README.md: -------------------------------------------------------------------------------- 1 | # Optimizers 2 | 3 | *Copyright (c) Meta Platforms, Inc. and affiliates. 4 | All rights reserved.* 5 | 6 | ## Description 7 | Optimizers is a Github repository of PyTorch optimization algorithms. It is designed for external collaboration and development. 8 | 9 | Currently includes the optimizers: 10 | - Distributed Shampoo 11 | 12 | See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. 13 | 14 | ## License 15 | Optimizers is BSD licensed, as found in the LICENSE file. 16 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .shampoo import Shampoo 2 | -------------------------------------------------------------------------------- /bert/cramming/backend/prepare_backend.py: -------------------------------------------------------------------------------- 1 | """Instantiate backend objects in a congruent format.""" 2 | import torch 3 | 4 | from .deepspeed_integration import initialize_deepspeed 5 | from .torch_default import initialize_torch 6 | 7 | _default_setup = dict(device=torch.device("cpu"), dtype=torch.float) 8 | 9 | 10 | def load_backend(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=_default_setup): 11 | if cfg_impl.name == "torch-default": 12 | return initialize_torch(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=setup) 13 | elif cfg_impl.name == "deepspeed": 14 | return initialize_deepspeed(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=setup) 15 | else: 16 | raise ValueError(f"Invalid backend {cfg_impl.name} given.") 17 | -------------------------------------------------------------------------------- /bert/cramming/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/arch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/arch/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-base.yaml: 
-------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | 
layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c3.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c4.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELUglu 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 4 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: False 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: False # Whether to learn biases on all dense layers 59 | final_norm: True # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c5.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 16 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELUglu 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: flash-attention-impl 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: False 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: False 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop: 58 | enabled: False # If true, then layers will be dynaically dropped. 59 | max_theta: 0.5 # The maximum probability of keeping a dropping, when the drop schedule is at the end. 
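The layer_drop block above (enabled, max_theta, and the gamma_factor that follows) configures dynamic layer dropping. As an illustrative sketch only — the actual schedule lives in the training code, and the formula below is an assumption in the spirit of progressive layer dropping rather than the repository's implementation — the keep probability can start at 1.0 and decay toward max_theta as training progresses, with gamma_factor setting how quickly that floor is reached:

import math

def keep_probability(progress: float, max_theta: float = 0.5, gamma_factor: float = 100.0) -> float:
    # Assumed progressive-layer-dropping-style schedule: keep every layer at the start
    # (probability 1.0) and decay toward max_theta as progress runs from 0 to 1.
    return (1.0 - max_theta) * math.exp(-gamma_factor * progress) + max_theta

for p in (0.0, 0.01, 0.05, 1.0):
    print(f"progress={p:.2f}  keep_prob={keep_probability(p):.3f}")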
60 | gamma_factor: 100 61 | use_bias: False # Whether to learn biases on all dense layers 62 | final_norm: True # Add a final norm layer before the end 63 | recurrent_layers: 64 | layer_macro_type: transformer # can also be FLASH 65 | 66 | # Downstream settings: 67 | num_labels: # This can be automatically filled in for downstream 68 | classification_head: 69 | pooler: avg 70 | include_ff_layer: True 71 | head_dim: 1024 72 | nonlin: Tanh 73 | classifier_dropout: ${arch.hidden_dropout_prob} 74 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-i4.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # based on amp_b1536_L8_H1088_I4352_H8 4 | 5 | # These are the huggingface bert parameters 6 | architectures: 7 | - ScriptableMaskedLM 8 | 9 | num_transformer_layers: 8 10 | hidden_size: 1088 11 | intermed_size: 4352 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-12 16 | norm_scheme: pre # maybe post is actually better?? 17 | nonlin: GELU # glu? 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: False # Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | z_loss_factor: 0 24 | gradient_checkpointing: False 25 | layer_fusion: True # Fuse transformer layer residual structure 26 | 27 | embedding: 28 | vocab_size: # will be populated automatically 29 | pos_embedding: scaled-sinusoidal 30 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 31 | pad_token_id: 0 32 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 33 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 34 | normalization: True 35 | 36 | attention: 37 | type: self-attention 38 | causal_attention: False 39 | num_attention_heads: 8 40 | dropout_prob: 0.1 41 | skip_output_projection: False 42 | qkv_bias: False 43 | 44 | rotary_embedding: True 45 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 46 | sequence_op: torch-softmax # Can be normalization 47 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 48 | high_level_fusion: True 49 | low_level_fusion: True 50 | 51 | init: 52 | type: normal 53 | std: 0.02 54 | 55 | # Very experimental options: 56 | ffn_layer_frequency: 1 # FFN layer in every layer 57 | deepnorm_scaling: False 58 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 59 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 60 | use_bias: False # Whether to learn biases on all dense layers 61 | final_norm: True # Add a final norm layer before the end 62 | recurrent_layers: 63 | layer_macro_type: transformer # can also be FLASH 64 | 65 | # Downstream settings: 66 | num_labels: # This can be automatically filled in for downstream 67 | classification_head: 68 | pooler: avg 69 | include_ff_layer: True 70 | head_dim: 1024 71 | nonlin: Tanh 72 | classifier_dropout: ${arch.hidden_dropout_prob} 73 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-large-izsak.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 24 8 | hidden_size: 1024 9 | intermed_size: 4096 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 16 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-original.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baselin 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: False # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: False # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 512 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (this is the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be other normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: False 47 | low_level_fusion: False 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer 62 | 63 | # 
Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-tiny.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 4 8 | hidden_size: 384 9 | intermed_size: 1024 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: 96 # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: small 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/funnel-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableFunnelLM 6 | 7 | setup: [128, 64, 32, 16, 8, 4, 2, 4, 8, 16, 32, 64, 128] 8 | num_transformer_layers: 12 9 | 10 | hidden_size: 768 11 | intermed_size: 3072 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-6 16 | norm_scheme: pre # can be "pre", "post", "sandwich" 17 | nonlin: GELU 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: True # Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | z_loss_factor: 0 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: funnel 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 55 | use_bias: True # Whether to learn biases on all dense layers 56 | final_norm: False # Add a final norm layer before the end 57 | 58 | # Downstream settings: 59 | num_labels: # This can be automatically filled in for downstream 60 | classification_head: 61 | pooler: avg 62 | include_ff_layer: True 63 | head_dim: 1024 64 | nonlin: Tanh 65 | classifier_dropout: ${arch.hidden_dropout_prob} 66 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/hf-bert-base.yaml: -------------------------------------------------------------------------------- 1 | # These are the huggingface bert parameters 2 | architectures: 3 | - BertForMaskedLM 4 | 5 | attention_probs_dropout_prob: 0.1 6 | hidden_act: gelu 7 | hidden_dropout_prob: 0.1 8 | hidden_size: 768 9 | initializer_range: 0.02 10 | intermediate_size: 3072 11 | layer_norm_eps: 1e-12 12 | max_position_embeddings: 512 13 | num_attention_heads: 12 14 | num_hidden_layers: 12 15 | pad_token_id: 0 16 | position_embedding_type: absolute 17 | 18 | type_vocab_size: 2 19 | use_cache: true 20 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/hf-bert-tiny.yaml: -------------------------------------------------------------------------------- 1 | # These are the huggingface bert parameters 2 | architectures: 3 | - BertForMaskedLM 4 | 5 | attention_probs_dropout_prob: 0.1 6 | hidden_act: gelu 7 | hidden_dropout_prob: 0.1 8 | hidden_size: 128 9 | initializer_range: 0.02 10 | intermediate_size: 512 11 | layer_norm_eps: 1e-12 12 | max_position_embeddings: 512 13 | num_attention_heads: 2 14 | num_hidden_layers: 2 15 | pad_token_id: 0 16 | position_embedding_type: absolute 17 | 18 | type_vocab_size: 2 19 | use_cache: true 20 | # original bert-tiny hparams from https://github.com/google-research/bert: 21 | # {"hidden_size": 128, 22 | # "hidden_act": "gelu", 23 | # "initializer_range": 0.02, 24 | # "vocab_size": 30522, 25 | # "hidden_dropout_prob": 0.1, 26 | # "num_attention_heads": 2, 27 | # "type_vocab_size": 2, 28 | # "max_position_embeddings": 512, 29 | # "num_hidden_layers": 2, 30 | # "intermediate_size": 512, 31 | # "attention_probs_dropout_prob": 0.1} 32 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/recurrent-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableRecurrentLM 6 | 7 | training_scheme: bptt-deepthinking 8 | maximal_recurrence: 12 9 | recurrent_layers: 2 # How deep is the block of transformer layers that is recurring 10 | hidden_size: 768 11 | intermed_size: 3072 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-6 16 | norm_scheme: pre # can be "pre", "post", "sandwich" 17 | nonlin: GELU 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: True 
# Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | 24 | layer_fusion: True # Fuse transformer layer residual structure 25 | 26 | embedding: 27 | vocab_size: # will be populated automatically 28 | pos_embedding: scaled-sinusoidal 29 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 30 | pad_token_id: 0 31 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 32 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 33 | normalization: True 34 | 35 | attention: 36 | type: self-attention 37 | causal_attention: False 38 | num_attention_heads: 12 39 | dropout_prob: 0.1 40 | skip_output_projection: False 41 | qkv_bias: True 42 | 43 | rotary_embedding: True 44 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 45 | sequence_op: torch-softmax # Can be normalization 46 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 47 | high_level_fusion: True 48 | low_level_fusion: True 49 | 50 | init: 51 | type: normal 52 | std: 0.02 53 | 54 | # Very experimental options: 55 | ffn_layer_frequency: 1 # FFN layer in every layer 56 | deepnorm_scaling: False 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 59 | use_bias: True # Whether to learn biases on all dense layers 60 | 61 | # Downstream settings: 62 | num_labels: # This can be automatically filled in for downstream 63 | classification_head: 64 | pooler: avg 65 | include_ff_layer: True 66 | head_dim: 1024 67 | nonlin: Tanh 68 | classifier_dropout: ${arch.hidden_dropout_prob} 69 | 70 | num_transformer_layers: ${arch.maximal_recurrence} # only for compatibility with other archs 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/sanitycheck.yaml: -------------------------------------------------------------------------------- 1 | architectures: 2 | - SanityCheckLM 3 | 4 | width: 8352 5 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_eval.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | defaults: 4 | - impl: torch-default 5 | - wandb: default 6 | - eval: mnli 7 | - _self_ 8 | - override hydra/job_logging: custom 9 | 10 | wandb: 11 | project: cramming-eval 12 | 13 | base_dir: outputs 14 | hydra: 15 | sweep: 16 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 17 | run: 18 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 19 | job: 20 | chdir: True 21 | 22 | seed: # Optional: Set initial seed 23 | 24 | # A name for this run [will draw the checkpoint from runs with this name 25 | # and use this name for the summary table and outputs folder] 26 | name: default 27 | # If set, override the name on wandb. Otherwise, uses name above. 
28 | wandb_name: 29 | 30 | # debug implementation by running every loop just once: 31 | dryrun: False 32 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_eval_pt.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | defaults: 4 | - arch: bert-c5 5 | - data: c4-subset-random #bookcorpus-wikipedia 6 | - impl: torch-default 7 | - wandb: default 8 | - eval: save_losses_rho_loss 9 | - train: bert-o3 10 | - _self_ 11 | - override hydra/job_logging: custom 12 | 13 | wandb: 14 | project: cramming-eval 15 | 16 | base_dir: outputs 17 | hydra: 18 | sweep: 19 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | run: 21 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 22 | job: 23 | chdir: True 24 | 25 | seed: 0 # Optional: Set initial seed 26 | 27 | # A name for this run [will draw the checkpoint from runs with this name 28 | # and use this name for the summary table and outputs folder] 29 | name: default 30 | budget: 96 31 | # debug implementation by running every loop just once: 32 | dryrun: False 33 | 34 | train: 35 | validation_set: 36 | enabled: true 37 | fraction: 0.001 38 | 39 | truncate_dataset: 0 -------------------------------------------------------------------------------- /bert/cramming/config/cfg_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | # default settings run a sanity check with a small model and test data. 4 | defaults: 5 | - arch: bert-c5 6 | - data: c4-subset-random #bookcorpus-wikipedia 7 | - impl: torch-default 8 | - wandb: default 9 | - train: bert-o3 10 | - _self_ 11 | - override hydra/job_logging: custom 12 | 13 | base_dir: outputs 14 | hydra: 15 | sweep: 16 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 17 | run: 18 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 19 | job: 20 | chdir: True 21 | 22 | seed: 0 # Optional: Set initial seed 23 | name: default # A name for this run [will be used for the summary table and outputs folder] 24 | 25 | # If a number, then the total compute budget in hours. If "steps", then instead train 26 | # for the number of steps given by train.steps. 27 | budget: 24 28 | # debug implementation by running every loop just once: 29 | dryrun: False 30 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_save_losses.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | # default settings run a sanity check with a small model and test data. 
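The cfg_* files above are the Hydra entry points: each defaults: list pulls one file from every config group (arch, data, impl, train, eval, wandb), and ${...} references such as ${arch.hidden_size} or ${base_dir}/${name} are resolved by OmegaConf when the composed config is accessed. A minimal sketch of that interpolation behaviour with plain OmegaConf — the nested values below are abbreviated stand-ins for the real group files, and the ${now:...} resolver used in the hydra output dirs is omitted because it is registered by Hydra itself:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "base_dir": "outputs",
    "name": "default",
    "arch": {
        "hidden_size": 768,
        "hidden_dropout_prob": 0.1,
        "classification_head": {
            "head_dim": "${arch.hidden_size}",                 # same pattern as in the arch configs
            "classifier_dropout": "${arch.hidden_dropout_prob}",
        },
    },
    "out_dir": "${base_dir}/${name}/pretrain",                 # simplified version of the hydra run dir
})

print(cfg.arch.classification_head.head_dim)   # -> 768, resolved through the interpolation
print(cfg.out_dir)                             # -> outputs/default/pretrain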
4 | defaults: 5 | - arch: bert-c5 6 | - data: c4-subset-random #bookcorpus-wikipedia 7 | - impl: save_losses_rho_loss 8 | - wandb: default 9 | - eval: save_losses_rho_loss 10 | - train: bert-o3 11 | - _self_ 12 | - override hydra/job_logging: custom 13 | 14 | base_dir: outputs 15 | hydra: 16 | sweep: 17 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 18 | run: 19 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | job: 21 | chdir: True 22 | 23 | seed: 0 # Optional: Set initial seed 24 | name: rho_loss_save_losses # A name for this run [will be used for the summary table and outputs folder] 25 | budget: 24 26 | 27 | # debug implementation by running every loop just once: 28 | dryrun: False 29 | -------------------------------------------------------------------------------- /bert/cramming/config/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/data/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/data/bert-default.yaml: -------------------------------------------------------------------------------- 1 | # This is the "default" BERT dataset 2 | name: bookcorpus-wikitext 3 | defaults: 4 | - sources: 5 | - bookcorpus 6 | - wikipedia 7 | 8 | # Preprocessing 9 | normalizer: # This is ignored and the default bert normalizer is used instead 10 | force_lowercase: # True 11 | strip_accents: # True 12 | force_english_keyboard: # False 13 | whitespace_escape: # False 14 | tokenizer: bert-base-uncased 15 | vocab_size: 30522 16 | 17 | # Dataset Formation 18 | seq_length: 512 19 | include_cls_token_in_corpus: # True, but ignored and the default post_processor is used 20 | include_sep_token_in_corpus: # True, but ignored and the default post_processor is used 21 | use_type_ids: # True 22 | max_entries_in_raw_dataset: 1e14 # Select no more than this number of examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e14 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/bookcorpus-wikipedia.yaml: -------------------------------------------------------------------------------- 1 | # This is a modernized/sanitized config for bookcorpus-wikipedia 2 | name: bookcorpus-wikitext 3 | defaults: 4 | - sources: 5 | - bookcorpus 6 | - wikipedia 7 | 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset-processed.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset 3 | defaults: 4 | - sources: 5 | - c4 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: True 30 | trash_cutoff: 0.25 31 | deduplicate_entries: True 32 | deduplication_threshold: 75 33 | 34 | # Data Order: 35 | ordering: sentence-length-curriculum # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset-random.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset-random 3 | defaults: 4 | - sources: 5 | - c4_non_streaming 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.25 31 | deduplicate_entries: False 32 | deduplication_threshold: 75 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset 3 | defaults: 4 | - sources: 5 | - c4 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/minipile.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: minipile 3 | defaults: 4 | - sources: 5 | - minipile 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sanity-check-1.yaml: -------------------------------------------------------------------------------- 1 | # Just a bunch of fake data ... 
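The max_seq_in_tokenized_dataset caps in the data configs above follow the rule of thumb repeated in their comments: tokenize only slightly more than one compute budget can consume in a single epoch. A quick back-of-the-envelope check of what the 85e6 cap in c4-subset-random implies (illustrative arithmetic; the throughput that falls out is not a measured number):

budget_hours = 24      # default budget in cfg_pretrain.yaml
seq_cap = 85e6         # max_seq_in_tokenized_dataset in c4-subset-random.yaml
seq_length = 128

implied_seqs_per_sec = seq_cap / (budget_hours * 3600)
print(f"cap of {seq_cap:,.0f} sequences -> ~{implied_seqs_per_sec:,.0f} seq/s "
      f"(~{implied_seqs_per_sec * seq_length / 1e3:.0f}k tokens/s) over a {budget_hours}h budget")
# ~984 seq/s, i.e. the cap is sized for roughly what one 24-hour budget can actually train on.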
2 | name: sanity-check-1 3 | defaults: 4 | - sources: 5 | - fake 6 | 7 | # 8 | # Preprocessing 9 | normalizer: # This is ignored and the default bert normalizer is used instead 10 | force_lowercase: 11 | strip_accents: 12 | force_english_keyboard: 13 | whitespace_escape: 14 | tokenizer: bert-base-uncased 15 | vocab_size: 30522 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: 20 | include_sep_token_in_corpus: 21 | use_type_ids: 22 | max_entries_in_raw_dataset: 1e12 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e12 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sanity-check-2.yaml: -------------------------------------------------------------------------------- 1 | # Just a tiny test dataset ... 2 | name: sanity-check-2 3 | # https://hydra.cc/docs/patterns/select_multiple_configs_from_config_group/ 4 | defaults: 5 | - sources: 6 | - ag_news 7 | 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: BPE # faster for sanity checks 15 | vocab_size: 32768 # to make sure there are not memory surprises compared to the actual data 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: False 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e10 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/ag_news.yaml: -------------------------------------------------------------------------------- 1 | # For sanity testing 2 | ag_news: 3 | provider: huggingface 4 | partition: default 5 | split: train 6 | 7 | streaming: False 8 | 9 | remove_columns: label 10 | concatenate_successive_entries: 0 11 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/bookcorpus.yaml: -------------------------------------------------------------------------------- 1 | # The bookcorpus dataset, drawn from it huggingface mirror 2 | bookcorpus: 3 | provider: huggingface 4 | partition: plain_text 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 
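Each file under data/sources describes one raw corpus. For provider: huggingface entries, the partition / split / streaming fields map directly onto datasets.load_dataset arguments; a hedged sketch of that mapping (the repository's own loader layers cleaning, column removal and entry concatenation on top, all omitted here):

from typing import Optional

from datasets import load_dataset

def load_source(name: str, partition: Optional[str], split: str, streaming: bool):
    # "partition" is the dataset config name ("plain_text" for bookcorpus, "en" for c4,
    # "20220301.en" for wikipedia); None or empty means the dataset's default config.
    if partition:
        return load_dataset(name, partition, split=split, streaming=streaming)
    return load_dataset(name, split=split, streaming=streaming)

news = load_source("ag_news", None, "train", streaming=False)  # the sanity-check source above
print(next(iter(news))["text"][:80])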
10 | remove_columns: 11 | concatenate_successive_entries: 16 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/c4.yaml: -------------------------------------------------------------------------------- 1 | # The C4 en dataset, drawn from its huggingface mirror 2 | c4: 3 | provider: huggingface 4 | partition: en 5 | split: train 6 | 7 | streaming: True 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/c4_non_streaming.yaml: -------------------------------------------------------------------------------- 1 | # The C4 en dataset, drawn from its huggingface mirror 2 | c4: 3 | provider: huggingface 4 | partition: en 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/fake.yaml: -------------------------------------------------------------------------------- 1 | # Just a bunch of fake data ... 2 | fake: 3 | provider: fake 4 | split: 5 | 6 | randgen_seed: 0 7 | size: 2048 8 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/minipile.yaml: -------------------------------------------------------------------------------- 1 | # The minipile dataset, drawn from its huggingface mirror 2 | JeanKaddour/minipile: 3 | provider: huggingface 4 | partition: null 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules?
10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pile.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pile: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | # pile_set_name: 38 | # possible pile_set_name values are 39 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 40 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 41 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 42 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 43 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 44 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 45 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 46 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 47 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 48 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 49 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 50 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 51 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 52 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 53 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 54 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 55 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 56 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 57 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 58 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 59 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 60 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 61 | split: train 62 | streaming: True 63 | 64 | # source-specific cleaning rules? 
65 | remove_columns: 66 | concatenate_successive_entries: 0 67 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pileCC.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pileCC: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | pile_set_name: 38 | - Pile-CC 39 | # possible pile_set_name values are 40 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 41 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 42 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 43 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 44 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 45 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 46 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 47 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 48 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 49 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 50 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 51 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 52 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 53 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 54 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 55 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 56 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 57 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 58 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 59 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 60 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 61 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 62 | split: train 63 | streaming: True 64 | 65 | # source-specific cleaning rules? 
66 | remove_columns: 67 | concatenate_successive_entries: 0 68 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pile_natural.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pile_natural: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | pile_set_name: 38 | - Gutenberg 39 | - Books3 40 | - Wikipedia (en) 41 | # possible pile_set_name values are 42 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 43 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 44 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 45 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 46 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 47 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 48 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 49 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 50 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 51 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 52 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 53 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 54 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 55 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 56 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 57 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 58 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 59 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 60 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 61 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 62 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 63 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 64 | split: train 65 | streaming: True 66 | 67 | # source-specific cleaning rules? 
68 | remove_columns: 69 | concatenate_successive_entries: 0 70 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/wikipedia.yaml: -------------------------------------------------------------------------------- 1 | # The wikipedia en dataset, drawn from it huggingface mirror 2 | wikipedia: 3 | provider: huggingface 4 | partition: 20220301.en 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: title 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/the-pile-natural.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of the pile 2 | name: the_pile 3 | defaults: 4 | - sources: 5 | - the_pile_natural 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 2e6 # This comes out to about 40mio 128-seq entries. Original examples are a bit longer here than the average 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/the-pile.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of the pile 2 | name: the_pile 3 | defaults: 4 | - sources: 5 | - the_pile 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 4e6 # About 40 mio seqs of length 128 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUE.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - cola 5 | - mnli 6 | - mrpc 7 | - qnli 8 | - qqp 9 | - rte 10 | - sst2 11 | - stsb 12 | # - wnli 13 | 14 | evaluation_set: validation # always keep this at validation except for the final run 15 | 16 | # checkpoint name: 17 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 18 | checkpoint: latest 19 | path: ${impl.path} # Path for caches of datasets and tokenizers 20 | max_seq_length: 128 21 | 22 | # Default options: 23 | # These can be overwritten by specific tasks 24 | batch_size: 32 25 | batch_size_ramp: 0 26 | 27 | gradient_clipping: 28 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 29 | scheduler: 30 | optim_mod: 31 | name: none 32 | 33 | epochs: 5 34 | 35 | # These options are only used for scheduling: 36 | warmup_steps: 1000 37 | cooldown_steps: 0 38 | steps: 10_000 39 | 40 | testing: 41 | batch_size: 128 42 | 43 | arch_modifications: 44 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUE_sane.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - cola 5 | - mnli 6 | - mrpc 7 | - qnli 8 | - qqp 9 | - rte 10 | - sst2 11 | - stsb 12 | # - wnli 13 | 14 | metrics_to_average: 15 | - qqp_f1 16 | - qnli_accuracy 17 | - mrpc_f1 18 | - mnli_accuracy_extra 19 | - mnli_accuracy 20 | - stsb_pearson 21 | - sst2_accuracy 22 | - rte_accuracy 23 | - cola_matthews_correlation 24 | 25 | optim: 26 | lr: 4e-5 27 | 28 | evaluation_set: validation # always keep this at validation except for the final run 29 | 30 | # checkpoint name: 31 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 32 | checkpoint: latest 33 | # Set this to a non-empty value to specify a particular model file to load. 
34 | model_pth: 35 | 36 | path: ${impl.path} # Path for caches of datasets and tokenizers 37 | max_seq_length: 128 38 | 39 | # Default options: 40 | # These can be overwritten by specific tasks 41 | batch_size: 16 42 | batch_size_ramp: 0 43 | 44 | gradient_clipping: 45 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 46 | scheduler: cosine-decay 47 | optim_mod: 48 | name: none 49 | 50 | epochs: 5 51 | 52 | # These options are only used for scheduling: 53 | warmup_steps: 0.1 54 | cooldown_steps: 0 55 | steps: 10_000 56 | 57 | testing: 58 | batch_size: 128 59 | 60 | arch_modifications: 61 | classification_head: 62 | pooler: zero_index 63 | include_ff_layer: True 64 | # head_dim: ${arch.hidden_size} 65 | nonlin: Tanh 66 | # classifier_dropout: ${arch.hidden_dropout_prob} 67 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUEmosbach.yaml: -------------------------------------------------------------------------------- 1 | # On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines 2 | 3 | defaults: 4 | - optim: adam 5 | - tasks: 6 | - cola 7 | - mnli 8 | - mrpc 9 | - qnli 10 | - qqp 11 | - rte 12 | - sst2 13 | - stsb 14 | # - wnli 15 | 16 | optim: 17 | weight_decay: 0.01 18 | betas: 19 | - 0.9 20 | - 0.999 21 | 22 | evaluation_set: validation # always keep this at validation except for the final run 23 | 24 | # checkpoint name: 25 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 26 | checkpoint: latest 27 | path: ${impl.path} # Path for caches of datasets and tokenizers 28 | max_seq_length: 128 29 | 30 | # Default options: 31 | # These can be overwritten by specific tasks 32 | batch_size: 16 33 | batch_size_ramp: 0 34 | 35 | gradient_clipping: 36 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 37 | scheduler: linear 38 | optim_mod: 39 | name: none 40 | 41 | epochs: 20 42 | 43 | # These options are only used for scheduling: 44 | warmup_steps: 0.1 45 | cooldown_steps: 0 46 | steps: 10_000 47 | 48 | testing: 49 | batch_size: 128 50 | 51 | arch_modifications: 52 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/SuperGLUE.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - boolq 5 | - cb 6 | - copa 7 | - multirc 8 | - rte_superglue 9 | - wic 10 | - wsc 11 | 12 | metrics_to_average: 13 | - boolq_accuracy 14 | - cb_f1 15 | - copa_accuracy 16 | - multirc_f1_a 17 | - rte_accuracy 18 | - wic_accuracy 19 | - "wsc.fixed_accuracy" 20 | 21 | optim: 22 | lr: 5e-5 23 | 24 | evaluation_set: validation # always keep this at validation except for the final run 25 | 26 | # checkpoint name: 27 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 28 | checkpoint: latest 29 | # Set this to a non-empty value to specify a particular model file to load. 
30 | model_pth: 31 | 32 | path: ${impl.path} # Path for caches of datasets and tokenizers 33 | max_seq_length: 128 34 | 35 | # Default options: 36 | # These can be overwritten by specific tasks 37 | batch_size: 16 38 | batch_size_ramp: 0 39 | 40 | gradient_clipping: 41 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 42 | scheduler: cosine-decay 43 | optim_mod: 44 | name: none 45 | 46 | epochs: 10 47 | 48 | # These options are only used for scheduling: 49 | warmup_steps: 0.1 50 | cooldown_steps: 0 51 | steps: 10_000 52 | 53 | testing: 54 | batch_size: 128 55 | 56 | arch_modifications: 57 | classification_head: 58 | pooler: zero_index 59 | include_ff_layer: True 60 | # head_dim: ${arch.hidden_size} 61 | nonlin: Tanh 62 | # classifier_dropout: ${arch.hidden_dropout_prob} 63 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/eval/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/eval/boolq.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - boolq 5 | 6 | metrics_to_average: 7 | - boolq_accuracy 8 | 9 | optim: 10 | lr: 4e-5 11 | 12 | evaluation_set: validation # always keep this at validation except for the final run 13 | 14 | # checkpoint name: 15 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 16 | checkpoint: latest 17 | # Set this to a non-empty value to specify a particular model file to load. 
18 | model_pth: 19 | 20 | path: ${impl.path} # Path for caches of datasets and tokenizers 21 | max_seq_length: 128 22 | 23 | # Default options: 24 | # These can be overwritten by specific tasks 25 | batch_size: 16 26 | batch_size_ramp: 0 27 | 28 | gradient_clipping: 29 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 30 | scheduler: cosine-decay 31 | optim_mod: 32 | name: none 33 | 34 | epochs: 10 35 | 36 | # These options are only used for scheduling: 37 | warmup_steps: 0.1 38 | cooldown_steps: 0 39 | steps: 10_000 40 | 41 | testing: 42 | batch_size: 128 43 | 44 | arch_modifications: 45 | classification_head: 46 | pooler: zero_index 47 | include_ff_layer: True 48 | # head_dim: ${arch.hidden_size} 49 | nonlin: Tanh 50 | # classifier_dropout: ${arch.hidden_dropout_prob} 51 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/mnli.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - mnli 5 | 6 | optim: 7 | weight_decay: 0.01 8 | 9 | evaluation_set: validation # always keep this at validation except for the final run 10 | 11 | checkpoint: latest 12 | path: ~/data/ # Path for caches of datasets and tokenizers 13 | max_seq_length: 128 14 | 15 | # Default options: 16 | # These can be overwritten by specific tasks 17 | batch_size: 32 18 | batch_size_ramp: 0 19 | 20 | gradient_clipping: 21 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 22 | scheduler: linear 23 | optim_mod: 24 | name: none 25 | 26 | epochs: 10 27 | 28 | # These options are only used for scheduling: 29 | warmup_steps: 0.1 30 | cooldown_steps: 0 31 | steps: 10_000 32 | 33 | arch_modifications: 34 | classification_head: 35 | pooler: zero_index 36 | include_ff_layer: True 37 | # head_dim: ${arch.hidden_size} 38 | nonlin: Tanh 39 | # classifier_dropout: ${arch.hidden_dropout_prob} 40 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/optim/adam.yaml: -------------------------------------------------------------------------------- 1 | type: AdamW 2 | 3 | lr: 2e-5 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.00 # no wd in finetuning?? 
9 | amsgrad: False 10 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/save_losses_rho_loss.yaml: -------------------------------------------------------------------------------- 1 | arch_modifications: null 2 | evaluation_set: validation # always keep this at validation except for the final run 3 | 4 | checkpoint: latest 5 | path: ~/data/ # Path for caches of datasets and tokenizers 6 | max_seq_length: 128 7 | model_pth: null 8 | 9 | # Default options: 10 | # These can be overwritten by specific tasks 11 | batch_size: 96 12 | batch_size_ramp: 0 13 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/boolq.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | boolq: 3 | collection: super_glue 4 | regression: False 5 | structure: [question, passage] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/cb.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | cb: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/cola.yaml: -------------------------------------------------------------------------------- 1 | # COLA-specific settings 2 | cola: 3 | collection: glue 4 | regression: False 5 | structure: [sentence, null] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/copa.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | copa: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, choice1, choice2, question] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/mnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | mnli: 3 | collection: glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/mrpc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | mrpc: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/multirc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | multirc: 3 | collection: super_glue 4 | regression: False 5 | structure: [paragraph, question, answer] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/qnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | qnli: 3 | collection: glue 4 | regression: False 5 | structure: [question, sentence] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/qqp.yaml: -------------------------------------------------------------------------------- 1 | # 
dataset-specific settings 2 | qqp: 3 | collection: glue 4 | regression: False 5 | structure: [question1, question2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/record.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | record: 3 | collection: super_glue 4 | regression: False 5 | structure: [passage, query] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/rte.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | rte: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/rte_superglue.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | rte: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/sst2.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | sst2: 3 | collection: glue 4 | regression: False 5 | structure: [sentence, null] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/stsb.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | stsb: 3 | collection: glue 4 | regression: True 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wic.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | wic: 3 | collection: super_glue 4 | regression: False 5 | structure: [word, sentence1, sentence2] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | wnli: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wsc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | "wsc.fixed": 3 | collection: super_glue 4 | regression: False 5 | structure: [text, span1_text, span2_text] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/hydra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/hydra/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | # python logging configuration for tasks 2 | version: 1 3 | formatters: 4 | simple: 5 | format: "[%(asctime)s] %(message)s" 6 | handlers: 7 | 
console: 8 | class: logging.StreamHandler 9 | formatter: simple 10 | stream: ext://sys.stdout 11 | file: 12 | class: logging.FileHandler 13 | formatter: simple 14 | # relative to the job log directory 15 | filename: ${name}_${hydra.job.name}.log 16 | root: 17 | level: INFO 18 | handlers: [console, file] 19 | 20 | disable_existing_loggers: false 21 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/impl/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/impl/_default.yaml: -------------------------------------------------------------------------------- 1 | # Settings for implementation details 2 | # These settings "should" not influence the outcome of the computation in major ways, only its speed. 3 | 4 | # This is the main folder where data will be stored (such as caches of datasets and tokenizers): 5 | # This can be an absolute path (which will be honored) or a relative path 6 | # The relative path will be executed relative to the cfg.base_dir 7 | # This behavior is controlled in the main_launcher 8 | path: data 9 | 10 | # data implementation: 11 | defaults: 12 | - data_structure: from-disk # can be LMDB or RAM or None to load directly from disk 13 | local_staging_dir: # Optionally copy a preprocessed dataset into this folder before loading it for training 14 | forbid_dataset_preprocessing: False 15 | temporary_corpus: False # Save data directly into local staging dir, forget after use 16 | max_raw_chunk_size: 1e14 17 | 18 | # validation 19 | validate_every_hours: 6 20 | 21 | # checkpointing and logging: 22 | print_loss_every_nth_step: 1000 23 | save_intermediate_checkpoints: False 24 | save_every_nth_step: 10000000 25 | 26 | # early termination, cancel runs that do not meet this loss threshold early. 27 | early_termination: 28 | enabled: False 29 | budget: 3 # budget in hours 30 | loss_threshold: 6.0 # modify this for non-xent losses 31 | 32 | # Batch size settings: 33 | # batch_size: This is handled in train after commit 982a4d33cd7f79a48b691114ae78f6ad1cdbee69 34 | microbatch_size: 128 # dont make it larger than batch_size... 35 | 36 | # Basic pytorch settings 37 | threads: 8 # maximal number of cpu dataloader workers used per GPU, this value will never exceed num_gpus * num_physical threads 38 | benchmark: True # CUDNN benchmarking 39 | deterministic: False # This option will disable non-deterministic ops 40 | non_blocking: True # unblocked .to(device) handles 41 | tf32_allowed: True 42 | 43 | # JIT: 44 | jit: # Global JIT. Can be "script" (but this doesnt work for huggingface models) or "trace" (but trace does not work with AMP) 45 | jit_instruction_type: nvfuser-profiler 46 | trace_shape: 47 | # If jit=trace, then this is the traced shape 48 | # - ${impl.microbatch_size} 49 | # - ${data.seq_length} 50 | no_jit_compilation: False # Optionaly disable all torch.jit calls 51 | 52 | # Dataloader multiprocessing 53 | pad_to_multiple_of: 8 # padding in dataloader during downstream 54 | shuffle_in_dataloader: False # There is still shuffling in the preprocessing pipeline. 
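# Note: pad_to_multiple_of=8 above keeps padded sequence lengths on tensor-core-friendly multiples, which generally helps fp16/AMP throughput.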
55 | pin_memory: True 56 | prefetch_factor: 2 57 | persistent_workers: True # this clashes with pin_memory in pytorch<1.7.1 58 | 59 | # Default floating point precision: 60 | default_precision: float # needs to be a pytorch datatype 61 | 62 | # Distributed training 63 | backend: nccl 64 | sharing_strategy: file_descriptor 65 | 66 | # Misc: 67 | enable_huggingface_offline_mode: False 68 | local_rank: # This is set automatically by the system_startup 69 | 70 | push_to_huggingface_hub: False 71 | hf_directoy_name: "test-crammedBERT-c5" # set a clever name here! 72 | 73 | # Other constants: 74 | # OMP_NUM_THREADS:[number_of_physical_cores] 75 | # OMP_SCHEDULE: # STATIC 76 | # OMP_PROC_BIND: # CLOSE 77 | # GOMP_CPU_AFFINITY: # "N-M" 78 | # KMP_AFFINITY: # "granularity=fine,compact,1,0" 79 | # KMP_BLOCKTIME: # 1 80 | # optional_ld_preloads: 81 | # - libiomp5.so 82 | # - jemalloc.so 83 | 84 | # 85 | # ### jemalloc 86 | # export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 87 | # export LD_PRELOAD=/home/mingfeim/packages/jemalloc-5.2.1/lib/libjemalloc.so 88 | # 89 | # ### tcmalloc 90 | # export LD_PRELOAD=/home/mingfeim/packages/gperftools-2.8/install/lib/libtcmalloc.so 91 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/LMDB.yaml: -------------------------------------------------------------------------------- 1 | # This configuration caches the dataset in an LMDB 2 | name: LMDB 3 | draw_cache_directly: False 4 | 5 | # writing: 6 | rebuild_existing_database: False 7 | write_frequency: 50_000 # how often to flush during database creation 8 | shuffle_while_writing: False 9 | 10 | # reading: 11 | max_readers: 128 12 | readahead: True # this should be beneficial for long sequential reads 13 | meminit: True 14 | max_spare_txns: 128 15 | 16 | access: get # cursor or get 17 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/RAM.yaml: -------------------------------------------------------------------------------- 1 | # This configuration caches the dataset in RAM 2 | name: RAM 3 | draw_cache_directly: False 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/from-disk.yaml: -------------------------------------------------------------------------------- 1 | # Here the data is just read from disk on the fly 2 | name: from-disk 3 | draw_cache_directly: False 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/none.yaml: -------------------------------------------------------------------------------- 1 | # Here the data is just read from disk on the fly 2 | name: none 3 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/deepspeed-hf.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is a subset of the deepspeed hyperparameters. 
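# Selected through Hydra's config groups, e.g. (illustrative override; the launcher script name is assumed): python pretrain.py impl=deepspeed-hf data=the-pile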
2 | name: deepspeed 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | # Dynamo 8 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 9 | 10 | train_batch_size: ${train.batch_size} # can be "auto" 11 | train_micro_batch_size_per_gpu: 128 # can be "auto" 12 | 13 | optimizer: ${train.optim} 14 | gradient_clipping: ${train.gradient_clipping} 15 | # DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, 16 | # and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. 17 | 18 | # This scheduler is not quite the same as the schedulers called via huggingface. YMMV 19 | scheduler: 20 | type: WarmupDecayLR 21 | params: 22 | warmup_min_lr: 0 23 | warmup_max_lr: ${train.optim.lr} 24 | warmup_num_steps: ${train.warmup_steps} 25 | warmup_type: linear 26 | total_num_steps: ${train.steps} 27 | 28 | # communication_data_type: # this should be good in the default setting 29 | # prescale_gradients: False # this should be good in the default setting 30 | # gradient_predivide_factor: 1.0 31 | 32 | # Do not combine these with AMP: 33 | fp16: 34 | enabled: False # can be "auto" 35 | loss_scale: 0 36 | initial_scale_power: 16 37 | loss_scale_window: 1000 38 | hysteresis: 2 39 | min_loss_scale: 1 40 | 41 | zero_optimization: 42 | # stage 0, 1, 2, and 3 refer to 43 | # 0) disabled 44 | # 1) optimizer state partitioning 45 | # 2) optimizer+gradient state partitioning 46 | # 3) optimizer+gradient+parameter partitioning 47 | stage: 3 # [0|1|2|3] 48 | overlap_comm: True # Attempts to overlap the reduction of the gradients with backward computation 49 | reduce_scatter: True # Uses reduce or reduce scatter instead of allreduce to average gradients 50 | reduce_bucket_size: 1e6 # Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes 51 | contiguous_gradients: True # Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. 52 | 53 | # Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. 54 | offload_param: 55 | device: cpu 56 | pin_memory: True 57 | 58 | # Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. 59 | # This frees up GPU memory for larger models or batch sizes. Valid only with stage 2 and 60 | # Only include these options if stage=2 or higher: 61 | offload_optimizer: 62 | device: cpu 63 | pin_memory: True 64 | 65 | stage3_max_live_parameters: 1e9 # The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 66 | stage3_max_reuse_distance: 1e9 # Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. 67 | stage3_prefetch_bucket_size: 0.94e6 # can be "auto" # The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. 68 | stage3_param_persistence_threshold: 1e4 # can be "auto" # Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 
69 | 70 | sub_group_size: 1e9 71 | stage3_gather_16bit_weights_on_model_save: True # [true|false] 72 | 73 | steps_per_print: ${impl.print_loss_every_nth_step} 74 | wall_clock_breakdown: False 75 | dump_state: False 76 | 77 | flops_profiler: 78 | enabled: False 79 | profile_step: 1 80 | module_depth: -1 81 | top_modules: 1 82 | detailed: True 83 | output_file: # If None, the profiler prints to stdout.. 84 | 85 | # activation_checkpointing: 86 | # partition_activations: False 87 | # cpu_checkpointing: False 88 | # contiguous_memory_optimization: False 89 | # number_checkpoints: 90 | # synchronize_checkpoint_boundary: False 91 | # profile: False 92 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/deepspeed.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is a subset of the deepspeed hyperparameters. 2 | name: deepspeed 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | # Dynamo 8 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 9 | 10 | train_batch_size: ${train.batch_size} # can be "auto" 11 | train_micro_batch_size_per_gpu: 128 # can be "auto" 12 | 13 | optimizer: ${train.optim} 14 | gradient_clipping: 100 15 | # DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, 16 | # and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. 17 | 18 | # This scheduler is not quite the same as the schedulers called via huggingface. YMMV 19 | scheduler: 20 | type: WarmupDecayLR 21 | params: 22 | warmup_min_lr: 0 23 | warmup_max_lr: ${train.optim.lr} 24 | warmup_num_steps: ${train.warmup_steps} 25 | warmup_type: linear 26 | total_num_steps: ${train.steps} 27 | 28 | # communication_data_type: # this should be good in the default setting 29 | # prescale_gradients: False # this should be good in the default setting 30 | # gradient_predivide_factor: 1.0 31 | 32 | # Do not combine these with AMP: 33 | fp16: 34 | enabled: False # can be "auto" 35 | loss_scale: 0 36 | initial_scale_power: 32 37 | loss_scale_window: 1000 38 | hysteresis: 2 39 | min_loss_scale: 1 40 | 41 | # Do not combine this with fp16 or zero: 42 | # bf16: 43 | # enabled: False 44 | # amp: 45 | # enabled: False 46 | # opt_level: O1 47 | # # can draw more args from https://nvidia.github.io/apex/amp.html#apex.amp.initialize 48 | # 49 | 50 | zero_optimization: 51 | # stage 0, 1, 2, and 3 refer to 52 | # 0) disabled 53 | # 1) optimizer state partitioning 54 | # 2) optimizer+gradient state partitioning 55 | # 3) optimizer+gradient+parameter partitioning 56 | stage: 0 # [0|1|2|3] 57 | allgather_partitions: True # [true|false] # Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step 58 | allgather_bucket_size: 5e8 59 | overlap_comm: False # Attempts to overlap the reduction of the gradients with backward computation 60 | reduce_scatter: True # Uses reduce or reduce scatter instead of allreduce to average gradients 61 | reduce_bucket_size: 5e8 # Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes 62 | contiguous_gradients: True # Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. 
63 | grad_hooks: True 64 | 65 | # huggingface default is 2e8 for both reduce and all_grather buckets 66 | # both reduce and all_grather buckets can also be can be "auto" 67 | 68 | # Stage 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. 69 | # Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism) 70 | round_robin_gradients: False # [true|false] 71 | 72 | # Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. 73 | offload_param: 74 | device: cpu 75 | # nvme_path: /nvme 76 | pin_memory: True 77 | buffer_count: 5 78 | buffer_size: 1e8 79 | max_in_cpu: 1e9 80 | 81 | # Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. 82 | # This frees up GPU memory for larger models or batch sizes. Valid only with stage 2 and 83 | # Only include these options if stage=2 or higher: 84 | # offload_optimizer: 85 | # device: cpu 86 | # # nvme_path: /nvme 87 | # pin_memory: True 88 | # buffer_count: 89 | # 4 # Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. 90 | # # For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). 91 | # buffer_size: 1e8 92 | # fast_init: False # Enable fast optimizer initialization when offloading to NVMe. 93 | 94 | stage3_max_live_parameters: 1e9 # The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 95 | stage3_max_reuse_distance: 1e9 # Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. 96 | stage3_prefetch_bucket_size: 5e8 # can be "auto" # The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. 97 | stage3_param_persistence_threshold: 1e6 # can be "auto" # Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 98 | 99 | sub_group_size: 1e12 100 | elastic_checkpoint: True # [true|false] 101 | stage3_gather_16bit_weights_on_model_save: False # [true|false] 102 | ignore_unused_parameters: False # [true|false] 103 | 104 | # aio: 105 | # block_size: 1048576 106 | # queue_depth: 8 107 | # thread_count: 1 108 | # single_submit: False 109 | # overlap_events: True 110 | 111 | steps_per_print: ${impl.print_loss_every_nth_step} 112 | wall_clock_breakdown: False 113 | dump_state: False 114 | 115 | flops_profiler: 116 | enabled: False 117 | profile_step: 1 118 | module_depth: -1 119 | top_modules: 1 120 | detailed: True 121 | output_file: # If None, the profiler prints to stdout.. 
122 | 123 | # activation_checkpointing: 124 | # partition_activations: False 125 | # cpu_checkpointing: False 126 | # contiguous_memory_optimization: False 127 | # number_checkpoints: 128 | # synchronize_checkpoint_boundary: False 129 | # profile: False 130 | 131 | tensorboard: 132 | enabled: False 133 | output_path: tensorboard_logs 134 | job_name: ${name} 135 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/onnx.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/save_losses_rho_loss.yaml: -------------------------------------------------------------------------------- 1 | # singl(ish) GPU, sane pytorch stuff 2 | name: torch-default 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | mixed_precision: True # turns on AMP on GPUs/Intel devices. The default precision needs to be float 8 | grad_scaling: True # Only activates when mixed_precision=True 9 | mixed_precision_target_dtype: float16 10 | 11 | saving_interval: 1000 12 | 13 | # Distributed training: 14 | zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2 15 | broadcast_buffers: False 16 | bucket_cap_mb: 25 17 | gradient_as_bucket_view: True 18 | static_graph: True 19 | 20 | # Misc: 21 | foreach_optimizer: False 22 | 23 | # Dynamo 24 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 25 | 26 | microbatch_size: 256 # dont make it larger than batch_size... 27 | rho_loss: True -------------------------------------------------------------------------------- /bert/cramming/config/impl/torch-default.yaml: -------------------------------------------------------------------------------- 1 | # singl(ish) GPU, sane pytorch stuff 2 | name: torch-default 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | mixed_precision: True # turns on AMP on GPUs/Intel devices. 
The default precision needs to be float 8 | grad_scaling: True # Only activates when mixed_precision=True 9 | mixed_precision_target_dtype: float16 10 | 11 | # Distributed training: 12 | zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2 13 | broadcast_buffers: False 14 | bucket_cap_mb: 25 15 | gradient_as_bucket_view: True 16 | static_graph: True 17 | 18 | # Misc: 19 | foreach_optimizer: False 20 | 21 | # Dynamo 22 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 23 | 24 | rho_loss: False -------------------------------------------------------------------------------- /bert/cramming/config/piotr/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - task: pt 4 | 5 | # Experiment args 6 | mode: 'pt' 7 | device: gpu 8 | eval_only: false 9 | predict_only: false 10 | seed: 2137 11 | budget: 24 12 | 13 | model: 14 | name: 'google/t5-v1_1-base' 15 | checkpoint_path: '' 16 | dropout: 0.0 17 | random_init: true 18 | compile: false # Pytorch 2.0 19 | num_active_layers: -1 20 | 21 | data: 22 | input_length: 512 23 | mlm_probability: 0.15 24 | mean_noise_span_length: 3.0 25 | num_workers: 8 26 | dataset_name: 'c4' 27 | config_name: 'en' 28 | streaming: true 29 | 30 | optim: 31 | name: adamwscale 32 | base_lr: 2e-2 33 | batch_size: 144 34 | total_steps: 65536 35 | epochs: -1 # If it's > 0 it overwrites total_steps 36 | warmup_steps: 10000 37 | lr_scheduler: cosine-budget 38 | weight_decay: 0.0 39 | grad_clip: 1.0 40 | grad_acc: 2 41 | final_cosine: 1e-5 42 | 43 | stacking: 44 | enabled: true 45 | num_initial_layers: 3 46 | num_layers_to_add: 12 47 | scheduler: manual 48 | adjust_lr: false 49 | freeze_bottom_layers: false 50 | manual_scheduler: 51 | function: manual 52 | balance_factor: 1.0 53 | T_max_factor: 0.75 54 | copy_optim_states: false 55 | step_fractions: [0.125,0.3] 56 | doubling: true 57 | doubling_interpolation: false 58 | reset_optim: true 59 | 60 | eval: 61 | every_steps: 5000 62 | steps: 500 63 | eval_stacked_model: false 64 | 65 | checkpoint: 66 | every_steps: 1000 67 | start: 65000 68 | 69 | logging: 70 | wandb: true 71 | wandb_creds: 72 | name: 't5' 73 | project: 't5' 74 | entity: '' # change this optionally 75 | tags: 'baseline' 76 | every_steps: 100 77 | grad_l2: true 78 | weights_l2: true 79 | 80 | hydra: 81 | job: 82 | chdir: True 83 | run: 84 | dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S} 85 | -------------------------------------------------------------------------------- /bert/cramming/config/piotr/task/ft.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | mode: 'ft' 4 | 5 | data: 6 | max_seq_len: 1024 7 | max_target_len: 128 8 | max_num_instances_per_task: 100 9 | add_task_name: False 10 | add_task_definition: True 11 | num_pos_examples: 2 12 | num_neg_examples: 0 13 | add_explanation: False 14 | tk_instruct: False 15 | exec_file_path: ./nanoT5/utils/ni_dataset.py 16 | data_dir: ./data/splits/default 17 | task_dir: ./data/tasks 18 | 19 | optim: 20 | name: adamw 21 | base_lr: 5e-5 22 | batch_size: 8 23 | epochs: 2 24 | warmup_steps: 0 25 | lr_scheduler: constant 26 | weight_decay: 0.0 27 | grad_clip: 0.0 28 | grad_acc: 1 29 | 30 | checkpoint: 31 | start: 430000 32 | 33 | eval: 34 | steps: 200 35 | every_steps: 5000 36 | -------------------------------------------------------------------------------- /bert/cramming/config/piotr/task/pt.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /bert/cramming/config/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/train/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-base.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-base 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 11 | 12 | # steps: 13 | warmup_steps: 30_000 14 | cooldown_steps: 0 15 | steps: 600_000 # these are microbatch steps 16 | scheduler: budget-cosine-decay 17 | 18 | # Training settting: 19 | batch_size: 1536 20 | batch_size_ramp: 0 21 | 22 | gradient_clipping: 23 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 24 | 25 | objective: 26 | name: masked-lm 27 | mlm_probability: 0.15 28 | use_80_20_rule: True 29 | disable_mlm: False 30 | token_drop: 0.0 31 | reverse_dataset_order: False 32 | 33 | budget: ${budget} 34 | 35 | gradinit: 36 | enabled: False 37 | # eta: 1.0 38 | # tau: 1e-3 # step size 39 | # steps: 50 40 | # min_scale: 1e-3 41 | # max_scale: 1e3 42 | # step_type: sign-grad # sign-grad or grad 43 | # second_order: False 44 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-izsak.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-base 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 11 | 12 | optim: 13 | lr: 2e-3 14 | eps: 1e-6 15 | weight_decay: 0.01 16 | betas: 17 | - 0.9 18 | - 0.98 19 | 20 | # steps: 21 | warmup_steps: 0.06 # in percentage points 22 | cooldown_steps: 0 23 | steps: 600_000 # these are microbatch steps 24 | scheduler: budget-linear 25 | 26 | # Training settting: 27 | batch_size: 4096 # for mbs=128 28 | batch_size_ramp: 0 29 | 30 | gradient_clipping: 31 | pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain 32 | 33 | objective: 34 | name: masked-lm 35 | mlm_probability: 0.15 36 | use_80_20_rule: True 37 | disable_mlm: False 38 | token_drop: 0.0 39 | reverse_dataset_order: False 40 | 41 | budget: ${budget} 42 | 43 | gradinit: 44 | enabled: False 45 | # eta: 1.0 46 | # tau: 1e-3 # step size 47 | # steps: 50 48 | # min_scale: 1e-3 49 | # max_scale: 1e3 50 | # step_type: sign-grad # sign-grad or grad 51 | # second_order: False 52 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o1.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to 
separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o1 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 7e-4 12 | eps: 1e-12 13 | weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # steps: 18 | warmup_steps: 0 19 | cooldown_steps: 0 20 | steps: 600_000 # these are microbatch steps 21 | scheduler: budget-cosine-decay 22 | 23 | # Training settting: 24 | batch_size: 1536 25 | batch_size_ramp: 0 26 | 27 | gradient_clipping: 28 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 29 | 30 | objective: 31 | name: masked-lm 32 | mlm_probability: 0.15 33 | use_80_20_rule: True 34 | disable_mlm: False 35 | token_drop: 0.0 36 | reverse_dataset_order: False 37 | 38 | budget: ${budget} 39 | 40 | gradinit: 41 | enabled: False 42 | # eta: 1.0 43 | # tau: 1e-3 # step size 44 | # steps: 50 45 | # min_scale: 1e-3 46 | # max_scale: 1e3 47 | # step_type: sign-grad # sign-grad or grad 48 | # second_order: False 49 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o2.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o2 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 1e-3 12 | eps: 1e-12 13 | weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # steps: 18 | warmup_steps: 0 19 | cooldown_steps: 0 20 | steps: 600000 # these are microbatch steps 21 | scheduler: budget-one-cycle 22 | 23 | # Training settting: 24 | batch_size: 1536 25 | batch_size_ramp: 300000 26 | 27 | gradient_clipping: 28 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 29 | 30 | objective: 31 | name: masked-lm 32 | mlm_probability: 0.15 33 | use_80_20_rule: True 34 | disable_mlm: False 35 | token_drop: 0.0 36 | reverse_dataset_order: False 37 | 38 | budget: ${budget} 39 | 40 | gradinit: 41 | enabled: False 42 | # eta: 1.0 43 | # tau: 1e-3 # step size 44 | # steps: 50 45 | # min_scale: 1e-3 46 | # max_scale: 1e3 47 | # step_type: sign-grad # sign-grad or grad 48 | # second_order: False 49 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o3.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o3 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | #optim: 11 | # lr: 1e-3 12 | # eps: 1e-12 13 | # weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # data 18 | validation_set: 19 | enabled: True 20 | fraction: 0.005 21 | seed: 0 22 | il_model: False 23 | truncate_to: 10000 24 | 25 | # Set to integer value to truncate the dataset to this many sequences. 
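# e.g. (illustrative): truncate_train_dataset: 1_000_000 would keep only the first 1M tokenized sequences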
26 | truncate_train_dataset: 27 | 28 | # steps: 29 | warmup_steps: 0 30 | cooldown_steps: 0 31 | steps: 2000000 # these are microbatch steps 32 | scheduler: budget-one-cycle-seconds 33 | 34 | # Training settting: 35 | batch_size: 1536 36 | batch_size_ramp: 0 37 | gradient_clipping: 0.5 38 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 39 | 40 | objective: 41 | name: masked-lm 42 | mlm_probability: 0.15 43 | use_80_20_rule: True 44 | disable_mlm: False 45 | token_drop: 0.0 46 | reverse_dataset_order: False 47 | 48 | budget: ${budget} 49 | 50 | stacking: 51 | enabled: false 52 | num_initial_layers: 4 53 | num_layers_to_add: 12 54 | scheduler: manual 55 | adjust_lr: false 56 | freeze_bottom_layers: false 57 | manual_scheduler: 58 | function: manual 59 | balance_factor: 1.0 60 | T_max_factor: 0.75 61 | copy_optim_states: false 62 | step_fractions: [0.125, 0.3] 63 | doubling: true 64 | doubling_interpolation: false 65 | reset_optim: true 66 | 67 | track_forward_pass_only: true 68 | 69 | rho_loss: 70 | mega_batch_size: 15360 71 | il_losses_path: /home/jean/stackbert/outputs/examples_to_loss 72 | 73 | sb: 74 | scale: 1.0 75 | 76 | sophia: 77 | batch_size_hess_update: 768 78 | hess_update_frequency: 10 79 | free_updates: False 80 | 81 | gradinit: 82 | enabled: False 83 | # eta: 1.0 84 | # tau: 1e-3 # step size 85 | # steps: 50 86 | # min_scale: 1e-3 87 | # max_scale: 1e3 88 | # step_type: sign-grad # sign-grad or grad 89 | # second_order: False 90 | # sequence_curriculum: 91 | # lengths: [8,16,32,64,128] 92 | # triggers: [0.1,0.2,0.3,0.5,0.75] 93 | # unfold: False 94 | 95 | # weight_averaging: 96 | # type: EMA 97 | # frequency: 1 98 | # momentum: 0.995 # only for EMA 99 | # last_k: 10 100 | 101 | # CU1: +train.sequence_curriculum.lengths=[8,16,32,64,128] +train.sequence_curriculum.triggers=[0.1,0.2,0.3,0.5,0.75] +train.sequence_curriculum.unfold=False 102 | # CU2: +train.sequence_curriculum.lengths=[8,16,32,64,128] +train.sequence_curriculum.triggers=[0.1,0.2,0.3,0.5,0.75] +train.sequence_curriculum.unfold=True 103 | 104 | # LAWA: +train.weight_averaging.frequency=5000 +train.weight_averaging.type=LAWA +train.weight_averaging.last_k=10 105 | # EMA: +train.weight_averaging.frequency=1 +train.weight_averaging.type=EMA +train.weight_averaging.momentum=0.995 106 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-original.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-original 5 | 6 | defaults: 7 | - optim: adam_classic 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 1e-4 12 | 13 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 14 | 15 | # steps: 16 | warmup_steps: 80_000 # These are microbatch steps 17 | cooldown_steps: 0 18 | steps: 8_000_000 # These are microbatch steps at bs=64. 
The original 1mio steps for BERT are recovered with 512/64=8 19 | scheduler: polynomial-decay 20 | 21 | # Training settting: 22 | batch_size: 512 23 | batch_size_ramp: 0 24 | 25 | gradient_clipping: 26 | pretrain_in_train_mode: True # default BERT trains with dropout layers 27 | 28 | objective: 29 | name: masked-lm 30 | mlm_probability: 0.15 31 | use_80_20_rule: True 32 | disable_mlm: False 33 | token_drop: 0.0 34 | reverse_dataset_order: False 35 | 36 | budget: ${budget} 37 | 38 | gradinit: 39 | enabled: False 40 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adafactor.yaml: -------------------------------------------------------------------------------- 1 | type: Adafactor 2 | 3 | lr: 0.001 4 | eps: 5 | - 1e-30 6 | - 0.001 7 | clip_threshold: 1.0 8 | decay_rate: -0.8 9 | beta1: 10 | weight_decay: 0.0 11 | scale_parameter: False 12 | relative_step: False 13 | warmup_init: False 14 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adahessian.yaml: -------------------------------------------------------------------------------- 1 | type: AdaHessian 2 | 3 | lr: 0.15 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.01 9 | hessian_power: 1.0 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adam.yaml: -------------------------------------------------------------------------------- 1 | type: AdamW 2 | 3 | lr: 1e-3 4 | betas: 5 | - 0.9 6 | - 0.98 7 | weight_decay: 0.01 8 | amsgrad: False 9 | eps: 1e-12 -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adam_classic.yaml: -------------------------------------------------------------------------------- 1 | type: Adam 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.999 7 | eps: 1e-8 8 | weight_decay: 0.01 9 | amsgrad: False 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/lion.yaml: -------------------------------------------------------------------------------- 1 | type: Lion 2 | 3 | lr: 1e-4 4 | betas: 5 | - 0.9 6 | - 0.99 7 | # use 0.95, 0.98 if unstable 8 | weight_decay: 0.1 9 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/radam.yaml: -------------------------------------------------------------------------------- 1 | type: RAdam 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.01 9 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/sgd.yaml: -------------------------------------------------------------------------------- 1 | type: SGD 2 | 3 | lr: 0.0005 4 | momentum: 0.9 5 | dampening: 0.0 6 | weight_decay: 0.01 7 | nesterov: True 8 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/shampoo.yaml: -------------------------------------------------------------------------------- 1 | type: Shampoo 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.98 7 | epsilon: 1e-6 8 | use_bias_correction: True 9 | adam_w_mode: True 10 | weight_decay: 0.01 11 | grafting_type: 4 12 | grafting_epsilon: 1e-12 13 | grafting_beta2: 0.98 14 | 15 | root_inv_dist: False 16 | # update_freq (int): frequency for updating inverse preconditioner (Default: 100) 17 | # init_delay (int): 
initial delay before starting to compute root inverse (Default: 1000) 18 | # threshold (int): threshold for switching to diagonal preconditioner (Default: 1024) 19 | # preconditioner_dtype (torch.dtype): data type for preconditioner (Default: torch.float) 20 | # large_dim_method (LargeDimMethod): method for handling large scale tensors. (Default: LargeDimMethod.BLOCKING) 21 | # root_inv_dist (bool): distributes root inverse computation across multiple GPU workers (Default: True) 22 | # use_merge_dims (bool): merge dimensions if possible while respecting threshold. (Default: True) 23 | # grafting_type (GraftingType): Selects grafting method. (Default: GraftingType.ADAGRAD) 24 | # grafting_epsilon (float): Epsilon for grafting method. (Default: 1e-3) 25 | # grafting_beta2 (float): Exponential moving average factor for grafting method. (Default: 1.0) 26 | 27 | # class PreconditionerType(enum.Enum): 28 | # FULL = 0 29 | # DIAGONAL = 1 30 | # 31 | # 32 | # class GraftingType(enum.Enum): 33 | # NONE = 0 34 | # SGD = 1 35 | # ADAGRAD = 2 36 | # RMSPROP = 3 37 | # ADAM = 4 38 | # 39 | # 40 | # class LargeDimMethod(enum.Enum): 41 | # DIAGONAL = 0 42 | # ADAGRAD = 1 43 | # BLOCKING = 2 44 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/sophiag.yaml: -------------------------------------------------------------------------------- 1 | type: SophiaG 2 | lr: 1e-3 3 | weight_decay: 0.01 4 | rho: 0.04 5 | bs: 196608 # 128 * 1536 -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/disabled.yaml: -------------------------------------------------------------------------------- 1 | name: none 2 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/larc.yaml: -------------------------------------------------------------------------------- 1 | name: LARC 2 | 3 | trust_coefficient: 0.02 4 | clip: True 5 | eps: 1e-8 6 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/lars.yaml: -------------------------------------------------------------------------------- 1 | name: LARS 2 | 3 | trust_coefficient: 0.02 4 | clip: False 5 | eps: 1e-8 6 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/progressive.yaml: -------------------------------------------------------------------------------- 1 | name: progressive-batching 2 | 3 | progress_rule: norm-based 4 | 5 | monotone: False 6 | theta: 0.9 7 | 8 | min_sample_guard: 2 9 | max_sample_guard: 128 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/sam.yaml: -------------------------------------------------------------------------------- 1 | name: SAM 2 | rho: 0.05 3 | -------------------------------------------------------------------------------- /bert/cramming/config/wandb/default.yaml: -------------------------------------------------------------------------------- 1 | enabled: True 2 | entity: "" # change this optionally 3 | project: cramming-pretrain 4 | tags: [] 5 | # If set, resume from the given wandb id. 
6 | resume: 7 | -------------------------------------------------------------------------------- /bert/cramming/config/wandb/none.yaml: -------------------------------------------------------------------------------- 1 | enabled: False 2 | entity: 3 | project: 4 | tags: [] 5 | -------------------------------------------------------------------------------- /bert/cramming/data/__init__.py: -------------------------------------------------------------------------------- 1 | """This module handles and hides the data away ;)""" 2 | 3 | from .downstream_task_preparation import prepare_task_dataloaders 4 | from .pretraining_preparation import load_pretraining_corpus 5 | -------------------------------------------------------------------------------- /bert/cramming/data/cached_datasets.py: -------------------------------------------------------------------------------- 1 | """Write a PyTorch dataset into RAM.""" 2 | 3 | import logging 4 | 5 | import torch 6 | import transformers 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def lookup_dtype(vocab_size): 12 | if vocab_size < 2**8: 13 | dtype = torch.uint8 14 | # would really be neat to have uint16 here between the BERT and GPT encoding sizes 15 | elif vocab_size < 2**16 // 2: 16 | dtype = torch.int16 17 | elif vocab_size < 2**32 // 2: 18 | dtype = torch.int32 19 | else: 20 | dtype = torch.int64 21 | return dtype 22 | 23 | 24 | class CachedDataset(torch.utils.data.Dataset): 25 | """Cache a given dataset into RAM or SDRAM (GPU memory). 26 | 27 | This is only a good idea if you have enough RAM, especially if mapping into SDRAM. 28 | """ 29 | 30 | def __init__(self, dataset, seq_length, vocab_size, num_workers=0, target_device=torch.device("cpu")): 31 | """Initialize with a given pytorch dataset. The setup dictionary determines cache location and storage type.""" 32 | self.dataset = dataset 33 | log.info("Caching started ...") 34 | batch_size = min(len(dataset), 2048) 35 | cacheloader = torch.utils.data.DataLoader( 36 | dataset, 37 | batch_size=batch_size, 38 | shuffle=False, 39 | drop_last=False, 40 | num_workers=num_workers, 41 | pin_memory=False, 42 | collate_fn=transformers.data.data_collator.torch_default_data_collator, 43 | ) 44 | self.dataset_keys = list(dataset[0].keys()) 45 | seq_lengths = [len(dataset[0][k]) for k in self.dataset_keys] 46 | assert all([length == seq_lengths[0] for length in seq_lengths]) 47 | 48 | # Allocate memory: 49 | pin = target_device == torch.device("cpu") and torch.cuda.is_available() 50 | cache_setup = dict(device=target_device, dtype=lookup_dtype(vocab_size), pin_memory=pin) 51 | self.cache = torch.empty((len(self.dataset), seq_length * 4), **cache_setup) 52 | 53 | pointer = 0 54 | for data in cacheloader: 55 | batch_length = data[self.dataset_keys[0]].shape[0] 56 | data_block = torch.cat([d.to(cache_setup["dtype"]) for d in data.values()], dim=1) 57 | self.cache[pointer : pointer + batch_length] = data_block 58 | pointer += batch_length 59 | 60 | self.cache = self.cache.contiguous() 61 | log.info(f'Dataset successfully cached into {"RAM" if target_device == torch.device("cpu") else "SDRAM"}.') 62 | 63 | def __getitem__(self, index): 64 | """Get sample, target from cache.""" 65 | sample_data_block = self.cache[index] 66 | sample_dict = dict(zip(self.dataset_keys, torch.chunk(sample_data_block, len(self.dataset_keys), dim=-1))) 67 | return sample_dict 68 | 69 | def __len__(self): 70 | """Length is length of self.dataset.""" 71 | return len(self.dataset) 72 | 73 | def __getattr__(self, name): 74 | """This 
is only called if all attributes of Subset are exhausted.""" 75 | return getattr(self.dataset, name) 76 | -------------------------------------------------------------------------------- /bert/cramming/data/curriculum_sorting.py: -------------------------------------------------------------------------------- 1 | """Baseline curricula.""" 2 | import logging 3 | 4 | import numpy as np 5 | import torch 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num_threads=1, ngram=1, reverse=False): 11 | # Force unigram counts per token: 12 | map_setup = dict( 13 | batched=True, 14 | batch_size=1024, 15 | # num_proc=None, # have to reimplement counting as in-out instead of side effects for this to work. Lets see how slow num_proc=0 is 16 | load_from_cache_file=False, 17 | # keep_in_memory=True, 18 | ) 19 | 20 | unigrams_counts_per_token = np.zeros(tokenizer.vocab_size, dtype=np.int64) 21 | 22 | def count_unigrams(examples): 23 | nonlocal unigrams_counts_per_token 24 | unigrams_counts_per_token += np.bincount(np.asarray(examples["input_ids"]).reshape(-1), minlength=tokenizer.vocab_size) 25 | 26 | tokenized_dataset.map(count_unigrams, desc="Counting token unigrams", **map_setup, num_proc=None) 27 | 28 | token_count = sum(unigrams_counts_per_token) 29 | k = 1 30 | k_smoothed_probs = (unigrams_counts_per_token + k) / (token_count + k * tokenizer.vocab_size) 31 | log2_probs = np.log2(k_smoothed_probs) 32 | 33 | def return_seq_prob(examples): 34 | # seq_counts = np.apply_along_axis(np.bincount, axis=1, arr=np.asarray(examples["input_ids"]), minlength=tokenizer.vocab_size) 35 | # seq_counts = (np.asarray(examples["input_ids"])[:, :,None] == np.arange(0, tokenizer.vocab_size)[None, None, :]).sum(axis=1) # slower so far 36 | # logprob_scores = (log2_probs * seq_counts).sum(axis=1) / tokenizer.model_max_length 37 | # why make hard when can do easy? 
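# Fancy indexing: log2_probs[input_ids] maps each token id in the (batch, seq_len) array to its smoothed log2-probability; summing over axis=1 and dividing by the fixed model_max_length gives a length-normalized log-probability per sequence (more negative = rarer tokens on average).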
38 | logprob_scores = log2_probs[np.asarray(examples["input_ids"])].sum(axis=1) / tokenizer.model_max_length 39 | return dict(scores=logprob_scores) 40 | 41 | dataset_probs = tokenized_dataset.map( 42 | return_seq_prob, 43 | desc="Computing log probs per sequence", 44 | remove_columns=tokenized_dataset.column_names, 45 | **map_setup, 46 | num_proc=num_threads if num_threads > 0 else None, 47 | ) 48 | 49 | new_order = np.argsort(np.asarray(dataset_probs["scores"])) 50 | 51 | if reverse: 52 | new_order = new_order[::-1] 53 | 54 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 55 | 56 | 57 | def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, target_token_id, num_threads=1): 58 | map_setup = dict( 59 | batched=True, 60 | batch_size=1024, 61 | num_proc=num_threads if num_threads > 0 else None, 62 | load_from_cache_file=False, 63 | # keep_in_memory=True, 64 | ) 65 | 66 | def count_token(examples): 67 | return dict(counts=(np.asarray(examples["input_ids"]) == target_token_id).sum(axis=1)) 68 | 69 | dataset_counts = tokenized_dataset.map( 70 | count_token, 71 | desc=f"Counting occurrences of token {tokenizer.decode(target_token_id)}", 72 | remove_columns=tokenized_dataset.column_names, 73 | **map_setup, 74 | ) 75 | 76 | new_order = np.argsort(np.asarray(dataset_counts["counts"]))[::-1] 77 | 78 | # Print sentence with most occurrences: 79 | sentence_idx = int(new_order[0]) 80 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 81 | dataset_size = len(tokenized_dataset) 82 | 83 | log.info("Sentence with most occurrences of token ...") 84 | log.info(tokenizer.batch_decode(input_data[None])[0]) 85 | 86 | sentence_idx = int(new_order[-1]) 87 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 88 | dataset_size = len(tokenized_dataset) 89 | 90 | log.info("Sentence with least occurrences of token ...") 91 | log.info(tokenizer.batch_decode(input_data[None])[0]) 92 | 93 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 94 | 95 | 96 | def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer, num_threads=1): 97 | map_setup = dict( 98 | batched=True, 99 | batch_size=1024, 100 | num_proc=num_threads if num_threads > 0 else None, 101 | load_from_cache_file=False, 102 | # keep_in_memory=True, 103 | ) 104 | 105 | def count_word_lengths(examples): 106 | return dict(lengths=[len(s) for s in tokenizer.batch_decode(torch.as_tensor(examples["input_ids"]))]) 107 | 108 | dataset_counts = tokenized_dataset.map( 109 | count_word_lengths, 110 | desc="Counting word lengths per sequence", 111 | remove_columns=tokenized_dataset.column_names, 112 | **map_setup, 113 | ) 114 | 115 | new_order = np.argsort(np.asarray(dataset_counts["lengths"])) # shortest sentences first 116 | 117 | # Print sentence with shortest length 118 | sentence_idx = int(new_order[0]) 119 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 120 | dataset_size = len(tokenized_dataset) 121 | 122 | log.info("Sentence with shortest length ...") 123 | log.info(tokenizer.batch_decode(input_data[None])[0]) 124 | 125 | sentence_idx = int(new_order[-1]) 126 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 127 | dataset_size = len(tokenized_dataset) 128 | 129 | log.info("and longest ...") 130 | 
log.info(tokenizer.batch_decode(input_data[None])[0]) 131 | 132 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 133 | -------------------------------------------------------------------------------- /bert/cramming/data/downstream_task_preparation.py: -------------------------------------------------------------------------------- 1 | """Prepare downstream tasks evaluations.""" 2 | import logging 3 | import os 4 | from collections import defaultdict 5 | 6 | import datasets 7 | import torch 8 | from datasets import load_dataset 9 | 10 | from ..backend.utils import prepare_downstream_dataloader 11 | from .pretraining_preparation import main_process_first 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def get_sentences(examples, sentence_keys): 17 | return tuple(examples[key] for key in sentence_keys if key is not None) 18 | 19 | 20 | def prepare_task_dataloaders(tokenizer, cfg_eval, cfg_impl): 21 | """Load all datasets in eval.tasks for finetuning and testing.""" 22 | cfg_eval.path = os.path.expanduser(cfg_eval.path) 23 | datasets.enable_caching() # We can cache these 24 | max_seq_length = cfg_eval.max_seq_length 25 | tasks = defaultdict(dict) 26 | 27 | for task_name, task_details in cfg_eval.tasks.items(): 28 | log.info(f"Preparing data for task {task_name}.") 29 | tasks[task_name]["details"] = task_details 30 | raw_datasets = load_dataset(task_details.collection, task_name, cache_dir=cfg_impl.path) 31 | if not task_details.regression: 32 | if "label" in task_details and task_details.label is not None and len(task_details.label) > 0: 33 | label_list = [task_details.label] 34 | else: 35 | label_list = raw_datasets["train"].features["label"].names 36 | log.info(f"{task_name} has classes {label_list}.") 37 | tasks[task_name]["num_classes"] = len(label_list) 38 | else: 39 | tasks[task_name]["num_classes"] = 1 40 | label_list = None 41 | sentence_keys = task_details.structure 42 | 43 | def preprocess_function(examples): 44 | texts = get_sentences(examples, sentence_keys) 45 | result = tokenizer( 46 | *texts, 47 | max_length=max_seq_length, 48 | truncation=True, 49 | pad_to_multiple_of=cfg_impl.pad_to_multiple_of, 50 | ) 51 | 52 | if "label" in examples: 53 | result["labels"] = examples["label"] 54 | if task_name == "multirc": 55 | result["p_idx"] = [ex["paragraph"] for ex in examples["idx"]] 56 | result["q_idx"] = [ex["question"] for ex in examples["idx"]] 57 | result["a_idx"] = [ex["answer"] for ex in examples["idx"]] 58 | return result 59 | 60 | with main_process_first(): 61 | processed_datasets = raw_datasets.map( 62 | preprocess_function, 63 | batched=True, 64 | batch_size=1024, 65 | load_from_cache_file=True, 66 | remove_columns=raw_datasets["train"].column_names, 67 | desc="Running tokenizer on dataset", 68 | ) 69 | 70 | train_dataset = processed_datasets["train"] 71 | train_dataset.set_format("torch") 72 | assert cfg_eval.evaluation_set in ["validation", "test"] 73 | eval_dataset = processed_datasets[f"{cfg_eval.evaluation_set}_matched" if task_name == "mnli" else cfg_eval.evaluation_set] 74 | eval_dataset.set_format("torch") 75 | if task_name == "mnli": 76 | # Extra task loader for MNLI 77 | extra_eval_dataset = processed_datasets[f"{cfg_eval.evaluation_set}_mismatched"] 78 | extra_eval_dataset.set_format("torch") 79 | else: 80 | extra_eval_dataset = None 81 | 82 | train_dataloader, eval_dataloader, extra_eval_dataloader = _build_dataloaders( 83 | tokenizer, 84 | train_dataset, 85 | eval_dataset, 86 | extra_eval_dataset, 87 | cfg_impl, 88 | ) 89 | 
90 | tasks[task_name]["trainloader"] = train_dataloader 91 | tasks[task_name]["validloader"] = eval_dataloader 92 | tasks[task_name]["extra_validloader"] = extra_eval_dataloader 93 | 94 | # Log overviews so we always know what's going on with weird tokenization tricks 95 | random_sentence_idx = torch.randint(0, len(train_dataset), (1,)).item() 96 | input_data = train_dataset[random_sentence_idx]["input_ids"].squeeze() # squeeze because hf has leading dim 97 | 98 | log.info(f"Random sentence with seq_length {tokenizer.model_max_length} from trainset of size {len(train_dataset):,}: ...") 99 | log.info(tokenizer.batch_decode(input_data[None])[0]) 100 | log.info("... is tokenized into ...") 101 | log.info("_".join(tokenizer.decode(t) for t in input_data)) 102 | if label_list is not None: 103 | log.info(f"Correct Answer: {label_list[train_dataset[random_sentence_idx]['labels']]}") 104 | else: 105 | log.info(f"Correct Answer: {train_dataset[random_sentence_idx]['labels']}") 106 | random_sentence_idx = torch.randint(0, len(eval_dataset), (1,)).item() 107 | input_data = eval_dataset[random_sentence_idx]["input_ids"].squeeze() # squeeze because hf has leading dim 108 | 109 | log.info(f"Random sentence from validset of size {len(eval_dataset):,}: ...") 110 | log.info(tokenizer.batch_decode(input_data[None])[0]) 111 | if label_list is not None: 112 | log.info(f"Correct Answer: {label_list[eval_dataset[random_sentence_idx]['labels']]}") 113 | else: 114 | log.info(f"Correct Answer: {eval_dataset[random_sentence_idx]['labels']}") 115 | 116 | return tasks 117 | 118 | 119 | def _build_dataloaders(tokenizer, train_dataset, eval_dataset, extra_eval_dataset, cfg_impl): 120 | """Construct dataloaders according to cfg_impl settings. Validation samplers always repeat on all devices.""" 121 | train_dataloader = prepare_downstream_dataloader(train_dataset, tokenizer, "training", cfg_impl) 122 | eval_dataloader = prepare_downstream_dataloader(eval_dataset, tokenizer, "eval", cfg_impl) 123 | if extra_eval_dataset is not None: 124 | extra_eval_dataloader = prepare_downstream_dataloader(extra_eval_dataset, tokenizer, "eval", cfg_impl) 125 | else: 126 | extra_eval_dataloader = None 127 | return train_dataloader, eval_dataloader, extra_eval_dataloader 128 | -------------------------------------------------------------------------------- /bert/cramming/data/utils.py: -------------------------------------------------------------------------------- 1 | """Various utilities.""" 2 | import hashlib 3 | import json 4 | import logging 5 | import os 6 | import shutil 7 | import time 8 | 9 | import datasets 10 | from omegaconf import OmegaConf 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def checksum_config(cfg): 16 | """This is more annoying that I thought it would be. But a json-dump of the config file is hashed and used as checksum.""" 17 | bindump = json.dumps(OmegaConf.to_container(cfg, resolve=True), sort_keys=True).encode("utf-8") 18 | checksum_of_config = hashlib.md5(bindump).hexdigest() 19 | if "tokenizer" in cfg and "vocab_size" in cfg: 20 | checksum_of_config = f"{cfg.tokenizer}x{cfg.vocab_size}_{checksum_of_config}" 21 | return checksum_of_config 22 | 23 | 24 | def stage_dataset(data_directory_path, local_staging_dir): 25 | """This is a mess because our network drives are a mess. 
You might not need this.""" 26 | data_directory_name = os.path.basename(data_directory_path) 27 | new_path = os.path.join(local_staging_dir, data_directory_name) 28 | if os.path.isdir(data_directory_path): 29 | try: 30 | if not os.path.isdir(new_path): 31 | try: 32 | shutil.copytree(data_directory_path, new_path) 33 | log.info(f"Staging dataset to {new_path}...") 34 | except FileExistsError: 35 | log.info(f"Concurrent writing to {new_path} detected. Stopping staging in this run and waiting for 300 seconds.") 36 | time.sleep(300) 37 | else: 38 | log.info(f"Using staged dataset found at {new_path}...") 39 | 40 | for retries in range(15): 41 | _, _, free = shutil.disk_usage(new_path) 42 | used = _get_size(new_path) 43 | try: 44 | tokenized_dataset = datasets.load_from_disk(new_path) 45 | log.info(f"Staged dataset size is {used / 1024**3:,.3f}GB. {free/ 1024**3:,.3f}GB free in staging dir.") 46 | return new_path 47 | except FileNotFoundError: 48 | log.info( 49 | f"Staged dataset is incomplete. Size is {used / 1024**3:,.3f}GB. " 50 | f" Waiting for 60 more secs for staging race condition." 51 | ) 52 | time.sleep(60) 53 | log.info(f"Staging dataset corrupted. Falling back to network drive location {data_directory_path}") 54 | return data_directory_path 55 | 56 | except Exception as e: # noqa 57 | log.info(f"Staging failed with error {e}. Falling back to network drive location {data_directory_path}") 58 | return data_directory_path 59 | else: 60 | raise FileNotFoundError(f"Dataset not yet generated or not found at {data_directory_path}.") 61 | 62 | 63 | def _get_size(start_path="."): 64 | """Compute the size of a directory path. Why is this not in the standard library?""" 65 | """Stolen from https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python""" 66 | total_size = 0 67 | for dirpath, dirnames, filenames in os.walk(start_path): 68 | for f in filenames: 69 | fp = os.path.join(dirpath, f) 70 | # skip if it is symbolic link 71 | if not os.path.islink(fp): 72 | total_size += os.path.getsize(fp) 73 | return total_size 74 | -------------------------------------------------------------------------------- /bert/efficient_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/efficient_training/__init__.py -------------------------------------------------------------------------------- /bert/efficient_training/extract_il_losses.py: -------------------------------------------------------------------------------- 1 | """Extracts the RHO-Loss irreducible losses from a model.""" 2 | import logging 3 | import os 4 | import pickle 5 | import time 6 | from collections import defaultdict 7 | from typing import Optional 8 | 9 | import hydra 10 | import numpy as np 11 | import torch 12 | import wandb 13 | from tqdm import tqdm 14 | 15 | import cramming 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | def save_chunk(chunk_data: dict, chunk_number: int, path: str) -> None: 21 | with open(os.path.join(path, f"dict_chunk_{chunk_number}.pkl"), "wb") as file: 22 | pickle.dump(chunk_data, file) 23 | 24 | 25 | def get_example_ids_from_batch(examples_counter, len_batch: int, len_dataset: Optional[int] = None) -> list[int]: 26 | example_ids = examples_counter + np.arange(len_batch) 27 | example_ids = example_ids.tolist() 28 | if len_dataset is not None: 29 | example_ids = [example_id % len_dataset for example_id in example_ids] 30 
| return example_ids 31 | 32 | 33 | def save_losses_of_il_model(cfg, setup): 34 | """This function controls the central training loop.""" 35 | tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg) 36 | model = cramming.construct_model(cfg.arch, tokenizer.vocab_size) 37 | train_set, validation_set, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.train) 38 | model_engine, _, _, train_dataloader, validation_dataloader = cramming.load_backend( 39 | model, 40 | train_set, 41 | validation_set, 42 | tokenizer, 43 | cfg.train, 44 | cfg.impl, 45 | setup=setup, 46 | ) 47 | model_engine.load_checkpoint(cfg_arch, model_file) 48 | model_engine.eval() 49 | iterable_data = enumerate(tqdm(train_dataloader)) 50 | path = os.path.join(cfg.base_dir, cfg.name, "examples_to_loss") 51 | os.makedirs(path, exist_ok=True) 52 | log.info(f"Saving losses of IL model to {path}") 53 | train_time = time.time() # Crude time measurement for print_loss_every_nth_step 54 | stats = defaultdict(list) 55 | 56 | # Launch training 57 | examples_to_loss_dict = {} 58 | chunk_counter = 0 59 | example_ids = [] 60 | examples_counter = 0 61 | with torch.no_grad(): 62 | for step, batch in iterable_data: 63 | # Heavy lifting is moved to engines 64 | example_ids_in_batch = get_example_ids_from_batch(examples_counter, len(batch["input_ids"])) 65 | example_ids.extend(example_ids_in_batch) 66 | device_batch = model_engine.to_device(batch) 67 | examples_counter += len(batch["input_ids"]) 68 | with torch.autocast(**model_engine.amp_settings): 69 | losses = model_engine.model.forward_all_losses(**device_batch) 70 | examples_to_loss_dict.update(dict(zip(example_ids_in_batch, losses))) 71 | if step > 0 and step % cfg.impl.saving_interval == 0: 72 | examples_to_loss_dict = {k: v.cpu().tolist() for k, v in examples_to_loss_dict.items()} 73 | save_chunk(examples_to_loss_dict, chunk_counter, path) 74 | examples_to_loss_dict = {} # free up RAM 75 | chunk_counter += 1 76 | if step > 0 and step % cfg.impl.print_loss_every_nth_step == 0: 77 | stats["train_time"] += [(time.time() - train_time) / cfg.impl.print_loss_every_nth_step] 78 | stats["step"] += [step] 79 | stats["examples_counter"] += [examples_counter] 80 | train_time = time.time() 81 | wandb.log({k: v[-1] for k, v in stats.items()}, step=stats["step"][-1] if "step" in stats else None) 82 | examples_to_loss_dict = {k: v.cpu().tolist() for k, v in examples_to_loss_dict.items()} 83 | save_chunk(examples_to_loss_dict, chunk_counter, path) 84 | stats["train_time"] += [(time.time() - train_time) / cfg.impl.print_loss_every_nth_step] 85 | stats["step"] += [step] 86 | stats["examples_counter"] += [examples_counter] 87 | wandb.log({k: v[-1] for k, v in stats.items()}, step=stats["step"][-1] if "step" in stats else None) 88 | 89 | 90 | @hydra.main(config_path="../cramming/config", config_name="cfg_save_losses", version_base="1.1") 91 | def launch(cfg): 92 | cramming.utils.main_launcher(cfg, save_losses_of_il_model, job_name="save_losses") 93 | 94 | 95 | if __name__ == "__main__": 96 | launch() 97 | -------------------------------------------------------------------------------- /bert/efficient_training/layer_drop.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def sample_active_layers(seconds: int, step: int, cfg) -> tuple[list[int], float]: 7 | total_layers = cfg.arch.num_transformer_layers 8 | max_drop_prob = _get_drop_prob(seconds, step, cfg) 9 | 10 | active_layers: 
list[int] = [] 11 | for layer_i in range(0, total_layers): 12 | layer_drop_prob = max_drop_prob / total_layers * (layer_i + 1) 13 | if torch.bernoulli(torch.tensor(1.0 - layer_drop_prob)): 14 | active_layers.append(layer_i) 15 | return active_layers, max_drop_prob 16 | 17 | 18 | def _get_drop_prob(seconds: int, step: int, cfg) -> float: 19 | if cfg.budget == "steps": 20 | t = step 21 | T = cfg.train.steps 22 | else: 23 | budget_seconds = cfg.budget * 60 * 60 24 | t = seconds 25 | T = budget_seconds 26 | gamma = cfg.arch.layer_drop.gamma_factor / T 27 | min_theta = cfg.arch.layer_drop.max_theta 28 | return 1 - (min_theta + (1 - min_theta) * math.exp(-gamma * t)) 29 | -------------------------------------------------------------------------------- /bert/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "ntng-bert" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jean Kaddour"] 6 | readme = "README.md" 7 | packages = [ 8 | { include = "cramming" }, 9 | { include = "efficient_training" }, 10 | { include = "rst" }, 11 | ] 12 | 13 | [tool.poetry.dependencies] 14 | python = "~3.10" 15 | hydra-core = ">=1.1" 16 | torch = "~1.13+cu117" 17 | datasets = "^2.13.1" 18 | tokenizers = "^0.13.3" 19 | transformers = "^4.30.2" 20 | evaluate = "^0.4.0" 21 | psutil = "^5.9.5" 22 | einops = "^0.6.1" 23 | zstandard = "^0.21.0" 24 | wandb = "^0.15.5" 25 | scipy = "^1.11.1" 26 | scikit-learn = "^1.3.0" 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | black = "^23.7.0" 30 | mypy = "^1.4.1" 31 | isort = "^5.12.0" 32 | pytest = "^7.4.0" 33 | 34 | [[tool.poetry.source]] 35 | name = "PyPI" 36 | priority = "primary" 37 | 38 | [[tool.poetry.source]] 39 | name = "pytorch_cuda_117" 40 | url = "https://download.pytorch.org/whl/cu117" 41 | priority = "supplemental" 42 | 43 | [build-system] 44 | requires = ["poetry-core"] 45 | build-backend = "poetry.core.masonry.api" 46 | 47 | 48 | [tool.black] 49 | line-length = 140 50 | 51 | [tool.isort] 52 | known_third_party = ["wandb"] 53 | profile = "black" 54 | 55 | [tool.pytest.ini_options] 56 | pythonpath = ["."] 57 | -------------------------------------------------------------------------------- /bert/rst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/rst/__init__.py -------------------------------------------------------------------------------- /bert/rst/get_RSTs_from_wandb.py: -------------------------------------------------------------------------------- 1 | """ 2 | With this script, you can extract RSTs directly from wandb. 3 | You may need to install wandb first and adjust the WANDB_PROJECT variable below. 
4 | """ 5 | import matplotlib.pyplot as plt 6 | import wandb 7 | 8 | WANDB_PROJECT = "" # add project name here 9 | 10 | api = wandb.Api(api_key="") 11 | runs = api.runs(path=WANDB_PROJECT) 12 | 13 | 14 | # %% 15 | times = {} 16 | for run in runs: 17 | bs = run.config["train"]["batch_size"] 18 | steps = [ 19 | 850, 20 | 2000, 21 | 2800, 22 | 3500, 23 | 4000, 24 | 4400, 25 | 4800, 26 | 5100, 27 | 5450, 28 | 5700, 29 | 6000, 30 | 6200, 31 | 6400, 32 | 6650, 33 | 6800, 34 | 7100, 35 | ] 36 | df = run.history(keys=["train_time"]) 37 | df = df.set_index("_step") 38 | values = df.loc[steps].values.reshape(-1).tolist() 39 | val_strs = [f"{x:.4f}" for x in values] 40 | print(f"{bs}: [{', '.join(val_strs)},],") 41 | times[bs] = values 42 | 43 | # As a test, plot the times to check it looks similar to the wandb plot. 44 | for bs, times in times.items(): 45 | xs = [] 46 | ys = [] 47 | for i, time in enumerate(times): 48 | xs.append(i + 0.5) 49 | xs.append(i + 1.5) 50 | ys.append(time) 51 | ys.append(time) 52 | print("", bs, len(times)) 53 | plt.plot(xs, ys, label=f"bs{bs}") 54 | plt.legend() 55 | plt.show() 56 | -------------------------------------------------------------------------------- /bert/rst/saved_rsts.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | 3 | Task = Literal["bert"] 4 | 5 | # BERT 16 layer on NVIDIA 3090 6 | # FORWARD AND BACKWARD; here, we track the time of a minibatch size 7 | NUM_LAYERS_AND_BATCH_TO_TIME_FB_BERT = { 8 | 1536: [ 9 | 0.02521, 10 | 0.03837, 11 | 0.05177, 12 | 0.06508, 13 | 0.07847, 14 | 0.09188, 15 | 0.01052, 16 | 0.1186, 17 | 0.1319, 18 | 0.1454, 19 | 0.1589, 20 | 0.1722, 21 | 0.1857, 22 | 0.1988, 23 | 0.212, 24 | 0.2253, 25 | ], 26 | } 27 | 28 | # BERT 16 layer on NVIDIA 3090 29 | # FORWARD ONLY (relevant for RhoLoss); here, we track the time of a microbatch size 30 | NUM_LAYERS_AND_BATCH_TO_TIME_F_BERT = {128: 0.09285} 31 | 32 | 33 | def get_time_per_step( 34 | batch_size: int, num_active_layers: int, task: Task = "bert", forward_only: bool = False, microbatch_size: Optional[int] = None 35 | ) -> float: 36 | if task == "bert": 37 | if forward_only: 38 | time = NUM_LAYERS_AND_BATCH_TO_TIME_F_BERT[batch_size] 39 | else: 40 | time = NUM_LAYERS_AND_BATCH_TO_TIME_FB_BERT[batch_size][num_active_layers - 1] 41 | 42 | if microbatch_size is None: 43 | return time 44 | else: 45 | if microbatch_size % 128 != 0 or microbatch_size <= 0: 46 | raise ValueError("Microbatch size must be multiple of 128") 47 | return time * microbatch_size / 128.0 48 | else: 49 | raise NotImplementedError("Only BERT is supported in this module.") 50 | -------------------------------------------------------------------------------- /bert/validate_bert.py: -------------------------------------------------------------------------------- 1 | """Evaluates a pretrained model on the pretraining validation set. 2 | 3 | Optionally updates the wandb run with the validation loss. 
4 | """ 5 | 6 | import logging 7 | import sys 8 | 9 | import hydra 10 | import torch 11 | import wandb 12 | from wandb.apis.public import Run 13 | 14 | import cramming 15 | 16 | log = logging.getLogger(__name__) 17 | from cramming.utils import validate 18 | 19 | 20 | def main_eval_process(cfg, setup): 21 | """This function controls the central training loop.""" 22 | 23 | tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg) 24 | model = cramming.construct_model(cfg_arch, tokenizer.vocab_size) 25 | train_dataset, validation_set, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.train) 26 | if cfg.truncate_dataset > 0: 27 | train_dataset = train_dataset.select(range(min(cfg.truncate_dataset, len(train_dataset)))) 28 | validation_set = validation_set.select(range(min(cfg.truncate_dataset, len(validation_set)))) 29 | log.info(f"Train dataset size: {len(train_dataset)}, validation set size: {len(validation_set)}") 30 | model_engine, _, _, _, validation_loader = cramming.load_backend( 31 | model, 32 | train_dataset, 33 | validation_set, 34 | tokenizer, 35 | cfg.train, 36 | cfg.impl, 37 | setup=setup, 38 | ) 39 | 40 | model_engine.load_checkpoint(cfg_arch, model_file) 41 | model_engine.eval() 42 | validation_loss = [validate(model_engine, validation_loader, model_engine.setup["device"])] 43 | 44 | log.info(f"Avg Validation loss: {validation_loss}") 45 | 46 | if cfg.wandb.resume is not None: 47 | print(f"Would you like to update existing run {cfg.wandb.resume}?") 48 | if _ask_yes_no(): 49 | logged_run: Run = wandb.Api().run(path=f"{cfg.wandb.entity}/{cfg.wandb.project}/{cfg.wandb.resume}") 50 | hour_to_log = logged_run.history(keys=["hours"])["hours"].values[-1] + 0.0001 51 | print(f"Logging at hour {hour_to_log:.3f}") 52 | wandb.log({"validation_loss": validation_loss, "hours": hour_to_log}) 53 | else: 54 | print("Not logging") 55 | 56 | 57 | def _ask_yes_no() -> bool: 58 | while True: 59 | sys.stdout.write("y/n:") 60 | response = input().lower() 61 | if response == "y": 62 | return True 63 | if response == "n": 64 | return False 65 | 66 | 67 | @hydra.main(config_path="cramming/config", config_name="cfg_eval_pt", version_base="1.1") 68 | def launch(cfg): 69 | cramming.utils.main_launcher(cfg, main_eval_process, job_name="eval_pt_task") 70 | 71 | 72 | if __name__ == "__main__": 73 | launch() 74 | -------------------------------------------------------------------------------- /t5/.gitignore: -------------------------------------------------------------------------------- 1 | .neptune/ 2 | data 3 | data/ 4 | .DS_Store 5 | .vscode/ -------------------------------------------------------------------------------- /t5/README.md: -------------------------------------------------------------------------------- 1 | # T5 experiments 2 | The T5 experiments are based off the excellent [nanoT5](https://github.com/PiotrNawrot/nanoT5) repository, see [LICENSE](LICENSE). 3 | 4 | ## Environment setup 5 | 6 | Following nanoT5's setup: 7 | 8 | ``` 9 | conda create -n ntng_t5 python=3.8 10 | conda activate ntng_t5 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | The following commands result in the following [pip freeze](assets/pip_freeze.txt) as of 24.07.2023. We also include our [lscpu](assets/lscpu.txt) and [nvidia-smi](assets/nvidia_smi.txt). 15 | 16 | ## Commands for each experiment 17 | 18 | By default the experiments are run for 24 hours. For more details check the default config with all hyperparameters [here](t5/configs/default.json). 
We include the RST measurements [here](t5/utils/train.py). 19 | 20 | ### Baseline 21 | 22 | ``` 23 | python -m t5.train stacking.typ=none 24 | ``` 25 | 26 | ### Stacking 27 | 28 | ``` 29 | python -m t5.train stacking.typ=stack 30 | ``` 31 | 32 | ### Layer Dropping 33 | 34 | ``` 35 | python -m t5.train stacking.typ=drop optim.base_lr=1e-2 stacking.gamma_factor=20 36 | ``` 37 | 38 | ### Sophia 39 | 40 | ``` 41 | python -m t5.train stacking.typ=none optim.name=sophia optim.rho=1e-2 optim.base_lr=1e-3 sophia_freq=10 42 | ``` 43 | 44 | ### Lion 45 | 46 | ``` 47 | python -m t5.train stacking.typ=none optim.name=lion optim.base_lr=7.5e-4 48 | ``` 49 | 50 | ### Fine-Tuning 51 | 52 | We fine-tune the models in the original [nanoT5 repository](https://github.com/PiotrNawrot/nanoT5) using the following command: 53 | 54 | ``` 55 | 56 | python -m nanoT5.main task=ft google/t5-v1_1-base model.random_init=false model.checkpoint_path="/path/to/pytorch_model.bin 57 | ``` 58 | 59 | All our models do not modify the original T5 architecture, so all checkpoints trained in this repository are compabible with the original nanoT5 repository. -------------------------------------------------------------------------------- /t5/assets/lscpu.txt: -------------------------------------------------------------------------------- 1 | Architecture: x86_64 2 | CPU op-mode(s): 32-bit, 64-bit 3 | Byte Order: Little Endian 4 | CPU(s): 128 5 | On-line CPU(s) list: 0-127 6 | Thread(s) per core: 1 7 | Core(s) per socket: 64 8 | Socket(s): 2 9 | NUMA node(s): 8 10 | Vendor ID: AuthenticAMD 11 | CPU family: 25 12 | Model: 1 13 | Model name: AMD EPYC 7763 64-Core Processor 14 | Stepping: 1 15 | CPU MHz: 2445.534 16 | BogoMIPS: 4891.06 17 | Virtualization: AMD-V 18 | L1d cache: 32K 19 | L1i cache: 32K 20 | L2 cache: 512K 21 | L3 cache: 32768K 22 | NUMA node0 CPU(s): 0-15 23 | NUMA node1 CPU(s): 16-31 24 | NUMA node2 CPU(s): 32-47 25 | NUMA node3 CPU(s): 48-63 26 | NUMA node4 CPU(s): 64-79 27 | NUMA node5 CPU(s): 80-95 28 | NUMA node6 CPU(s): 96-111 29 | NUMA node7 CPU(s): 112-127 30 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca 31 | -------------------------------------------------------------------------------- /t5/assets/nvidia_smi.txt: -------------------------------------------------------------------------------- 1 | Mon Jul 24 11:04:03 2023 2 | +-----------------------------------------------------------------------------+ 3 | | NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 | 4 | |-------------------------------+----------------------+----------------------+ 5 | | GPU Name Persistence-M| Bus-Id 
Disp.A | Volatile Uncorr. ECC | 6 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 7 | | | | MIG M. | 8 | |===============================+======================+======================| 9 | | 0 NVIDIA A100-SXM... On | 00000000:01:00.0 Off | 0 | 10 | | N/A 40C P0 79W / 500W | 0MiB / 81920MiB | 0% Default | 11 | | | | Disabled | 12 | +-------------------------------+----------------------+----------------------+ 13 | 14 | +-----------------------------------------------------------------------------+ 15 | | Processes: | 16 | | GPU GI CI PID Type Process name GPU Memory | 17 | | ID ID Usage | 18 | |=============================================================================| 19 | | No running processes found | 20 | +-----------------------------------------------------------------------------+ 21 | -------------------------------------------------------------------------------- /t5/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | transformers 5 | neptune 6 | pdbpp 7 | notebook 8 | protobuf==3.20.* 9 | pyyaml 10 | pynvml 11 | hydra-core 12 | evaluate 13 | nltk 14 | absl-py 15 | rouge_score 16 | torch>=1.13.1,<=2.0.1 17 | hydra_colorlog 18 | wandb 19 | -------------------------------------------------------------------------------- /t5/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/t5/t5/__init__.py -------------------------------------------------------------------------------- /t5/t5/configs/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - task: pt 4 | - local_env: default 5 | 6 | # Experiment args 7 | mode: 'pt' 8 | device: gpu 9 | precision: 'no' 10 | gpus: 1 11 | eval_only: false 12 | predict_only: false 13 | seed: 2137 14 | fine_tune: false 15 | debug: false 16 | 17 | # NTNG args 18 | budget: 24 19 | every_seconds: 86400 20 | sophia_freq: 10 21 | 22 | stacking: 23 | typ: none # {stack, drop} 24 | num_initial_layers: 3 25 | num_layers_to_add: 12 26 | step_fractions: [0.125, 0.3] 27 | doubling: true 28 | gamma_factor: 20 29 | 30 | # Rest of nanoT5 args 31 | model: 32 | klass: my_t5 33 | name: 'google/t5-v1_1-base' 34 | overwrite: 35 | dropout_rate: 0.0 36 | add_config: 37 | share_positional_bias: False 38 | checkpoint_path: '' 39 | random_init: true 40 | compile: true # Pytorch 2.0 41 | 42 | data: 43 | input_length: 512 44 | mlm_probability: 0.15 45 | mean_noise_span_length: 3.0 46 | num_workers: 8 47 | shuffle_buffer_size: 1000 48 | 49 | optim: 50 | name: adamwscale # {sophia, lion} 51 | base_lr: 2e-2 52 | batch_size: 144 53 | total_steps: 65536 54 | epochs: -1 # If it's > 0 it overwrites total_steps 55 | warmup_steps: 10000 56 | lr_scheduler: cosine-budget 57 | weight_decay: 0.0 58 | grad_clip: 1.0 59 | grad_acc: 2 60 | final_cosine: 1e-5 61 | rho: 2e-2 62 | 63 | eval: 64 | every_steps: 500000 # Checkpoint in the end 65 | steps: 500 66 | 67 | checkpoint: 68 | every_steps: 500000 # Checkpoint in the end 69 | 70 | logging: 71 | neptune: false 72 | neptune_creds: 73 | project: 74 | api_token: 75 | tags: 76 | wandb: false 77 | wandb_creds: 78 | name: 79 | project: 80 | entity: 81 | prefix: '' 82 | every_steps: 100 83 | grad_l2: true 84 | weights_l2: true 85 | 86 | hydra: 87 | job: 88 | chdir: True 
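The Hydra defaults above are what the CLI overrides listed in t5/README.md (for example `stacking.typ=drop optim.base_lr=1e-2`) act on. As a rough illustration only, the hypothetical snippet below merges such dot-list overrides into this file with OmegaConf; in the actual experiments `python -m t5.train ...` lets Hydra perform the full composition (including the `task` and `local_env` defaults), so this sketch is just a way to inspect the resulting values.

```python
# Hedged sketch (not part of the repository): preview how dot-list overrides from the README
# combine with the defaults in t5/t5/configs/default.yaml. The path is relative to the repo
# root and assumed; Hydra, not this snippet, does the real composition at launch time.
from omegaconf import OmegaConf

defaults = OmegaConf.load("t5/t5/configs/default.yaml")
overrides = OmegaConf.from_dotlist(["stacking.typ=drop", "optim.base_lr=1e-2", "stacking.gamma_factor=20"])
args = OmegaConf.merge(defaults, overrides)

print(args.stacking.typ)           # drop
print(args.optim.base_lr)          # 1e-2
print(args.stacking.gamma_factor)  # 20
```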
-------------------------------------------------------------------------------- /t5/t5/configs/local_env/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | run: 5 | dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags} -------------------------------------------------------------------------------- /t5/t5/configs/task/debug.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | debug: true 4 | 5 | logging: 6 | every_steps: 2 7 | 8 | stacking: 9 | step_fractions: [0.0001, 0.0002] 10 | 11 | eval: 12 | steps: 5 13 | -------------------------------------------------------------------------------- /t5/t5/configs/task/ft.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | mode: 'ft' 4 | 5 | logging: 6 | prefix: 'ft/' 7 | 8 | data: 9 | max_seq_len: 1024 10 | max_target_len: 128 11 | max_num_instances_per_task: 100 12 | add_task_name: False 13 | add_task_definition: True 14 | num_pos_examples: 2 15 | num_neg_examples: 0 16 | add_explanation: False 17 | tk_instruct: False 18 | exec_file_path: ./adaptive_moe/utils/sni_dataset.py 19 | data_dir: /home/hpcnawr1/data/natural-instructions/splits/default 20 | task_dir: /home/hpcnawr1/data/natural-instructions/tasks 21 | 22 | optim: 23 | name: adamw 24 | base_lr: 5e-5 25 | batch_size: 8 26 | epochs: 2 27 | warmup_steps: 0 28 | lr_scheduler: constant 29 | weight_decay: 0.0 30 | grad_clip: 0.0 31 | grad_acc: 1 32 | 33 | eval: 34 | steps: 200 -------------------------------------------------------------------------------- /t5/t5/configs/task/pt.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /t5/t5/models/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omegaconf import open_dict 3 | from transformers import AutoConfig, AutoTokenizer, T5ForConditionalGeneration 4 | 5 | from .t5 import MyT5 6 | 7 | 8 | def get_model(args, config): 9 | klass = { 10 | "t5": T5ForConditionalGeneration, 11 | "my_t5": MyT5, 12 | }[args.model.klass] 13 | 14 | if args.model.checkpoint_path: 15 | model = klass(config) 16 | model.load_state_dict(torch.load(args.model.checkpoint_path)) 17 | elif args.model.random_init: 18 | model = klass(config) 19 | else: 20 | model = klass.from_pretrained( 21 | args.model.name, 22 | config=config, 23 | ) 24 | 25 | with open_dict(args): 26 | args.n_all_param = sum([p.nelement() for p in model.parameters()]) 27 | 28 | return model 29 | 30 | 31 | def get_config(args): 32 | config = AutoConfig.from_pretrained( 33 | args.model.name, 34 | ) 35 | 36 | if hasattr(args.model, "overwrite"): 37 | for k, v in args.model.overwrite.items(): 38 | assert hasattr(config, k), f"config does not have attribute {k}" 39 | setattr(config, k, v) 40 | 41 | if hasattr(args.model, "add_config"): 42 | for k, v in args.model.add_config.items(): 43 | assert not hasattr(config, k), f"config already has attribute {k}" 44 | setattr(config, k, v) 45 | 46 | return config 47 | 48 | 49 | def get_tokenizer(args): 50 | tokenizer = AutoTokenizer.from_pretrained(args.model.name, use_fast=True) 51 | tokenizer.model_max_length = int(1e9) 52 | 53 | return tokenizer 54 | 
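The helpers above encode a small contract: `model.overwrite` may only touch attributes that already exist on the Hugging Face config, while `model.add_config` may only introduce new ones. The hypothetical snippet below replays that contract by hand for the two defaults set in `t5/t5/configs/default.yaml`; it is a sketch for intuition, not repository code, and it assumes network access to fetch the config.

```python
# Hedged sketch (not part of the repository): the overwrite/add_config contract from get_config.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("google/t5-v1_1-base")

# model.overwrite -> the attribute must already exist (dropout_rate is a standard T5Config field).
assert hasattr(config, "dropout_rate")
setattr(config, "dropout_rate", 0.0)

# model.add_config -> the attribute must NOT exist yet (share_positional_bias is custom here,
# presumably consumed later by MyT5 in t5/t5/models/t5.py).
assert not hasattr(config, "share_positional_bias")
setattr(config, "share_positional_bias", False)
```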
-------------------------------------------------------------------------------- /t5/t5/train.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import hydra 4 | from accelerate import Accelerator 5 | from omegaconf import open_dict 6 | from torch import compile, no_grad 7 | 8 | from .models import get_config, get_model, get_tokenizer 9 | from .utils.data import get_dataloaders 10 | from .utils.general import setup_basics 11 | from .utils.optim import get_lr_scheduler, get_optimizer 12 | from .utils.train import eval, predict, train 13 | 14 | 15 | @hydra.main(config_path="configs", config_name="default", version_base="1.1") 16 | def main(args): 17 | accelerator = Accelerator( 18 | cpu=args.device == "cpu", 19 | mixed_precision=args.precision, 20 | ) 21 | logger = setup_basics(accelerator, args) 22 | config = get_config(args) 23 | model = get_model(args, config) 24 | tokenizer = get_tokenizer(args) 25 | optimizer = get_optimizer(model, args) 26 | lr_scheduler = get_lr_scheduler(optimizer, args, logger) 27 | train_dataloader, test_dataloader = get_dataloaders(tokenizer, config, args) 28 | 29 | logger.log_args(args) 30 | 31 | ( 32 | model, 33 | optimizer, 34 | lr_scheduler, 35 | train_dataloader, 36 | test_dataloader, 37 | ) = accelerator.prepare( 38 | model, optimizer, lr_scheduler, train_dataloader, test_dataloader 39 | ) 40 | 41 | if args.model.compile: 42 | if args.stacking.typ == "none": 43 | model = compile(model) 44 | else: 45 | model.lm_head = compile(model.lm_head) 46 | model.shared = compile(model.shared) 47 | model.encoder.embed_tokens = compile(model.encoder.embed_tokens) 48 | model.decoder.embed_tokens = compile(model.decoder.embed_tokens) 49 | model.decoder.final_layer_norm = compile(model.decoder.final_layer_norm) 50 | for i in range(len(model.encoder.block)): 51 | model.encoder.block[i] = compile(model.encoder.block[i]) 52 | 53 | for i in range(len(model.decoder.block)): 54 | model.decoder.block[i] = compile(model.decoder.block[i]) 55 | 56 | with open_dict(args): 57 | args.start_time = time.time() 58 | args.current_train_step = 1 59 | args.current_epoch = 1 60 | args.last_log = time.time() 61 | args.seconds_counter = 0.0 62 | args.fake_step = 0 63 | args.eval_cou = 1 64 | args.check_cou = 1 65 | 66 | if args.eval_only: 67 | model.eval() 68 | with no_grad(): 69 | eval(model, test_dataloader, logger, args, tokenizer) 70 | elif args.predict_only: 71 | model.eval() 72 | with no_grad(): 73 | predict(model, test_dataloader, logger, args, tokenizer) 74 | else: 75 | train( 76 | model, 77 | train_dataloader, 78 | test_dataloader, 79 | accelerator, 80 | lr_scheduler, 81 | optimizer, 82 | logger, 83 | args, 84 | tokenizer, 85 | ) 86 | 87 | logger.finish() 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /t5/t5/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/t5/t5/utils/__init__.py -------------------------------------------------------------------------------- /t5/t5/utils/data.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from datasets.iterable_dataset import IterableDataset 3 | from omegaconf import open_dict 4 | from torch.utils.data import DataLoader 5 | 6 | from .copied import ( 7 | DataCollatorForNI, 8 | 
DataCollatorForT5MLM, 9 | compute_input_and_target_lengths, 10 | tokenize_function, 11 | ) 12 | 13 | 14 | def load_dataset_splits(args): 15 | if args.mode == "pt": 16 | dataset = datasets.load_dataset( 17 | "c4", 18 | "en", 19 | streaming=True, 20 | ) 21 | 22 | dataset = dataset.remove_columns(["timestamp", "url"]) 23 | 24 | dataset_splits = { 25 | "train": dataset["train"], 26 | "test": dataset["validation"], 27 | } 28 | 29 | assert ( 30 | dataset["train"].n_shards == 1024 31 | ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader" 32 | elif args.mode == "ft": 33 | dataset_splits = datasets.load_dataset( 34 | args.data.exec_file_path, 35 | data_dir=args.data.data_dir, 36 | task_dir=args.data.task_dir, 37 | max_num_instances_per_task=args.data.max_num_instances_per_task, 38 | max_num_instances_per_eval_task=args.data.max_num_instances_per_task, 39 | ) 40 | else: 41 | raise NotImplementedError 42 | 43 | return dataset_splits 44 | 45 | 46 | def process_dataset(dataset_splits, args, tokenizer): 47 | if args.mode == "pt": 48 | final_datasets = {} 49 | 50 | for split, dataset_split in dataset_splits.items(): 51 | 52 | # We increase the input_length, because instead of masking tokens T5 replaces 53 | # masked spans with a single token, therefore to avoid padding we need to have 54 | # longer sequences at the start, before masking 55 | before_mask_input_length, target_length = compute_input_and_target_lengths( 56 | inputs_length=args.data.input_length, 57 | noise_density=args.data.mlm_probability, 58 | mean_noise_span_length=args.data.mean_noise_span_length, 59 | ) 60 | 61 | with open_dict(args): 62 | args.data.before_mask_input_length = before_mask_input_length 63 | args.data.target_length = target_length 64 | 65 | dataset_split = dataset_split.map( 66 | tokenize_function, 67 | batched=True, 68 | fn_kwargs={ 69 | "tokenizer": tokenizer, 70 | "in_length": before_mask_input_length, 71 | }, 72 | remove_columns=["text"], 73 | ) 74 | 75 | dataset_split = dataset_split.shuffle( 76 | seed=args.seed, buffer_size=args.data.shuffle_buffer_size 77 | ) 78 | final_datasets[split] = dataset_split 79 | elif args.mode == "ft": 80 | final_datasets = dataset_splits 81 | else: 82 | raise NotImplementedError 83 | 84 | return final_datasets 85 | 86 | 87 | def get_data_collator(tokenizer, config, args): 88 | if args.mode == "pt": 89 | data_collator = DataCollatorForT5MLM( 90 | tokenizer=tokenizer, 91 | noise_density=args.data.mlm_probability, 92 | mean_noise_span_length=args.data.mean_noise_span_length, 93 | input_length=args.data.input_length, 94 | target_length=args.data.target_length, 95 | pad_token_id=config.pad_token_id, 96 | ) 97 | elif args.mode == "ft": 98 | data_collator = DataCollatorForNI( 99 | tokenizer, 100 | padding="longest", 101 | max_source_length=args.data.max_seq_len, 102 | max_target_length=args.data.max_target_len, 103 | label_pad_token_id=-100, 104 | pad_to_multiple_of=1, 105 | add_task_name=args.data.add_task_name, 106 | add_task_definition=args.data.add_task_definition, 107 | num_pos_examples=args.data.num_pos_examples, 108 | num_neg_examples=args.data.num_neg_examples, 109 | add_explanation=args.data.add_explanation, 110 | tk_instruct=args.data.tk_instruct, 111 | ) 112 | else: 113 | raise NotImplementedError 114 | 115 | return data_collator 116 | 117 | 118 | def get_dataloaders(tokenizer, config, args): 119 | dataset_splits = load_dataset_splits(args) 120 | dataset = process_dataset( 121 | dataset_splits=dataset_splits, args=args, tokenizer=tokenizer 
122 | ) 123 | data_collator = get_data_collator(tokenizer=tokenizer, config=config, args=args) 124 | 125 | is_iterable = isinstance(dataset["train"], IterableDataset) 126 | 127 | dataloaders = {} 128 | 129 | for split in ["train", "test"]: 130 | batch_size = args.optim.batch_size // args.optim.grad_acc 131 | 132 | if split in ["test"]: 133 | batch_size *= 2 134 | 135 | shuffle = (split == "train") and not is_iterable 136 | 137 | if args.mode == "ft" and split == "train": 138 | assert shuffle is True 139 | else: 140 | assert shuffle is False 141 | 142 | dataloaders[split] = DataLoader( 143 | dataset[split], 144 | shuffle=shuffle, 145 | collate_fn=data_collator, 146 | batch_size=batch_size, 147 | num_workers=args.data.num_workers, 148 | pin_memory=True, 149 | drop_last=False, 150 | ) 151 | 152 | # Add & Check args about data loaders 153 | with open_dict(args): 154 | if not is_iterable: 155 | args.data.train_batches = len(dataloaders["train"]) 156 | args.data.test_batches = len(dataloaders["test"]) 157 | 158 | if args.optim.epochs > 0: 159 | assert not is_iterable 160 | args.optim.total_steps = ( 161 | len(dataloaders["train"]) // args.optim.grad_acc 162 | ) * args.optim.epochs 163 | 164 | # We increase eval BS by 2, so decrease number of eval steps 165 | args.eval.corrected_steps = args.eval.steps / 2 166 | 167 | return dataloaders["train"], dataloaders["test"] 168 | -------------------------------------------------------------------------------- /t5/t5/utils/general.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from accelerate.utils import set_seed 5 | from hydra.utils import to_absolute_path 6 | from omegaconf import open_dict 7 | 8 | from .logging import Logger 9 | 10 | 11 | def check_args_and_env(args): 12 | assert args.optim.batch_size % args.optim.grad_acc == 0 13 | 14 | # Train log must happen before eval log 15 | assert args.eval.every_steps % args.logging.every_steps == 0 16 | 17 | if args.device == "gpu": 18 | assert torch.cuda.is_available(), "We use GPU to train/eval the model" 19 | 20 | assert not (args.eval_only and args.predict_only) 21 | 22 | if args.predict_only: 23 | assert args.mode == "ft" 24 | 25 | 26 | def opti_flags(args): 27 | # This lines reduce training step by 2.4x 28 | torch.backends.cuda.matmul.allow_tf32 = True 29 | torch.backends.cudnn.allow_tf32 = True 30 | 31 | 32 | def update_args_with_env_info(args): 33 | with open_dict(args): 34 | slurm_id = os.getenv("SLURM_JOB_ID") 35 | 36 | if slurm_id is not None: 37 | args.slurm_id = slurm_id 38 | else: 39 | args.slurm_id = "none" 40 | 41 | args.working_dir = os.getcwd() 42 | 43 | 44 | def update_paths(args): 45 | if args.mode == "ft": 46 | args.data.exec_file_path = to_absolute_path(args.data.exec_file_path) 47 | args.data.data_dir = to_absolute_path(args.data.data_dir) 48 | args.data.task_dir = to_absolute_path(args.data.task_dir) 49 | 50 | 51 | def setup_basics(accelerator, args): 52 | check_args_and_env(args) 53 | update_args_with_env_info(args) 54 | update_paths(args) 55 | opti_flags(args) 56 | 57 | # To skip scientific notation 58 | torch.set_printoptions( 59 | precision=3, 60 | sci_mode=False, 61 | ) 62 | 63 | if args.seed is not None: 64 | set_seed(args.seed) 65 | 66 | logger = Logger(args=args, accelerator=accelerator) 67 | 68 | return logger 69 | -------------------------------------------------------------------------------- /t5/t5/utils/lion.py: -------------------------------------------------------------------------------- 1 | 
# Copied from https://github.com/lucidrains/lion-pytorch/blob/main/lion_pytorch/lion_pytorch.py 2 | 3 | from typing import Callable, Optional, Tuple 4 | 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | # functions 9 | 10 | 11 | def _rms(tensor): 12 | return tensor.norm(2) / (tensor.numel() ** 0.5) 13 | 14 | 15 | def exists(val): 16 | return val is not None 17 | 18 | 19 | # update functions 20 | 21 | 22 | def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2): 23 | # stepweight decay 24 | 25 | # Adafactor RMS 26 | lr = lr * max(1e-3, _rms(p.data)) 27 | 28 | p.data.mul_(1 - lr * wd) 29 | 30 | # weight update 31 | 32 | update = exp_avg.clone().mul_(beta1).add(grad, alpha=1 - beta1).sign_() 33 | p.add_(update, alpha=-lr) 34 | 35 | # decay the momentum running average coefficient 36 | 37 | exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2) 38 | 39 | 40 | # class 41 | 42 | 43 | class Lion(Optimizer): 44 | def __init__( 45 | self, 46 | params, 47 | lr: float = 1e-4, 48 | betas: Tuple[float, float] = (0.9, 0.99), 49 | weight_decay: float = 0.0, 50 | ): 51 | assert lr > 0.0 52 | assert all([0.0 <= beta <= 1.0 for beta in betas]) 53 | 54 | defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) 55 | 56 | super().__init__(params, defaults) 57 | 58 | self.update_fn = update_fn 59 | 60 | @torch.no_grad() 61 | def step(self, closure: Optional[Callable] = None): 62 | 63 | loss = None 64 | if exists(closure): 65 | with torch.enable_grad(): 66 | loss = closure() 67 | 68 | for group in self.param_groups: 69 | for p in filter(lambda p: exists(p.grad), group["params"]): 70 | 71 | grad, lr, wd, beta1, beta2, state = ( 72 | p.grad, 73 | group["lr"], 74 | group["weight_decay"], 75 | *group["betas"], 76 | self.state[p], 77 | ) 78 | 79 | # init state - exponential moving average of gradient values 80 | 81 | if len(state) == 0: 82 | state["exp_avg"] = torch.zeros_like(p) 83 | 84 | exp_avg = state["exp_avg"] 85 | 86 | self.update_fn(p, grad, exp_avg, lr, wd, beta1, beta2) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /t5/t5/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import defaultdict 4 | 5 | import datasets 6 | import neptune 7 | import transformers 8 | import wandb 9 | from accelerate.logging import get_logger 10 | from neptune.utils import stringify_unsupported 11 | from omegaconf import OmegaConf, open_dict 12 | 13 | 14 | class Averager: 15 | def __init__(self, weight: float = 1): 16 | self.weight = weight 17 | self.reset() 18 | 19 | def reset(self): 20 | self.total = defaultdict(float) 21 | self.counter = defaultdict(float) 22 | 23 | def update(self, stats): 24 | for key, value in stats.items(): 25 | self.total[key] = self.total[key] * self.weight + value * self.weight 26 | self.counter[key] = self.counter[key] * self.weight + self.weight 27 | 28 | def average(self): 29 | averaged_stats = { 30 | key: tot / self.counter[key] for key, tot in self.total.items() 31 | } 32 | self.reset() 33 | 34 | return averaged_stats 35 | 36 | 37 | class Logger: 38 | def __init__(self, args, accelerator): 39 | self.logger = get_logger("Main") 40 | 41 | # Make one log on every process with the configuration for debugging. 
42 | logging.basicConfig( 43 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 44 | datefmt="%m/%d/%Y %H:%M:%S", 45 | level=logging.INFO, 46 | ) 47 | self.logger.info(accelerator.state, main_process_only=False) 48 | self.logger.info(f"Working directory is {os.getcwd()}") 49 | 50 | if accelerator.is_local_main_process: 51 | datasets.utils.logging.set_verbosity_warning() 52 | transformers.utils.logging.set_verbosity_info() 53 | else: 54 | datasets.utils.logging.set_verbosity_error() 55 | transformers.utils.logging.set_verbosity_error() 56 | 57 | self.setup_neptune(args) 58 | self.setup_wandb(args) 59 | 60 | def setup_wandb(self, args): 61 | if args.logging.wandb: 62 | wandb.init( 63 | name=args.logging.wandb_creds.name, 64 | project=args.logging.wandb_creds.project, 65 | entity=args.logging.wandb_creds.entity, 66 | ) 67 | else: 68 | self.wandb_run = None 69 | 70 | self.wandb_run = wandb.run 71 | 72 | with open_dict(args): 73 | if self.wandb_run is not None: 74 | args.wandb_id = self.wandb_run.id 75 | 76 | def setup_neptune(self, args): 77 | if args.logging.neptune: 78 | tags = [str(item) for item in args.logging.neptune_creds.tags.split(",")] 79 | if tags == [] or tags == [""]: 80 | tags = None 81 | 82 | neptune_logger = neptune.init_run( 83 | project=args.logging.neptune_creds.project, 84 | api_token=args.logging.neptune_creds.api_token, 85 | tags=tags, 86 | ) 87 | else: 88 | neptune_logger = None 89 | 90 | self.neptune_logger = neptune_logger 91 | 92 | with open_dict(args): 93 | if neptune_logger is not None: 94 | args.neptune_id = neptune_logger["sys/id"].fetch() 95 | 96 | def log_args(self, args): 97 | if self.wandb_run is not None: 98 | logging_args = OmegaConf.to_container(args, resolve=True) 99 | wandb.config.update(logging_args) 100 | 101 | if self.neptune_logger is not None: 102 | logging_args = OmegaConf.to_container(args, resolve=True) 103 | self.neptune_logger["args"] = stringify_unsupported(logging_args) 104 | 105 | def log_stats(self, stats, step, args, prefix=""): 106 | if self.neptune_logger is not None: 107 | for k, v in stats.items(): 108 | self.neptune_logger[f"{prefix}{k}"].log(v, step=step) 109 | 110 | if self.wandb_run is not None: 111 | for k, v in stats.items(): 112 | wandb.log({f"{prefix}{k}": v}, step=step) 113 | 114 | msg_start = ( 115 | f"[{prefix[:-1]}] Step {step} out of {args.optim.total_steps}" + " | " 116 | ) 117 | dict_msg = ( 118 | " | ".join([f"{k.capitalize()} --> {v:.3f}" for k, v in stats.items()]) 119 | + " | " 120 | ) 121 | 122 | msg = msg_start + dict_msg 123 | 124 | self.log_message(msg) 125 | 126 | def log_message(self, msg): 127 | self.logger.info(msg) 128 | 129 | def finish(self): 130 | if self.neptune_logger is not None: 131 | self.neptune_logger.stop() 132 | 133 | if self.wandb_run is not None: 134 | wandb.finish() 135 | -------------------------------------------------------------------------------- /t5/t5/utils/optim.py: -------------------------------------------------------------------------------- 1 | def get_optimizer(model, args): 2 | if args.optim.name == "adamwscale": 3 | from .copied import AdamWScale 4 | 5 | optimizer = AdamWScale( 6 | model.parameters(), 7 | lr=args.optim.base_lr, 8 | ) 9 | elif args.optim.name == "lion": 10 | from .lion import Lion 11 | 12 | optimizer = Lion( 13 | model.parameters(), 14 | weight_decay=args.optim.weight_decay, 15 | lr=args.optim.base_lr, 16 | ) 17 | elif args.optim.name == "sophia": 18 | from .sophia import SophiaG 19 | 20 | optimizer = SophiaG( 21 | model.parameters(), 22 | 
rho=args.optim.rho, 23 | weight_decay=args.optim.weight_decay, 24 | lr=args.optim.base_lr, 25 | ) 26 | else: 27 | raise NotImplementedError 28 | 29 | return optimizer 30 | 31 | 32 | def get_lr_scheduler(optimizer, args, logger): 33 | if args.optim.lr_scheduler == "cosine": 34 | from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR 35 | 36 | scheduler1 = LinearLR( 37 | optimizer, 38 | start_factor=0.5, 39 | end_factor=1, 40 | total_iters=args.optim.warmup_steps, 41 | last_epoch=-1, 42 | ) 43 | 44 | scheduler2 = CosineAnnealingLR( 45 | optimizer, 46 | T_max=args.optim.total_steps - args.optim.warmup_steps, 47 | eta_min=args.optim.final_cosine, 48 | ) 49 | 50 | lr_scheduler = SequentialLR( 51 | optimizer, 52 | schedulers=[scheduler1, scheduler2], 53 | milestones=[args.optim.warmup_steps], 54 | ) 55 | elif args.optim.lr_scheduler == "legacy": 56 | import math 57 | 58 | from torch.optim.lr_scheduler import LambdaLR, LinearLR, SequentialLR 59 | 60 | msg = "You are using T5 legacy LR Schedule, it's independent from the optim.base_lr" 61 | logger.log_message(msg) 62 | 63 | num_steps_optimizer1 = math.ceil(args.optim.total_steps * 0.9) 64 | iters_left_for_optimizer2 = args.optim.total_steps - num_steps_optimizer1 65 | 66 | scheduler1 = LambdaLR( 67 | optimizer, 68 | lambda step: min(1e-2, 1.0 / math.sqrt(step)) / args.optim.base_lr 69 | if step 70 | else 1e-2 / args.optim.base_lr, 71 | ) 72 | 73 | scheduler2 = LinearLR( 74 | optimizer, 75 | start_factor=( 76 | min(1e-2, 1.0 / math.sqrt(num_steps_optimizer1)) / args.optim.base_lr 77 | ), 78 | end_factor=0, 79 | total_iters=iters_left_for_optimizer2, 80 | last_epoch=-1, 81 | ) 82 | 83 | lr_scheduler = SequentialLR( 84 | optimizer, 85 | schedulers=[scheduler1, scheduler2], 86 | milestones=[num_steps_optimizer1], 87 | ) 88 | elif args.optim.lr_scheduler == "constant": 89 | from transformers import get_scheduler 90 | 91 | lr_scheduler = get_scheduler( 92 | name=args.optim.lr_scheduler, 93 | optimizer=optimizer, 94 | ) 95 | elif args.optim.lr_scheduler == "cosine-budget": 96 | import math 97 | 98 | from torch.optim.lr_scheduler import LambdaLR 99 | 100 | num_warmup_steps = args.optim.warmup_steps 101 | num_training_steps = args.optim.total_steps 102 | num_cycles = 0.5 103 | 104 | def lr_lambda(current_step): 105 | fake_step = current_step 106 | 107 | if fake_step < num_warmup_steps: 108 | return ( 109 | (float(fake_step) / float(max(1, num_warmup_steps))) * 0.5 110 | ) + 0.5 111 | 112 | progress = float(fake_step - num_warmup_steps) / float( 113 | max(1, num_training_steps - num_warmup_steps) 114 | ) 115 | return max( 116 | 1e-5, 117 | 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)), 118 | ) 119 | 120 | return LambdaLR(optimizer, lr_lambda, -1) 121 | else: 122 | raise NotImplementedError 123 | 124 | return lr_scheduler 125 | --------------------------------------------------------------------------------
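As a closing reference, here is a standalone sketch (not repository code) of the multiplier that the `cosine-budget` branch of `get_lr_scheduler` above hands to `LambdaLR`, evaluated with the defaults `warmup_steps: 10000` and `total_steps: 65536` from `t5/t5/configs/default.yaml`; the actual learning rate at a step is this multiplier times `optim.base_lr`.

```python
# Hedged sketch: the "cosine-budget" LR multiplier, mirroring the lr_lambda in get_lr_scheduler.
import math

num_warmup_steps, num_training_steps, num_cycles = 10_000, 65_536, 0.5


def lr_multiplier(step: int) -> float:
    if step < num_warmup_steps:
        # Warmup ramps the multiplier linearly from 0.5 up to 1.0.
        return (float(step) / float(max(1, num_warmup_steps))) * 0.5 + 0.5
    progress = float(step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    # Cosine decay from 1.0 down to a floor of 1e-5 at the end of the budget.
    return max(1e-5, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))


for step in (0, 5_000, 10_000, 40_000, 65_536):
    print(f"step {step:>6}: multiplier {lr_multiplier(step):.4f}")
```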