├── .gitignore ├── README.md ├── bert ├── .python-version ├── LICENSE.txt ├── README.md ├── conda_env.yaml ├── cramming │ ├── __init__.py │ ├── architectures │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── components.py │ │ ├── construction.py │ │ ├── embeddings.py │ │ ├── fixed_cramlm.py │ │ ├── funnel_transformers.py │ │ ├── fused_layers.py │ │ ├── gpt2.py │ │ ├── huggingface_interface.py │ │ ├── losses.py │ │ ├── recurrent_transformers.py │ │ ├── sanity_check.py │ │ ├── scriptable_bert.py │ │ └── t5.py │ ├── backend │ │ ├── __init__.py │ │ ├── deepspeed_integration.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ ├── adahessian.py │ │ │ ├── adamw_scale.py │ │ │ ├── lion_pytorch.py │ │ │ ├── optimizer_modifiers.py │ │ │ ├── progressive_batching.py │ │ │ ├── schedulers.py │ │ │ ├── shampoo │ │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ │ ├── CONTRIBUTING.md │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── matrix_functions.py │ │ │ │ ├── shampoo.py │ │ │ │ └── shampoo_utils.py │ │ │ └── sophiag.py │ │ ├── prepare_backend.py │ │ ├── torch_default.py │ │ └── utils.py │ ├── config │ │ ├── __init__.py │ │ ├── arch │ │ │ ├── __init__.py │ │ │ ├── bert-base.yaml │ │ │ ├── bert-c2.yaml │ │ │ ├── bert-c3.yaml │ │ │ ├── bert-c4.yaml │ │ │ ├── bert-c5.yaml │ │ │ ├── bert-i4.yaml │ │ │ ├── bert-large-izsak.yaml │ │ │ ├── bert-original.yaml │ │ │ ├── bert-tiny.yaml │ │ │ ├── funnel-c2.yaml │ │ │ ├── hf-bert-base.yaml │ │ │ ├── hf-bert-tiny.yaml │ │ │ ├── recurrent-c2.yaml │ │ │ └── sanitycheck.yaml │ │ ├── cfg_eval.yaml │ │ ├── cfg_eval_pt.yaml │ │ ├── cfg_pretrain.yaml │ │ ├── cfg_save_losses.yaml │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── bert-default.yaml │ │ │ ├── bookcorpus-wikipedia.yaml │ │ │ ├── c4-subset-processed.yaml │ │ │ ├── c4-subset-random.yaml │ │ │ ├── c4-subset.yaml │ │ │ ├── minipile.yaml │ │ │ ├── sanity-check-1.yaml │ │ │ ├── sanity-check-2.yaml │ │ │ ├── sources │ │ │ │ ├── ag_news.yaml │ │ │ │ ├── bookcorpus.yaml │ │ │ │ ├── c4.yaml │ │ │ │ ├── c4_non_streaming.yaml │ │ │ │ ├── fake.yaml │ │ │ │ ├── minipile.yaml │ │ │ │ ├── the_pile.yaml │ │ │ │ ├── the_pileCC.yaml │ │ │ │ ├── the_pile_natural.yaml │ │ │ │ └── wikipedia.yaml │ │ │ ├── the-pile-natural.yaml │ │ │ └── the-pile.yaml │ │ ├── eval │ │ │ ├── GLUE.yaml │ │ │ ├── GLUE_sane.yaml │ │ │ ├── GLUEmosbach.yaml │ │ │ ├── SuperGLUE.yaml │ │ │ ├── __init__.py │ │ │ ├── boolq.yaml │ │ │ ├── mnli.yaml │ │ │ ├── optim │ │ │ │ └── adam.yaml │ │ │ ├── save_losses_rho_loss.yaml │ │ │ └── tasks │ │ │ │ ├── boolq.yaml │ │ │ │ ├── cb.yaml │ │ │ │ ├── cola.yaml │ │ │ │ ├── copa.yaml │ │ │ │ ├── mnli.yaml │ │ │ │ ├── mrpc.yaml │ │ │ │ ├── multirc.yaml │ │ │ │ ├── qnli.yaml │ │ │ │ ├── qqp.yaml │ │ │ │ ├── record.yaml │ │ │ │ ├── rte.yaml │ │ │ │ ├── rte_superglue.yaml │ │ │ │ ├── sst2.yaml │ │ │ │ ├── stsb.yaml │ │ │ │ ├── wic.yaml │ │ │ │ ├── wnli.yaml │ │ │ │ └── wsc.yaml │ │ ├── hydra │ │ │ ├── __init__.py │ │ │ └── job_logging │ │ │ │ └── custom.yaml │ │ ├── impl │ │ │ ├── __init__.py │ │ │ ├── _default.yaml │ │ │ ├── data_structure │ │ │ │ ├── LMDB.yaml │ │ │ │ ├── RAM.yaml │ │ │ │ ├── from-disk.yaml │ │ │ │ └── none.yaml │ │ │ ├── deepspeed-hf.yaml │ │ │ ├── deepspeed.yaml │ │ │ ├── onnx.yaml │ │ │ ├── save_losses_rho_loss.yaml │ │ │ └── torch-default.yaml │ │ ├── piotr │ │ │ ├── default.yaml │ │ │ └── task │ │ │ │ ├── ft.yaml │ │ │ │ └── pt.yaml │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── bert-base.yaml │ │ │ ├── bert-izsak.yaml │ │ │ ├── bert-o1.yaml │ │ │ ├── bert-o2.yaml │ │ │ ├── bert-o3.yaml │ │ │ ├── 
bert-original.yaml │ │ │ ├── optim │ │ │ │ ├── adafactor.yaml │ │ │ │ ├── adahessian.yaml │ │ │ │ ├── adam.yaml │ │ │ │ ├── adam_classic.yaml │ │ │ │ ├── lion.yaml │ │ │ │ ├── radam.yaml │ │ │ │ ├── sgd.yaml │ │ │ │ ├── shampoo.yaml │ │ │ │ └── sophiag.yaml │ │ │ └── optim_mod │ │ │ │ ├── disabled.yaml │ │ │ │ ├── larc.yaml │ │ │ │ ├── lars.yaml │ │ │ │ ├── progressive.yaml │ │ │ │ └── sam.yaml │ │ └── wandb │ │ │ ├── default.yaml │ │ │ └── none.yaml │ ├── data │ │ ├── __init__.py │ │ ├── cached_datasets.py │ │ ├── curriculum_sorting.py │ │ ├── deduplicate.py │ │ ├── downstream_task_preparation.py │ │ ├── generation_gibbs.py │ │ ├── lmdb_datasets.py │ │ ├── pretraining_preparation.py │ │ ├── tokenizer_preparation.py │ │ └── utils.py │ └── utils.py ├── efficient_training │ ├── __init__.py │ ├── extract_il_losses.py │ ├── layer_drop.py │ ├── stacking.py │ └── test_layer_drop.py ├── eval.py ├── poetry.lock ├── pretrain_bert.py ├── pretrain_bert_rho_loss.py ├── pretrain_bert_sb.py ├── pretrain_bert_sophia.py ├── pyproject.toml ├── rst │ ├── __init__.py │ ├── get_RSTs_from_wandb.py │ └── saved_rsts.py └── validate_bert.py └── t5 ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── lscpu.txt └── nvidia_smi.txt ├── requirements.txt └── t5 ├── __init__.py ├── configs ├── default.yaml ├── local_env │ └── default.yaml └── task │ ├── debug.yaml │ ├── ft.yaml │ └── pt.yaml ├── models ├── __init__.py ├── progressive.py └── t5.py ├── train.py └── utils ├── __init__.py ├── copied.py ├── data.py ├── general.py ├── lion.py ├── logging.py ├── optim.py ├── sni_dataset.py ├── sophia.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # I think for ML projects including the precise python version is useful for 86 | # reproducibility, but maybe I am wrong - Oscar 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # JetBrains 134 | .idea/ 135 | .vscode 136 | 137 | # Experiment outputs 138 | 139 | saved_models 140 | outputs 141 | 142 | tmp.py 143 | *.DS_Store 144 | *.ipynb_checkpoints 145 | 146 | # Ignore generated figures 147 | plots/*.png 148 | plots/*.pdf 149 | 150 | deprecated/jeans_scripts/jean.sh 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # No Train No Gain 2 | 3 | Code for the paper 4 | "[No Train No Gain: Revisiting Efficient Training Algorithms For Transformer-based Language Models](https://arxiv.org/abs/2307.06440)"; 5 | Jean Kaddour, Oscar Key, Piotr Nawrot, Pasquale Minervini, Matt J. Kusner. 6 | 7 | 8 | ## Running the code 9 | See the README for the: 10 | - [BERT experiments](bert/README.md) 11 | - [T5 experiments](t5/README.md) 12 | 13 | ## Citation and license 14 | We use two excellent open source codebases to implement our experiments: 15 | - The BERT experiments are forked from [Cramming](https://github.com/JonasGeiping/cramming) 16 | - The T5 experiments are forked from [NanoT5](https://github.com/PiotrNawrot/nanoT5) 17 | 18 | If you find this repository useful, please consider citing both our work and these original codebases. 19 | 20 | To cite our work, we suggest the following BibTeX: 21 | ``` 22 | @misc{kaddourNoTrainNo2023, 23 | title = {No {Train} {No} {Gain}: {Revisiting} {Efficient} {Training} {Algorithms} {For} {Transformer}-based {Language} {Models}}, 24 | url = {http://arxiv.org/abs/2307.06440}, 25 | doi = {10.48550/arXiv.2307.06440}, 26 | urldate = {2023-07-17}, 27 | publisher = {arXiv}, 28 | author = {Kaddour, Jean and Key, Oscar and Nawrot, Piotr and Minervini, Pasquale and Kusner, Matt J.}, 29 | month = jul, 30 | year = {2023}, 31 | note = {arXiv:2307.06440 [cs]}, 32 | } 33 | ``` 34 | 35 | We provide separate licenses for the [BERT experiments](bert/LICENSE.txt) and the [T5 experiments](t5/LICENSE). 36 | 37 | ## Contact 38 | Feel free to open an issue, or email us, with any questions. 39 | -------------------------------------------------------------------------------- /bert/.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /bert/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The code in this folder is based off the Cramming repository (https://github.com/JonasGeiping/cramming), which is Copyright 2022 Jonas Geiping and released under the MIT license (included below).
2 | 3 | This modified version of the Cramming code is Copyright 2023 Jean Kaddour and Oscar Key and also released under the MIT license. 4 | 5 | ----------- 6 | 7 | MIT License 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 14 | -------------------------------------------------------------------------------- /bert/README.md: -------------------------------------------------------------------------------- 1 | # BERT experiments 2 | The BERT experiments are based off the excellent [Cramming](https://github.com/JonasGeiping/cramming) repository, see [LICENSE.txt](LICENSE.txt). 3 | 4 | ## Environment setup 5 | The project has the following dependencies: 6 | - CUDA toolkit + nvcc 11.7 (required to install [FlashAttention](https://github.com/Dao-AILab/flash-attention)) 7 | - Python 3.10 8 | - [Poetry](https://python-poetry.org/) 9 | 10 | One way to install the dependencies is using Conda and the provided environment file: 11 | - `conda env update -f conda_env.yaml` 12 | - `conda activate ntng_bert` 13 | - `export CUDA_HOME=$CONDA_PREFIX` (this is required so PyTorch finds the correct nvcc version when building FlashAttention) 14 | 15 | Create and activate the Poetry environment: 16 | - Install: `poetry install` 17 | - Activate: `poetry shell` 18 | - Manually install FlashAttention: `pip install --no-build-isolation flash-attn==1.0.9` 19 | 20 | ## Modules 21 | ### Entry points 22 | * `pretrain_bert.py` 23 | * implements the baseline, layer stacking, layer dropping, and Lion 24 | * `pretrain_bert_sb.py` 25 | * modified copy of `pretrain_bert.py` that includes selective backpropagation 26 | * `pretrain_bert_rho_loss.py` 27 | * modified copy of `pretrain_bert.py` that includes RHO-Loss 28 | * requires the irreducible losses to be extracted first (see command below) 29 | * `pretrain_bert_sophia.py` 30 | * modified copy of `pretrain_bert.py` that includes Sophia-G 31 | * `eval.py` 32 | * implements fine-tuning and evaluating a pretrained model 33 | * `validate_bert.py` 34 | * implements validating a pretrained checkpoint on the validation set 35 | 36 | ### Other 37 | * `efficient_training` contains additional code for some of the efficient training methods 38 | * we recommend starting at the entry point scripts to understand how to use the code 39 | * `rst` includes helper code for tracking the Reference System Time (RST) metric 40 | 41 | ## Experiment commands 42 | ### Pre-train 43 | First, download the randomized subset of the C4 dataset from [our archive](https://doi.org/10.5281/zenodo.8279728): 44 | * `wget 
https://zenodo.org/record/8279728/files/c4-subset-random.tar.bz2` 45 | * `mkdir -p outputs/data` 46 | * `tar xvf c4-subset-random.tar.bz2 -C outputs/data/` 47 | 48 | If you would like to use [Weights & Biases](https://wandb.ai/site), configure this in `cramming/config/wandb/default.yaml`. 49 | 50 | #### Dynamic architectures 51 | * Baseline (FP16): 52 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed}` 53 | 54 | * Layer stacking: 55 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} train.stacking.enabled=True` 56 | 57 | * Layer dropping: 58 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} arch.layer_drop.enabled=True` 59 | 60 | #### Batch selection 61 | By default the dataset is the randomized subset of C4, but you can also set `data=minipile` or `data=bookcorpus-wikipedia`. 62 | Minipile and BCWK will be downloaded automatically from Hugging Face at the start of training. 63 | 64 | * Selective backprop: 65 | `python pretrain_bert_sb.py name={name} budget={budget in hours} seed={seed} train.validation_set.fraction=0.2 impl.validate_every_hours=3` 66 | * To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.track_forward_pass_only=false`. 67 | 68 | ##### RHO-loss 69 | To acquire the irreducible losses you can either: 70 | * Download ours: 71 | * `wget https://zenodo.org/record/8279728/files/il_losses_[dataset].tar` where `dataset` is `c4`, `bcwk`, or `mp` 72 | * `mkdir -p outputs/il_losses` 73 | * `tar xvf il_losses_[dataset].tar -C outputs/il_losses` 74 | * Train your own irreducible loss model and extract the losses: 75 | * `python pretrain_bert.py name=il_model budget={budget in hours} train.validation_set.il_model=True train.validation_set.fraction=0.2` 76 | * `python efficient_training/extract_il_losses.py name=il_model` 77 | 78 | Pre-train: `python pretrain_bert_rho_loss.py name={name} budget={budget in hours} seed={seed} data={dataset} train.validation_set.fraction=0.2 impl.validate_every_hours=3 train.rho_loss.il_losses_path={path to irreducible losses for dataset} train.rho_loss.mega_batch_size=3072` 79 | 80 | To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.track_forward_pass_only=false`. 81 | 82 | #### Efficient optimizers 83 | We found Sophia was unstable when using FP16, so for this set of experiments we use BF16. Both optimizers are implemented in `cramming/backend/optimizers/` (`lion_pytorch.py` and `sophiag.py`); a standalone sketch of the Lion update follows the commands below. 84 | 85 | * Baseline (BF16): 86 | `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16` 87 | 88 | * Lion: `python pretrain_bert.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16 train/optim=lion train.optim.lr={learning rate} train.optim.weight_decay={weight decay}` 89 | 90 | * Sophia: `python pretrain_bert_sophia.py name={name} budget={budget in hours} seed={seed} impl.mixed_precision_target_dtype=bfloat16 train/optim=sophiag train.optim.rho={Sophia rho} train.optim.lr={learning rate} train.optim.weight_decay={weight decay}` 91 | * To reproduce the ablation where the additional forward passes are not counted against the training budget, add `train.sophia.free_updates=True`.
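As a rough standalone illustration of the Lion update implemented in `cramming/backend/optimizers/lion_pytorch.py` (the toy model, batch, and hyperparameter values below are made up, and the snippet assumes it is run from the `bert/` directory so that `cramming` is importable):

```python
import torch

from cramming.backend.optimizers.lion_pytorch import Lion

# Toy model and batch, purely for illustration.
model = torch.nn.Linear(16, 2)
optimizer = Lion(model.parameters(), lr=1e-4, betas=(0.9, 0.99), weight_decay=0.1)

inputs = torch.randn(8, 16)
targets = torch.randint(0, 2, (8,))

loss = torch.nn.functional.cross_entropy(model(inputs), targets)
loss.backward()
optimizer.step()  # sign of the interpolated momentum, plus decoupled weight decay
optimizer.zero_grad()
```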
92 | 93 | 94 | 95 | ### Fine tune & evaluate 96 | Fine tune and evaluate a checkpoint using GLUE: 97 | 98 | `python eval.py name={pretrain name} eval=GLUE_sane impl.microbatch_size=16 impl.shuffle_in_dataloader=true seed=0 [impl.mixed_precision_target_dtype=bfloat16 if the checkpoint was trained using BF16 rather than FP16]` 99 | 100 | Fine tune and evaluate a checkpoint using SuperGLUE: 101 | 102 | `python eval.py name={pretrain name} eval=SuperGLUE impl.microbatch_size=16 seed=0 [impl.mixed_precision_target_dtype=bfloat16 if the checkpoint was trained using BF16 rather than FP16]` 103 | -------------------------------------------------------------------------------- /bert/conda_env.yaml: -------------------------------------------------------------------------------- 1 | name: ntng_bert 2 | dependencies: 3 | - python=3.10 4 | - cuda=11.7 5 | - cuda-nvcc=11.7 6 | - cuda-nvvp=11.7 7 | - gxx=11.4.0 8 | - pip=23.2 9 | - pip: 10 | - poetry==1.5.1 11 | channels: 12 | - nvidia 13 | - pytorch 14 | - conda-forge 15 | -------------------------------------------------------------------------------- /bert/cramming/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize cramming""" 2 | 3 | from cramming.architectures import construct_model 4 | from cramming.backend import load_backend 5 | from cramming.data import load_pretraining_corpus, prepare_task_dataloaders 6 | 7 | __all__ = [ 8 | "construct_model", 9 | "load_backend", 10 | "load_pretraining_corpus", 11 | "prepare_task_dataloaders", 12 | ] 13 | 14 | 15 | import hydra 16 | 17 | """Construct interfaces to some cfg folders for use in packaged installations:""" 18 | 19 | 20 | def get_config(overrides=[]): 21 | """Return default hydra config.""" 22 | with hydra.initialize(config_path="config"): 23 | cfg = hydra.compose(config_name="cfg", overrides=overrides) 24 | print(f"Loading default config {cfg.name}.") 25 | return cfg 26 | 27 | 28 | def get_model_config(arch="hf-bert-tiny", overrides=[]): 29 | """Return default hydra config for a given architecture.""" 30 | with hydra.initialize(config_path="config/arch"): 31 | cfg = hydra.compose(config_name=arch, overrides=overrides) 32 | print(f"Loading model configuration {cfg.architecture}.") 33 | return cfg 34 | 35 | 36 | def get_backend_config(backend="torch-default", overrides=[]): 37 | """Return default hydra config for a given backend.""" 38 | with hydra.initialize(config_path="config/impl"): 39 | cfg = hydra.compose(config_name=backend, overrides=overrides) 40 | print(f"Loading backend {cfg.name}.") 41 | return cfg 42 | -------------------------------------------------------------------------------- /bert/cramming/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | """This module handles all questions of model architecture.""" 2 | 3 | from .construction import construct_model 4 | 5 | __all__ = ["construct_model"] 6 | -------------------------------------------------------------------------------- /bert/cramming/architectures/construction.py: -------------------------------------------------------------------------------- 1 | """Interface to construct models.""" 2 | 3 | import logging 4 | 5 | from cramming.utils import is_main_process 6 | 7 | from .fixed_cramlm import construct_fixed_cramlm 8 | from .funnel_transformers import construct_scriptable_funnel 9 | from .huggingface_interface import construct_huggingface_model 10 | from .recurrent_transformers import construct_scriptable_recurrent 11 | from
.sanity_check import SanityCheckforPreTraining 12 | from .scriptable_bert import construct_scriptable_bert 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def construct_model(cfg_arch, vocab_size, downstream_classes=None): 18 | model = None 19 | if cfg_arch.architectures is not None: 20 | # attempt to solve locally 21 | if "ScriptableMaskedLM" in cfg_arch.architectures: 22 | model = construct_scriptable_bert(cfg_arch, vocab_size, downstream_classes) 23 | elif "ScriptableFunnelLM" in cfg_arch.architectures: 24 | model = construct_scriptable_funnel(cfg_arch, vocab_size, downstream_classes) 25 | elif "ScriptableRecurrentLM" in cfg_arch.architectures: 26 | model = construct_scriptable_recurrent(cfg_arch, vocab_size, downstream_classes) 27 | elif "SanityCheckLM" in cfg_arch.architectures: 28 | model = SanityCheckforPreTraining(cfg_arch.width, vocab_size) 29 | elif "FusedCraMLM" in cfg_arch.architectures: 30 | model = construct_fixed_cramlm(cfg_arch, vocab_size, downstream_classes) 31 | 32 | if model is not None: # Return local model arch 33 | num_params = sum([p.numel() for p in model.parameters()]) 34 | if is_main_process(): 35 | log.info(f"Model with architecture {cfg_arch.architectures[0]} loaded with {num_params:,} parameters.") 36 | return model 37 | 38 | try: # else try on HF 39 | model = construct_huggingface_model(cfg_arch, vocab_size, downstream_classes) 40 | num_params = sum([p.numel() for p in model.parameters()]) 41 | if is_main_process(): 42 | log.info(f"Model with config {cfg_arch} loaded with {num_params:,} parameters.") 43 | return model 44 | except Exception as e: 45 | raise ValueError(f"Invalid model architecture {cfg_arch.architectures} given. Error: {e}") 46 | -------------------------------------------------------------------------------- /bert/cramming/architectures/fused_layers.py: -------------------------------------------------------------------------------- 1 | """Pre-Norm / Post-norm / sandwich fused layers with dropout.""" 2 | 3 | from functools import partial 4 | 5 | import torch 6 | from torch.nn.functional import dropout 7 | 8 | 9 | def get_layer_fn(type="pre", prob=0.1, scripting=True, dn=False, drop=False): 10 | if not dn and not drop: 11 | base_train, base_eval = simplified_layer_training, simplified_layer_eval 12 | else: 13 | base_train, base_eval = scaled_layer_training, scaled_layer_eval 14 | if type in ["pre", "post"]: 15 | if scripting: 16 | fn_train, fn_eval = torch.jit.script(base_train), torch.jit.script(base_eval) 17 | else: 18 | fn_train, fn_eval = base_train, base_eval 19 | return partial(fn_train, prob=prob), partial(fn_eval, prob=prob) 20 | elif type == "sandwich": 21 | return torch.jit.script(sandwich_layer_structure) if scripting else sandwich_layer_structure 22 | else: 23 | raise ValueError("Invalid layer type.") 24 | 25 | 26 | def layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 27 | return states * alpha + residual_scale * dropout(outputs, p=prob, training=training) 28 | 29 | 30 | def scaled_layer_training(states, outputs, alpha, residual_scale, prob: float = 0.1): 31 | return layer_structure(states, outputs, alpha, residual_scale, prob, training=True) 32 | 33 | 34 | def scaled_layer_eval(states, outputs, alpha, residual_scale, prob: float = 0.1): 35 | return layer_structure(states, outputs, alpha, residual_scale, prob, training=False) 36 | 37 | 38 | def sandwich_layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 39 | states = states 
* alpha + residual_scale * outputs 40 | return states 41 | 42 | 43 | def simplified_layer_structure(states, outputs, alpha, residual_scale, prob: float = 0.1, training: bool = False): 44 | return states + dropout(outputs, p=prob, training=training) 45 | 46 | 47 | def simplified_layer_training(states, outputs, alpha, residual_scale, prob: float = 0.1): 48 | return simplified_layer_structure(states, outputs, alpha, residual_scale, prob, training=True) 49 | 50 | 51 | def simplified_layer_eval(states, outputs, alpha, residual_scale, prob: float = 0.1): 52 | return simplified_layer_structure(states, outputs, alpha, residual_scale, prob, training=False) 53 | -------------------------------------------------------------------------------- /bert/cramming/architectures/huggingface_interface.py: -------------------------------------------------------------------------------- 1 | """BERT variations based on the huggingface implementation.""" 2 | 3 | import transformers 4 | from omegaconf import OmegaConf 5 | 6 | 7 | def construct_huggingface_model(cfg_arch, vocab_size, downstream_classes=None): 8 | """construct model from given configuration. Only works if this arch exists on the hub.""" 9 | if downstream_classes is None: 10 | if isinstance(cfg_arch, transformers.PretrainedConfig): 11 | configuration = cfg_arch 12 | else: 13 | configuration = transformers.BertConfig(**cfg_arch) 14 | configuration.vocab_size = vocab_size 15 | model = transformers.AutoModelForMaskedLM.from_config(configuration) 16 | model.vocab_size = model.config.vocab_size 17 | else: 18 | if isinstance(cfg_arch, transformers.PretrainedConfig): 19 | configuration = cfg_arch 20 | configuration.num_labels = downstream_classes 21 | else: 22 | configuration = OmegaConf.to_container(cfg_arch) 23 | configuration = transformers.BertConfig(**configuration, num_labels=downstream_classes) 24 | configuration.vocab_size = vocab_size 25 | model = transformers.AutoModelForSequenceClassification.from_config(configuration) 26 | model.vocab_size = vocab_size 27 | return model 28 | -------------------------------------------------------------------------------- /bert/cramming/architectures/sanity_check.py: -------------------------------------------------------------------------------- 1 | """Sanity Check architecture.""" 2 | from typing import Optional 3 | 4 | import torch 5 | 6 | 7 | class SanityCheckforPreTraining(torch.nn.Module): 8 | """Make big go fast.""" 9 | 10 | def __init__(self, width, vocab_size): 11 | super().__init__() 12 | self.word_embedding = torch.nn.Embedding(vocab_size, width, padding_idx=0) 13 | self.transform = torch.nn.Linear(width, width, bias=False) 14 | 15 | def forward(self, input_ids, attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None): 16 | embeds = self.word_embedding(input_ids) 17 | outputs = self.transform(embeds) 18 | loss = outputs.mean() 19 | return dict(outputs=outputs, loss=loss) 20 | -------------------------------------------------------------------------------- /bert/cramming/backend/__init__.py: -------------------------------------------------------------------------------- 1 | """This module implements interfaces to the various backends.""" 2 | 3 | from .prepare_backend import load_backend 4 | from .utils import prepare_pretraining_dataloader 5 | 6 | __all__ = ["load_backend"] 7 | -------------------------------------------------------------------------------- /bert/cramming/backend/deepspeed_integration.py: 
-------------------------------------------------------------------------------- 1 | """(Hopefully) seamless integration of deepspeed.""" 2 | import json 3 | import logging 4 | import os 5 | from functools import partial 6 | 7 | import torch 8 | from omegaconf import OmegaConf 9 | 10 | from .optimizers import get_schedule_fn 11 | from .utils import ( 12 | group_parameters, 13 | prepare_pretraining_dataloader, 14 | torchdynamo_compile_method, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | _default_setup = dict(device=torch.device("cpu"), dtype=torch.float) 19 | 20 | 21 | """Todo: 22 | * integrate batch size ramping via 23 | https://deepspeed.readthedocs.io/en/latest/pipeline.html#deepspeed.runtime.pipe.engine.PipelineEngine.set_train_batch_size 24 | """ 25 | 26 | 27 | def initialize_deepspeed(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=_default_setup): 28 | """Initialize deepspeed. Module is imported lazily here.""" 29 | import deepspeed 30 | 31 | if cfg_impl.jit == "trace": 32 | # This variant is very experimental... 33 | input_setup = dict(dtype=torch.long, device=setup["device"]) 34 | templates = torch.randint(0, model.vocab_size, (*cfg_impl.trace_shape,), **input_setup) 35 | labels = torch.randint(0, model.vocab_size, (*cfg_impl.trace_shape,), **input_setup) 36 | 37 | model.to(**setup) 38 | model.kwargs_forward = model.forward 39 | model.forward = lambda input_ids, labels: model.kwargs_forward(input_ids=input_ids, labels=labels) 40 | model = torch.jit.trace(model, (templates, labels), strict=False) 41 | elif cfg_impl.jit == "script": 42 | # This does not work for huggingface models 43 | model = torch.jit.script(model) 44 | 45 | model_engine, optimizer, dataloader, scheduler = deepspeed.initialize( 46 | config=OmegaConf.to_container(cfg_impl, resolve=True), 47 | model=model, 48 | model_parameters=group_parameters(model, cfg_train), 49 | lr_scheduler=get_schedule_fn(cfg_train), 50 | # training_data=dataset, # handle this natively 51 | # collate_fn=collate_fn, 52 | ) 53 | # Monkey-patch checkpointing 54 | model_engine.save_training_checkpoint = partial(save_training_checkpoint, self=model_engine) 55 | model_engine.save_final_model = partial(save_final_model, model_engine) 56 | # And more methods 57 | model_engine.gradinit = partial(gradinit, self=model_engine) 58 | model_engine.to_device = lambda batch: to_device(self=model_engine, batch=batch, keys=["input_ids", "labels"]) 59 | 60 | model_engine.setup = setup 61 | model_engine.record_batch_size = lambda: cfg_train.batch_size 62 | model_engine.record_tokens_per_step = lambda: tokenizer.model_max_length * cfg_impl.microbatch_size 63 | 64 | def step(self, batch): 65 | loss = self.forward(**batch)["loss"] 66 | self.backward(loss) 67 | self.optimizer_step() 68 | return loss.detach() 69 | 70 | model_engine.step = lambda batch: torchdynamo_compile_method(step, cfg_impl.optimizer_context)(self=model_engine, batch=batch) 71 | 72 | if dataset is not None: 73 | dataloader = prepare_pretraining_dataloader(dataset, tokenizer, cfg_train, cfg_impl) 74 | validation_dataloader = ( 75 | prepare_pretraining_dataloader(validation_set, tokenizer, cfg_train, cfg_impl, is_validation=True) 76 | if validation_set is not None 77 | else None 78 | ) 79 | else: 80 | dataloader = None 81 | validation_dataloader = None 82 | # dataloader = deepspeed.RepeatingLoader(dataloader) 83 | return model_engine, optimizer, scheduler, dataloader, validation_dataloader 84 | 85 | 86 | def save_training_checkpoint(self, identifier, 
directory="checkpoints", state=None): 87 | """Path, identifier and additional client state. This checkpoint can be used to resume training. 88 | The default behavior is to save this checkpoint relative to the training working directory. 89 | """ 90 | self.save_checkpoint(directory, identifier, client_state=state) 91 | 92 | 93 | def save_final_model(self, base_directory, identifier, tokenizer, cfg_arch, dryrun=False): 94 | """This checkpoint can be used for downstream tasks. 95 | The default behavior is to save this checkpoint to a checkpoints folder under base_directory/name/checkpoints""" 96 | try: 97 | identifier_str = f"{identifier:2.4f}" 98 | except ValueError: 99 | identifier_str = str(identifier) 100 | full_path = os.path.join(base_directory, "checkpoints", identifier_str) 101 | os.makedirs(full_path, exist_ok=True) 102 | # This saves tokenizer_config.json, tokenizer.json and special_tokens_map.json to this folder 103 | if not dryrun: 104 | tokenizer.save_pretrained(full_path) 105 | # Save model.pth, model_config.json 106 | self.save_checkpoint(full_path, "model") 107 | with open(os.path.join(full_path, "model_config.json"), "w") as file: 108 | json.dump(OmegaConf.to_container(cfg_arch, resolve=True), file) 109 | 110 | 111 | def gradinit(self, dataloader, config): 112 | raise ValueError("GradInit not implemented for deepspeed.") 113 | 114 | 115 | def to_device(self, batch, keys=["input_ids", "labels"]): 116 | """Move batch of data into device memory.""" 117 | return { 118 | k: v.to(device=self.setup["device"], dtype=torch.long, non_blocking=True) 119 | for k, v in batch.items() 120 | if k in keys # Add more keywords here if needed 121 | } 122 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .adahessian import Adahessian 2 | from .lion_pytorch import Lion 3 | from .optimizer_modifiers import LARS, SAM 4 | from .progressive_batching import ProgressiveBatching 5 | from .schedulers import get_schedule_fn 6 | from .shampoo import Shampoo 7 | from .sophiag import SophiaG 8 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/adamw_scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Iterable, Tuple 3 | 4 | import torch 5 | from torch import nn 6 | from torch.optim import Optimizer 7 | 8 | 9 | class AdamWScale(Optimizer): 10 | """ 11 | This AdamW implementation is copied from Huggingface. 12 | We modified it with Adagrad scaling by rms of a weight tensor 13 | 14 | Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay 15 | Regularization](https://arxiv.org/abs/1711.05101). 16 | 17 | Parameters: 18 | params (`Iterable[nn.parameter.Parameter]`): 19 | Iterable of parameters to optimize or dictionaries defining parameter groups. 20 | lr (`float`, *optional*, defaults to 1e-3): 21 | The learning rate to use. 22 | betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)): 23 | Adam's betas parameters (b1, b2). 24 | eps (`float`, *optional*, defaults to 1e-6): 25 | Adam's epsilon for numerical stability. 26 | weight_decay (`float`, *optional*, defaults to 0): 27 | Decoupled weight decay to apply. 
28 | correct_bias (`bool`, *optional*, defaults to `True`): 29 | Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`). 30 | no_deprecation_warning (`bool`, *optional*, defaults to `False`): 31 | A flag used to disable the deprecation warning (set to `True` to disable the warning). 32 | """ 33 | 34 | def __init__( 35 | self, 36 | params: Iterable[nn.parameter.Parameter], 37 | lr: float = 1e-3, 38 | betas: Tuple[float, float] = (0.9, 0.999), 39 | eps: float = 1e-6, 40 | weight_decay: float = 0.0, 41 | correct_bias: bool = True, 42 | ): 43 | if lr < 0.0: 44 | raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") 45 | if not 0.0 <= betas[0] < 1.0: 46 | raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)") 47 | if not 0.0 <= betas[1] < 1.0: 48 | raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)") 49 | if not 0.0 <= eps: 50 | raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0") 51 | defaults = dict( 52 | lr=lr, 53 | betas=betas, 54 | eps=eps, 55 | weight_decay=weight_decay, 56 | correct_bias=correct_bias, 57 | ) 58 | super().__init__(params, defaults) 59 | 60 | @staticmethod 61 | def _rms(tensor): 62 | return tensor.norm(2) / (tensor.numel() ** 0.5) 63 | 64 | def step(self, closure=None): 65 | """ 66 | Performs a single optimization step. 67 | 68 | Arguments: 69 | closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. 70 | """ 71 | loss = None 72 | if closure is not None: 73 | loss = closure() 74 | 75 | for group in self.param_groups: 76 | for p in group["params"]: 77 | if p.grad is None: 78 | continue 79 | grad = p.grad.data 80 | if grad.is_sparse: 81 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 82 | 83 | state = self.state[p] 84 | beta1, beta2 = group["betas"] 85 | 86 | # State initialization 87 | if len(state) == 0: 88 | state["step"] = 0 89 | # Exponential moving average of gradient values 90 | state["exp_avg"] = torch.zeros_like(p.data) 91 | # Exponential moving average of squared gradient values 92 | state["exp_avg_sq"] = torch.zeros_like(p.data) 93 | 94 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 95 | 96 | state["step"] += 1 97 | 98 | # Decay the first and second moment running average coefficient 99 | # In-place operations to update the averages at the same time 100 | exp_avg.mul_(beta1) 101 | exp_avg.add_(grad, alpha=(1.0 - beta1)) 102 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 103 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 104 | 105 | step_size = group["lr"] 106 | if group["correct_bias"]: # No bias correction for Bert 107 | bias_correction1 = 1.0 - beta1 ** state["step"] 108 | bias_correction2 = 1.0 - beta2 ** state["step"] 109 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 110 | 111 | # /Adapt Step from Adagrad 112 | step_size = step_size * max(1e-3, self._rms(p.data)) 113 | # /Adapt Step from Adagrad 114 | 115 | p.data.addcdiv_(exp_avg, denom, value=-step_size) 116 | 117 | # Just adding the square of the weights to the loss function is *not* 118 | # the correct way of using L2 regularization/weight decay with Adam, 119 | # since that will interact with the m and v parameters in strange ways. 120 | # 121 | # Instead we want to decay the weights in a manner that doesn't interact 122 | # with the m/v parameters. 
This is equivalent to adding the square 123 | # of the weights to the loss with plain (non-momentum) SGD. 124 | # Add weight decay at the end (fixed version) 125 | if group["weight_decay"] > 0.0: 126 | p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"])) 127 | 128 | return loss 129 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/lion_pytorch.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional, Tuple 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | # functions 7 | 8 | 9 | def exists(val): 10 | return val is not None 11 | 12 | 13 | # update functions 14 | 15 | 16 | def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2): 17 | # stepweight decay 18 | 19 | p.data.mul_(1 - lr * wd) 20 | 21 | # weight update 22 | 23 | update = exp_avg.clone().mul_(beta1).add(grad, alpha=1 - beta1).sign_() 24 | p.add_(update, alpha=-lr) 25 | 26 | # decay the momentum running average coefficient 27 | 28 | exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2) 29 | 30 | 31 | # class 32 | 33 | 34 | class Lion(Optimizer): 35 | def __init__( 36 | self, params, lr: float = 1e-4, betas: Tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, use_triton: bool = False 37 | ): 38 | assert lr > 0.0 39 | assert all([0.0 <= beta <= 1.0 for beta in betas]) 40 | 41 | defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) 42 | 43 | super().__init__(params, defaults) 44 | 45 | self.update_fn = update_fn 46 | 47 | if use_triton: 48 | from lion_pytorch.triton import update_fn as triton_update_fn 49 | 50 | self.update_fn = triton_update_fn 51 | 52 | @torch.no_grad() 53 | def step(self, closure: Optional[Callable] = None): 54 | 55 | loss = None 56 | if exists(closure): 57 | with torch.enable_grad(): 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | for p in filter(lambda p: exists(p.grad), group["params"]): 62 | 63 | grad, lr, wd, beta1, beta2, state = p.grad, group["lr"], group["weight_decay"], *group["betas"], self.state[p] 64 | 65 | # init state - exponential moving average of gradient values 66 | 67 | if len(state) == 0: 68 | state["exp_avg"] = torch.zeros_like(p) 69 | 70 | exp_avg = state["exp_avg"] 71 | 72 | self.update_fn(p, grad, exp_avg, lr, wd, beta1, beta2) 73 | 74 | return loss 75 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Optimizers 2 | We want to make contributing to this project as easy and transparent as 3 | possible. Our goal is to provide a repo that promotes optimizer research 4 | and development separate from the official PyTorch library. Please only 5 | create pull requests for improving existing optimizers in the repo; new 6 | optimizers should be created in a separate public repo. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests for existing optimizers. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * Please maintain a consistent style with the rest of the code 36 | 37 | ## License 38 | By contributing to Optimizers, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Optimizers software 4 | 5 | Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Meta nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/README.md: -------------------------------------------------------------------------------- 1 | # Optimizers 2 | 3 | *Copyright (c) Meta Platforms, Inc. and affiliates. 4 | All rights reserved.* 5 | 6 | ## Description 7 | Optimizers is a Github repository of PyTorch optimization algorithms. It is designed for external collaboration and development. 8 | 9 | Currently includes the optimizers: 10 | - Distributed Shampoo 11 | 12 | See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. 13 | 14 | ## License 15 | Optimizers is BSD licensed, as found in the LICENSE file. 16 | -------------------------------------------------------------------------------- /bert/cramming/backend/optimizers/shampoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .shampoo import Shampoo 2 | -------------------------------------------------------------------------------- /bert/cramming/backend/prepare_backend.py: -------------------------------------------------------------------------------- 1 | """Instantiate backend objects in a congruent format.""" 2 | import torch 3 | 4 | from .deepspeed_integration import initialize_deepspeed 5 | from .torch_default import initialize_torch 6 | 7 | _default_setup = dict(device=torch.device("cpu"), dtype=torch.float) 8 | 9 | 10 | def load_backend(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=_default_setup): 11 | if cfg_impl.name == "torch-default": 12 | return initialize_torch(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=setup) 13 | elif cfg_impl.name == "deepspeed": 14 | return initialize_deepspeed(model, dataset, validation_set, tokenizer, cfg_train, cfg_impl, setup=setup) 15 | else: 16 | raise ValueError(f"Invalid backend {cfg_impl.name} given.") 17 | -------------------------------------------------------------------------------- /bert/cramming/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/arch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/arch/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-base.yaml: 
-------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | 
layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c3.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c4.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELUglu 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 4 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: False 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: False # Whether to learn biases on all dense layers 59 | final_norm: True # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: avg 67 | include_ff_layer: True 68 | head_dim: 1024 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-c5.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 16 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELUglu 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: False # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: flash-attention-impl 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: False 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: False 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 57 | layer_drop: 58 | enabled: False # If true, then layers will be dynaically dropped. 59 | max_theta: 0.5 # The maximum probability of keeping a dropping, when the drop schedule is at the end. 
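The layer_drop block above (enabled, max_theta, and the gamma_factor that follows) configures dynamic layer dropping. As an illustrative sketch only — the actual schedule lives in the training code, and the formula below is an assumption in the spirit of progressive layer dropping rather than the repository's implementation — the keep probability can start at 1.0 and decay toward max_theta as training progresses, with gamma_factor setting how quickly that floor is reached:

import math

def keep_probability(progress: float, max_theta: float = 0.5, gamma_factor: float = 100.0) -> float:
    # Assumed progressive-layer-dropping-style schedule: keep every layer at the start
    # (probability 1.0) and decay toward max_theta as progress runs from 0 to 1.
    return (1.0 - max_theta) * math.exp(-gamma_factor * progress) + max_theta

for p in (0.0, 0.01, 0.05, 1.0):
    print(f"progress={p:.2f}  keep_prob={keep_probability(p):.3f}")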
60 | gamma_factor: 100 61 | use_bias: False # Whether to learn biases on all dense layers 62 | final_norm: True # Add a final norm layer before the end 63 | recurrent_layers: 64 | layer_macro_type: transformer # can also be FLASH 65 | 66 | # Downstream settings: 67 | num_labels: # This can be automatically filled in for downstream 68 | classification_head: 69 | pooler: avg 70 | include_ff_layer: True 71 | head_dim: 1024 72 | nonlin: Tanh 73 | classifier_dropout: ${arch.hidden_dropout_prob} 74 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-i4.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # based on amp_b1536_L8_H1088_I4352_H8 4 | 5 | # These are the huggingface bert parameters 6 | architectures: 7 | - ScriptableMaskedLM 8 | 9 | num_transformer_layers: 8 10 | hidden_size: 1088 11 | intermed_size: 4352 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-12 16 | norm_scheme: pre # maybe post is actually better?? 17 | nonlin: GELU # glu? 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: False # Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | z_loss_factor: 0 24 | gradient_checkpointing: False 25 | layer_fusion: True # Fuse transformer layer residual structure 26 | 27 | embedding: 28 | vocab_size: # will be populated automatically 29 | pos_embedding: scaled-sinusoidal 30 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 31 | pad_token_id: 0 32 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 33 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 34 | normalization: True 35 | 36 | attention: 37 | type: self-attention 38 | causal_attention: False 39 | num_attention_heads: 8 40 | dropout_prob: 0.1 41 | skip_output_projection: False 42 | qkv_bias: False 43 | 44 | rotary_embedding: True 45 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 46 | sequence_op: torch-softmax # Can be normalization 47 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 48 | high_level_fusion: True 49 | low_level_fusion: True 50 | 51 | init: 52 | type: normal 53 | std: 0.02 54 | 55 | # Very experimental options: 56 | ffn_layer_frequency: 1 # FFN layer in every layer 57 | deepnorm_scaling: False 58 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size 59 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 60 | use_bias: False # Whether to learn biases on all dense layers 61 | final_norm: True # Add a final norm layer before the end 62 | recurrent_layers: 63 | layer_macro_type: transformer # can also be FLASH 64 | 65 | # Downstream settings: 66 | num_labels: # This can be automatically filled in for downstream 67 | classification_head: 68 | pooler: avg 69 | include_ff_layer: True 70 | head_dim: 1024 71 | nonlin: Tanh 72 | classifier_dropout: ${arch.hidden_dropout_prob} 73 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-large-izsak.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 24 8 | hidden_size: 1024 9 | intermed_size: 4096 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: pre # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 16 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-original.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baselin 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 12 8 | hidden_size: 768 9 | intermed_size: 3072 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-12 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: False # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: False # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 512 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (this is the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be other normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: False 47 | low_level_fusion: False 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer 62 | 63 | # 
Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/bert-tiny.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableMaskedLM 6 | 7 | num_transformer_layers: 4 8 | hidden_size: 384 9 | intermed_size: 1024 10 | hidden_dropout_prob: 0.1 11 | 12 | norm: LayerNorm 13 | norm_eps: 1e-6 14 | norm_scheme: post # can be "pre", "post", "sandwich" 15 | nonlin: GELU 16 | 17 | tie_weights: True # Tie input/output embedding 18 | sparse_prediction: True # Whether to predict only on masked tokens 19 | decoder_bias: True # Whether to include a bias in the decoding step 20 | loss: cross-entropy 21 | z_loss_factor: 0 22 | gradient_checkpointing: False 23 | layer_fusion: True # Fuse transformer layer residual structure 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: learned 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: 96 # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: self-attention 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: False 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: small 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | ffn_layer_frequency: 1 # FFN layer in every layer 55 | deepnorm_scaling: False 56 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | use_bias: True # Whether to learn biases on all dense layers 59 | final_norm: False # Add a final norm layer before the end 60 | recurrent_layers: 61 | layer_macro_type: transformer # can also be FLASH 62 | 63 | # Downstream settings: 64 | num_labels: # This can be automatically filled in for downstream 65 | classification_head: 66 | pooler: zero_index 67 | include_ff_layer: True 68 | head_dim: ${arch.hidden_size} 69 | nonlin: Tanh 70 | classifier_dropout: ${arch.hidden_dropout_prob} 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/funnel-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableFunnelLM 6 | 7 | setup: [128, 64, 32, 16, 8, 4, 2, 4, 8, 16, 32, 64, 128] 8 | num_transformer_layers: 12 9 | 10 | hidden_size: 768 11 | intermed_size: 3072 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-6 16 | norm_scheme: pre # can be "pre", "post", "sandwich" 17 | nonlin: GELU 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: True # Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | z_loss_factor: 0 24 | 25 | embedding: 26 | vocab_size: # will be populated automatically 27 | pos_embedding: scaled-sinusoidal 28 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 29 | pad_token_id: 0 30 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 31 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 32 | normalization: True 33 | 34 | attention: 35 | type: funnel 36 | causal_attention: False 37 | num_attention_heads: 12 38 | dropout_prob: 0.1 39 | skip_output_projection: False 40 | qkv_bias: True 41 | 42 | rotary_embedding: True 43 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. 
the softmax in normal attn) 44 | sequence_op: torch-softmax # Can be normalization 45 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 46 | high_level_fusion: True 47 | low_level_fusion: True 48 | 49 | init: 50 | type: normal 51 | std: 0.02 52 | 53 | # Very experimental options: 54 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 55 | use_bias: True # Whether to learn biases on all dense layers 56 | final_norm: False # Add a final norm layer before the end 57 | 58 | # Downstream settings: 59 | num_labels: # This can be automatically filled in for downstream 60 | classification_head: 61 | pooler: avg 62 | include_ff_layer: True 63 | head_dim: 1024 64 | nonlin: Tanh 65 | classifier_dropout: ${arch.hidden_dropout_prob} 66 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/hf-bert-base.yaml: -------------------------------------------------------------------------------- 1 | # These are the huggingface bert parameters 2 | architectures: 3 | - BertForMaskedLM 4 | 5 | attention_probs_dropout_prob: 0.1 6 | hidden_act: gelu 7 | hidden_dropout_prob: 0.1 8 | hidden_size: 768 9 | initializer_range: 0.02 10 | intermediate_size: 3072 11 | layer_norm_eps: 1e-12 12 | max_position_embeddings: 512 13 | num_attention_heads: 12 14 | num_hidden_layers: 12 15 | pad_token_id: 0 16 | position_embedding_type: absolute 17 | 18 | type_vocab_size: 2 19 | use_cache: true 20 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/hf-bert-tiny.yaml: -------------------------------------------------------------------------------- 1 | # These are the huggingface bert parameters 2 | architectures: 3 | - BertForMaskedLM 4 | 5 | attention_probs_dropout_prob: 0.1 6 | hidden_act: gelu 7 | hidden_dropout_prob: 0.1 8 | hidden_size: 128 9 | initializer_range: 0.02 10 | intermediate_size: 512 11 | layer_norm_eps: 1e-12 12 | max_position_embeddings: 512 13 | num_attention_heads: 2 14 | num_hidden_layers: 2 15 | pad_token_id: 0 16 | position_embedding_type: absolute 17 | 18 | type_vocab_size: 2 19 | use_cache: true 20 | # original bert-tiny hparams from https://github.com/google-research/bert: 21 | # {"hidden_size": 128, 22 | # "hidden_act": "gelu", 23 | # "initializer_range": 0.02, 24 | # "vocab_size": 30522, 25 | # "hidden_dropout_prob": 0.1, 26 | # "num_attention_heads": 2, 27 | # "type_vocab_size": 2, 28 | # "max_position_embeddings": 512, 29 | # "num_hidden_layers": 2, 30 | # "intermediate_size": 512, 31 | # "attention_probs_dropout_prob": 0.1} 32 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/recurrent-c2.yaml: -------------------------------------------------------------------------------- 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline 2 | 3 | # These are the huggingface bert parameters 4 | architectures: 5 | - ScriptableRecurrentLM 6 | 7 | training_scheme: bptt-deepthinking 8 | maximal_recurrence: 12 9 | recurrent_layers: 2 # How deep is the block of transformer layers that is recurring 10 | hidden_size: 768 11 | intermed_size: 3072 12 | hidden_dropout_prob: 0.1 13 | 14 | norm: LayerNorm 15 | norm_eps: 1e-6 16 | norm_scheme: pre # can be "pre", "post", "sandwich" 17 | nonlin: GELU 18 | 19 | tie_weights: True # Tie input/output embedding 20 | sparse_prediction: True # Whether to predict only on masked tokens 21 | decoder_bias: True 
# Whether to include a bias in the decoding step 22 | loss: cross-entropy 23 | 24 | layer_fusion: True # Fuse transformer layer residual structure 25 | 26 | embedding: 27 | vocab_size: # will be populated automatically 28 | pos_embedding: scaled-sinusoidal 29 | dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT 30 | pad_token_id: 0 31 | max_seq_length: 128 # max seq length that the positional embedding is instantiated for 32 | embedding_dim: ${arch.hidden_size} # can be smaller than hidden size (this is the ALBERT trick) 33 | normalization: True 34 | 35 | attention: 36 | type: self-attention 37 | causal_attention: False 38 | num_attention_heads: 12 39 | dropout_prob: 0.1 40 | skip_output_projection: False 41 | qkv_bias: True 42 | 43 | rotary_embedding: True 44 | seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn) 45 | sequence_op: torch-softmax # Can be normalization 46 | # hybrid_layers: [10, 11] # Only used when type=fourier-hybrid to denote self-attention layers 47 | high_level_fusion: True 48 | low_level_fusion: True 49 | 50 | init: 51 | type: normal 52 | std: 0.02 53 | 54 | # Very experimental options: 55 | ffn_layer_frequency: 1 # FFN layer in every layer 56 | deepnorm_scaling: False 57 | layer_drop_theta: # Set to a non-null value to dynamically drop layers 58 | skip_head_transform: False # This is only possible if embedding_dim=hidden_size 59 | use_bias: True # Whether to learn biases on all dense layers 60 | 61 | # Downstream settings: 62 | num_labels: # This can be automatically filled in for downstream 63 | classification_head: 64 | pooler: avg 65 | include_ff_layer: True 66 | head_dim: 1024 67 | nonlin: Tanh 68 | classifier_dropout: ${arch.hidden_dropout_prob} 69 | 70 | num_transformer_layers: ${arch.maximal_recurrence} # only for compatibility with other archs 71 | -------------------------------------------------------------------------------- /bert/cramming/config/arch/sanitycheck.yaml: -------------------------------------------------------------------------------- 1 | architectures: 2 | - SanityCheckLM 3 | 4 | width: 8352 5 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_eval.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | defaults: 4 | - impl: torch-default 5 | - wandb: default 6 | - eval: mnli 7 | - _self_ 8 | - override hydra/job_logging: custom 9 | 10 | wandb: 11 | project: cramming-eval 12 | 13 | base_dir: outputs 14 | hydra: 15 | sweep: 16 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 17 | run: 18 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 19 | job: 20 | chdir: True 21 | 22 | seed: # Optional: Set initial seed 23 | 24 | # A name for this run [will draw the checkpoint from runs with this name 25 | # and use this name for the summary table and outputs folder] 26 | name: default 27 | # If set, override the name on wandb. Otherwise, uses name above. 
28 | wandb_name: 29 | 30 | # debug implementation by running every loop just once: 31 | dryrun: False 32 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_eval_pt.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | defaults: 4 | - arch: bert-c5 5 | - data: c4-subset-random #bookcorpus-wikipedia 6 | - impl: torch-default 7 | - wandb: default 8 | - eval: save_losses_rho_loss 9 | - train: bert-o3 10 | - _self_ 11 | - override hydra/job_logging: custom 12 | 13 | wandb: 14 | project: cramming-eval 15 | 16 | base_dir: outputs 17 | hydra: 18 | sweep: 19 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | run: 21 | dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S} 22 | job: 23 | chdir: True 24 | 25 | seed: 0 # Optional: Set initial seed 26 | 27 | # A name for this run [will draw the checkpoint from runs with this name 28 | # and use this name for the summary table and outputs folder] 29 | name: default 30 | budget: 96 31 | # debug implementation by running every loop just once: 32 | dryrun: False 33 | 34 | train: 35 | validation_set: 36 | enabled: true 37 | fraction: 0.001 38 | 39 | truncate_dataset: 0 -------------------------------------------------------------------------------- /bert/cramming/config/cfg_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | # default settings run a sanity check with a small model and test data. 4 | defaults: 5 | - arch: bert-c5 6 | - data: c4-subset-random #bookcorpus-wikipedia 7 | - impl: torch-default 8 | - wandb: default 9 | - train: bert-o3 10 | - _self_ 11 | - override hydra/job_logging: custom 12 | 13 | base_dir: outputs 14 | hydra: 15 | sweep: 16 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 17 | run: 18 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 19 | job: 20 | chdir: True 21 | 22 | seed: 0 # Optional: Set initial seed 23 | name: default # A name for this run [will be used for the summary table and outputs folder] 24 | 25 | # If a number, then the total compute budget in hours. If "steps", then instead train 26 | # for the number of steps given by train.steps. 27 | budget: 24 28 | # debug implementation by running every loop just once: 29 | dryrun: False 30 | -------------------------------------------------------------------------------- /bert/cramming/config/cfg_save_losses.yaml: -------------------------------------------------------------------------------- 1 | # Configuration defaults 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams 3 | # default settings run a sanity check with a small model and test data. 
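The cfg_* files above are the Hydra entry points: each defaults: list pulls one file from every config group (arch, data, impl, train, eval, wandb), and ${...} references such as ${arch.hidden_size} or ${base_dir}/${name} are resolved by OmegaConf when the composed config is accessed. A minimal sketch of that interpolation behaviour with plain OmegaConf — the nested values below are abbreviated stand-ins for the real group files, and the ${now:...} resolver used in the hydra output dirs is omitted because it is registered by Hydra itself:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "base_dir": "outputs",
    "name": "default",
    "arch": {
        "hidden_size": 768,
        "hidden_dropout_prob": 0.1,
        "classification_head": {
            "head_dim": "${arch.hidden_size}",                 # same pattern as in the arch configs
            "classifier_dropout": "${arch.hidden_dropout_prob}",
        },
    },
    "out_dir": "${base_dir}/${name}/pretrain",                 # simplified version of the hydra run dir
})

print(cfg.arch.classification_head.head_dim)   # -> 768, resolved through the interpolation
print(cfg.out_dir)                             # -> outputs/default/pretrain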
4 | defaults: 5 | - arch: bert-c5 6 | - data: c4-subset-random #bookcorpus-wikipedia 7 | - impl: save_losses_rho_loss 8 | - wandb: default 9 | - eval: save_losses_rho_loss 10 | - train: bert-o3 11 | - _self_ 12 | - override hydra/job_logging: custom 13 | 14 | base_dir: outputs 15 | hydra: 16 | sweep: 17 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 18 | run: 19 | dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | job: 21 | chdir: True 22 | 23 | seed: 0 # Optional: Set initial seed 24 | name: rho_loss_save_losses # A name for this run [will be used for the summary table and outputs folder] 25 | budget: 24 26 | 27 | # debug implementation by running every loop just once: 28 | dryrun: False 29 | -------------------------------------------------------------------------------- /bert/cramming/config/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/data/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/data/bert-default.yaml: -------------------------------------------------------------------------------- 1 | # This is the "default" BERT dataset 2 | name: bookcorpus-wikitext 3 | defaults: 4 | - sources: 5 | - bookcorpus 6 | - wikipedia 7 | 8 | # Preprocessing 9 | normalizer: # This is ignored and the default bert normalizer is used instead 10 | force_lowercase: # True 11 | strip_accents: # True 12 | force_english_keyboard: # False 13 | whitespace_escape: # False 14 | tokenizer: bert-base-uncased 15 | vocab_size: 30522 16 | 17 | # Dataset Formation 18 | seq_length: 512 19 | include_cls_token_in_corpus: # True, but ignored and the default post_processor is used 20 | include_sep_token_in_corpus: # True, but ignored and the default post_processor is used 21 | use_type_ids: # True 22 | max_entries_in_raw_dataset: 1e14 # Select no more than this number of examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e14 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/bookcorpus-wikipedia.yaml: -------------------------------------------------------------------------------- 1 | # This is a modernized/sanitized config for bookcorpus-wikipedia 2 | name: bookcorpus-wikitext 3 | defaults: 4 | - sources: 5 | - bookcorpus 6 | - wikipedia 7 | 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset-processed.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset 3 | defaults: 4 | - sources: 5 | - c4 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: True 30 | trash_cutoff: 0.25 31 | deduplicate_entries: True 32 | deduplication_threshold: 75 33 | 34 | # Data Order: 35 | ordering: sentence-length-curriculum # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset-random.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset-random 3 | defaults: 4 | - sources: 5 | - c4_non_streaming 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.25 31 | deduplicate_entries: False 32 | deduplication_threshold: 75 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/c4-subset.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: c4-subset 3 | defaults: 4 | - sources: 5 | - c4 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/minipile.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of C4 2 | name: minipile 3 | defaults: 4 | - sources: 5 | - minipile 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sanity-check-1.yaml: -------------------------------------------------------------------------------- 1 | # Just a bunch of fake data ... 
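The max_seq_in_tokenized_dataset caps in the data configs above follow the rule of thumb repeated in their comments: tokenize only slightly more than one compute budget can consume in a single epoch. A quick back-of-the-envelope check of what the 85e6 cap in c4-subset-random implies (illustrative arithmetic; the throughput that falls out is not a measured number):

budget_hours = 24      # default budget in cfg_pretrain.yaml
seq_cap = 85e6         # max_seq_in_tokenized_dataset in c4-subset-random.yaml
seq_length = 128

implied_seqs_per_sec = seq_cap / (budget_hours * 3600)
print(f"cap of {seq_cap:,.0f} sequences -> ~{implied_seqs_per_sec:,.0f} seq/s "
      f"(~{implied_seqs_per_sec * seq_length / 1e3:.0f}k tokens/s) over a {budget_hours}h budget")
# ~984 seq/s, i.e. the cap is sized for roughly what one 24-hour budget can actually train on.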
2 | name: sanity-check-1 3 | defaults: 4 | - sources: 5 | - fake 6 | 7 | # 8 | # Preprocessing 9 | normalizer: # This is ignored and the default bert normalizer is used instead 10 | force_lowercase: 11 | strip_accents: 12 | force_english_keyboard: 13 | whitespace_escape: 14 | tokenizer: bert-base-uncased 15 | vocab_size: 30522 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: 20 | include_sep_token_in_corpus: 21 | use_type_ids: 22 | max_entries_in_raw_dataset: 1e12 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e12 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sanity-check-2.yaml: -------------------------------------------------------------------------------- 1 | # Just a tiny test dataset ... 2 | name: sanity-check-2 3 | # https://hydra.cc/docs/patterns/select_multiple_configs_from_config_group/ 4 | defaults: 5 | - sources: 6 | - ag_news 7 | 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: BPE # faster for sanity checks 15 | vocab_size: 32768 # to make sure there are not memory surprises compared to the actual data 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: False 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset 23 | max_seq_in_tokenized_dataset: 1e10 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/ag_news.yaml: -------------------------------------------------------------------------------- 1 | # For sanity testing 2 | ag_news: 3 | provider: huggingface 4 | partition: default 5 | split: train 6 | 7 | streaming: False 8 | 9 | remove_columns: label 10 | concatenate_successive_entries: 0 11 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/bookcorpus.yaml: -------------------------------------------------------------------------------- 1 | # The bookcorpus dataset, drawn from it huggingface mirror 2 | bookcorpus: 3 | provider: huggingface 4 | partition: plain_text 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 
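Each file under data/sources describes one raw corpus. For provider: huggingface entries, the partition / split / streaming fields map directly onto datasets.load_dataset arguments; a hedged sketch of that mapping (the repository's own loader layers cleaning, column removal and entry concatenation on top, all omitted here):

from typing import Optional

from datasets import load_dataset

def load_source(name: str, partition: Optional[str], split: str, streaming: bool):
    # "partition" is the dataset config name ("plain_text" for bookcorpus, "en" for c4,
    # "20220301.en" for wikipedia); None or empty means the dataset's default config.
    if partition:
        return load_dataset(name, partition, split=split, streaming=streaming)
    return load_dataset(name, split=split, streaming=streaming)

news = load_source("ag_news", None, "train", streaming=False)  # the sanity-check source above
print(next(iter(news))["text"][:80])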
10 | remove_columns: 11 | concatenate_successive_entries: 16 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/c4.yaml: -------------------------------------------------------------------------------- 1 | # The C4 en dataset, drawn from its huggingface mirror 2 | c4: 3 | provider: huggingface 4 | partition: en 5 | split: train 6 | 7 | streaming: True 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/c4_non_streaming.yaml: -------------------------------------------------------------------------------- 1 | # The C4 en dataset, drawn from its huggingface mirror 2 | c4: 3 | provider: huggingface 4 | partition: en 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/fake.yaml: -------------------------------------------------------------------------------- 1 | # Just a bunch of fake data ... 2 | fake: 3 | provider: fake 4 | split: 5 | 6 | randgen_seed: 0 7 | size: 2048 8 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/minipile.yaml: -------------------------------------------------------------------------------- 1 | # The minipile dataset, drawn from its huggingface mirror 2 | JeanKaddour/minipile: 3 | provider: huggingface 4 | partition: null 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules?
10 | remove_columns: 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pile.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pile: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | # pile_set_name: 38 | # possible pile_set_name values are 39 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 40 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 41 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 42 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 43 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 44 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 45 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 46 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 47 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 48 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 49 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 50 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 51 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 52 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 53 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 54 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 55 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 56 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 57 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 58 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 59 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 60 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 61 | split: train 62 | streaming: True 63 | 64 | # source-specific cleaning rules? 
65 | remove_columns: 66 | concatenate_successive_entries: 0 67 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pileCC.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pileCC: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | pile_set_name: 38 | - Pile-CC 39 | # possible pile_set_name values are 40 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 41 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 42 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 43 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 44 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 45 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 46 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 47 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 48 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 49 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 50 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 51 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 52 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 53 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 54 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 55 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 56 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 57 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 58 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 59 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 60 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 61 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 62 | split: train 63 | streaming: True 64 | 65 | # source-specific cleaning rules? 
66 | remove_columns: 67 | concatenate_successive_entries: 0 68 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/the_pile_natural.yaml: -------------------------------------------------------------------------------- 1 | # 2 | the_pile_natural: 3 | provider: local 4 | file_type: json 5 | files: 6 | - "/fs/cml-datasets/Pile/train/00.jsonl.zst" 7 | - "/fs/cml-datasets/Pile/train/01.jsonl.zst" 8 | - "/fs/cml-datasets/Pile/train/02.jsonl.zst" 9 | - "/fs/cml-datasets/Pile/train/03.jsonl.zst" 10 | - "/fs/cml-datasets/Pile/train/04.jsonl.zst" 11 | - "/fs/cml-datasets/Pile/train/05.jsonl.zst" 12 | - "/fs/cml-datasets/Pile/train/06.jsonl.zst" 13 | - "/fs/cml-datasets/Pile/train/07.jsonl.zst" 14 | - "/fs/cml-datasets/Pile/train/08.jsonl.zst" 15 | - "/fs/cml-datasets/Pile/train/09.jsonl.zst" 16 | - "/fs/cml-datasets/Pile/train/10.jsonl.zst" 17 | - "/fs/cml-datasets/Pile/train/11.jsonl.zst" 18 | - "/fs/cml-datasets/Pile/train/12.jsonl.zst" 19 | - "/fs/cml-datasets/Pile/train/13.jsonl.zst" 20 | - "/fs/cml-datasets/Pile/train/14.jsonl.zst" 21 | - "/fs/cml-datasets/Pile/train/15.jsonl.zst" 22 | - "/fs/cml-datasets/Pile/train/16.jsonl.zst" 23 | - "/fs/cml-datasets/Pile/train/17.jsonl.zst" 24 | - "/fs/cml-datasets/Pile/train/18.jsonl.zst" 25 | - "/fs/cml-datasets/Pile/train/19.jsonl.zst" 26 | - "/fs/cml-datasets/Pile/train/20.jsonl.zst" 27 | - "/fs/cml-datasets/Pile/train/21.jsonl.zst" 28 | - "/fs/cml-datasets/Pile/train/22.jsonl.zst" 29 | - "/fs/cml-datasets/Pile/train/23.jsonl.zst" 30 | - "/fs/cml-datasets/Pile/train/24.jsonl.zst" 31 | - "/fs/cml-datasets/Pile/train/25.jsonl.zst" 32 | - "/fs/cml-datasets/Pile/train/26.jsonl.zst" 33 | - "/fs/cml-datasets/Pile/train/27.jsonl.zst" 34 | - "/fs/cml-datasets/Pile/train/28.jsonl.zst" 35 | - "/fs/cml-datasets/Pile/train/29.jsonl.zst" 36 | filter: 37 | pile_set_name: 38 | - Gutenberg 39 | - Books3 40 | - Wikipedia (en) 41 | # possible pile_set_name values are 42 | # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB 43 | # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB 44 | # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB 45 | # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB 46 | # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB 47 | # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB 48 | # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB 49 | # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB 50 | # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB 51 | # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB 52 | # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB 53 | # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB 54 | # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB 55 | # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB 56 | # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB 57 | # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB 58 | # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB 59 | # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB 60 | # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB 61 | # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB 62 | # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB 63 | # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB 64 | split: train 65 | streaming: True 66 | 67 | # source-specific cleaning rules? 
68 | remove_columns: 69 | concatenate_successive_entries: 0 70 | -------------------------------------------------------------------------------- /bert/cramming/config/data/sources/wikipedia.yaml: -------------------------------------------------------------------------------- 1 | # The wikipedia en dataset, drawn from it huggingface mirror 2 | wikipedia: 3 | provider: huggingface 4 | partition: 20220301.en 5 | split: train 6 | 7 | streaming: False 8 | 9 | # source-specific cleaning rules? 10 | remove_columns: title 11 | concatenate_successive_entries: 0 12 | -------------------------------------------------------------------------------- /bert/cramming/config/data/the-pile-natural.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of the pile 2 | name: the_pile 3 | defaults: 4 | - sources: 5 | - the_pile_natural 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 # 2^15 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 2e6 # This comes out to about 40mio 128-seq entries. Original examples are a bit longer here than the average 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/data/the-pile.yaml: -------------------------------------------------------------------------------- 1 | # This would be a slice of the pile 2 | name: the_pile 3 | defaults: 4 | - sources: 5 | - the_pile 6 | 7 | # 8 | # Preprocessing 9 | normalizer: 10 | force_lowercase: True 11 | strip_accents: True 12 | force_english_keyboard: True 13 | whitespace_escape: False 14 | tokenizer: WordPiece 15 | vocab_size: 32768 16 | 17 | # Dataset Formation 18 | seq_length: 128 19 | include_cls_token_in_corpus: False 20 | include_sep_token_in_corpus: True 21 | use_type_ids: False 22 | max_entries_in_raw_dataset: 4e6 # About 40 mio seqs of length 128 23 | max_seq_in_tokenized_dataset: 35e6 # Select only this many tokenized sequences. 
24 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training 25 | 26 | # Data Cleaning: 27 | named_entity_simplification: False 28 | remove_whitespaces: False 29 | remove_trash: False 30 | trash_cutoff: 0.3 31 | deduplicate_entries: False 32 | deduplication_threshold: 100 33 | 34 | # Data Order: 35 | ordering: randomized # could be a curriculum 36 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUE.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - cola 5 | - mnli 6 | - mrpc 7 | - qnli 8 | - qqp 9 | - rte 10 | - sst2 11 | - stsb 12 | # - wnli 13 | 14 | evaluation_set: validation # always keep this at validation except for the final run 15 | 16 | # checkpoint name: 17 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 18 | checkpoint: latest 19 | path: ${impl.path} # Path for caches of datasets and tokenizers 20 | max_seq_length: 128 21 | 22 | # Default options: 23 | # These can be overwritten by specific tasks 24 | batch_size: 32 25 | batch_size_ramp: 0 26 | 27 | gradient_clipping: 28 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 29 | scheduler: 30 | optim_mod: 31 | name: none 32 | 33 | epochs: 5 34 | 35 | # These options are only used for scheduling: 36 | warmup_steps: 1000 37 | cooldown_steps: 0 38 | steps: 10_000 39 | 40 | testing: 41 | batch_size: 128 42 | 43 | arch_modifications: 44 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUE_sane.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - cola 5 | - mnli 6 | - mrpc 7 | - qnli 8 | - qqp 9 | - rte 10 | - sst2 11 | - stsb 12 | # - wnli 13 | 14 | metrics_to_average: 15 | - qqp_f1 16 | - qnli_accuracy 17 | - mrpc_f1 18 | - mnli_accuracy_extra 19 | - mnli_accuracy 20 | - stsb_pearson 21 | - sst2_accuracy 22 | - rte_accuracy 23 | - cola_matthews_correlation 24 | 25 | optim: 26 | lr: 4e-5 27 | 28 | evaluation_set: validation # always keep this at validation except for the final run 29 | 30 | # checkpoint name: 31 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 32 | checkpoint: latest 33 | # Set this to a non-empty value to specify a particular model file to load. 
34 | model_pth: 35 | 36 | path: ${impl.path} # Path for caches of datasets and tokenizers 37 | max_seq_length: 128 38 | 39 | # Default options: 40 | # These can be overwritten by specific tasks 41 | batch_size: 16 42 | batch_size_ramp: 0 43 | 44 | gradient_clipping: 45 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 46 | scheduler: cosine-decay 47 | optim_mod: 48 | name: none 49 | 50 | epochs: 5 51 | 52 | # These options are only used for scheduling: 53 | warmup_steps: 0.1 54 | cooldown_steps: 0 55 | steps: 10_000 56 | 57 | testing: 58 | batch_size: 128 59 | 60 | arch_modifications: 61 | classification_head: 62 | pooler: zero_index 63 | include_ff_layer: True 64 | # head_dim: ${arch.hidden_size} 65 | nonlin: Tanh 66 | # classifier_dropout: ${arch.hidden_dropout_prob} 67 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/GLUEmosbach.yaml: -------------------------------------------------------------------------------- 1 | # On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines 2 | 3 | defaults: 4 | - optim: adam 5 | - tasks: 6 | - cola 7 | - mnli 8 | - mrpc 9 | - qnli 10 | - qqp 11 | - rte 12 | - sst2 13 | - stsb 14 | # - wnli 15 | 16 | optim: 17 | weight_decay: 0.01 18 | betas: 19 | - 0.9 20 | - 0.999 21 | 22 | evaluation_set: validation # always keep this at validation except for the final run 23 | 24 | # checkpoint name: 25 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 26 | checkpoint: latest 27 | path: ${impl.path} # Path for caches of datasets and tokenizers 28 | max_seq_length: 128 29 | 30 | # Default options: 31 | # These can be overwritten by specific tasks 32 | batch_size: 16 33 | batch_size_ramp: 0 34 | 35 | gradient_clipping: 36 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 37 | scheduler: linear 38 | optim_mod: 39 | name: none 40 | 41 | epochs: 20 42 | 43 | # These options are only used for scheduling: 44 | warmup_steps: 0.1 45 | cooldown_steps: 0 46 | steps: 10_000 47 | 48 | testing: 49 | batch_size: 128 50 | 51 | arch_modifications: 52 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/SuperGLUE.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - boolq 5 | - cb 6 | - copa 7 | - multirc 8 | - rte_superglue 9 | - wic 10 | - wsc 11 | 12 | metrics_to_average: 13 | - boolq_accuracy 14 | - cb_f1 15 | - copa_accuracy 16 | - multirc_f1_a 17 | - rte_accuracy 18 | - wic_accuracy 19 | - "wsc.fixed_accuracy" 20 | 21 | optim: 22 | lr: 5e-5 23 | 24 | evaluation_set: validation # always keep this at validation except for the final run 25 | 26 | # checkpoint name: 27 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 28 | checkpoint: latest 29 | # Set this to a non-empty value to specify a particular model file to load. 
30 | model_pth: 31 | 32 | path: ${impl.path} # Path for caches of datasets and tokenizers 33 | max_seq_length: 128 34 | 35 | # Default options: 36 | # These can be overwritten by specific tasks 37 | batch_size: 16 38 | batch_size_ramp: 0 39 | 40 | gradient_clipping: 41 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 42 | scheduler: cosine-decay 43 | optim_mod: 44 | name: none 45 | 46 | epochs: 10 47 | 48 | # These options are only used for scheduling: 49 | warmup_steps: 0.1 50 | cooldown_steps: 0 51 | steps: 10_000 52 | 53 | testing: 54 | batch_size: 128 55 | 56 | arch_modifications: 57 | classification_head: 58 | pooler: zero_index 59 | include_ff_layer: True 60 | # head_dim: ${arch.hidden_size} 61 | nonlin: Tanh 62 | # classifier_dropout: ${arch.hidden_dropout_prob} 63 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/eval/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/eval/boolq.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - boolq 5 | 6 | metrics_to_average: 7 | - boolq_accuracy 8 | 9 | optim: 10 | lr: 4e-5 11 | 12 | evaluation_set: validation # always keep this at validation except for the final run 13 | 14 | # checkpoint name: 15 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder 16 | checkpoint: latest 17 | # Set this to a non-empty value to specify a particular model file to load. 
18 | model_pth: 19 | 20 | path: ${impl.path} # Path for caches of datasets and tokenizers 21 | max_seq_length: 128 22 | 23 | # Default options: 24 | # These can be overwritten by specific tasks 25 | batch_size: 16 26 | batch_size_ramp: 0 27 | 28 | gradient_clipping: 29 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 30 | scheduler: cosine-decay 31 | optim_mod: 32 | name: none 33 | 34 | epochs: 10 35 | 36 | # These options are only used for scheduling: 37 | warmup_steps: 0.1 38 | cooldown_steps: 0 39 | steps: 10_000 40 | 41 | testing: 42 | batch_size: 128 43 | 44 | arch_modifications: 45 | classification_head: 46 | pooler: zero_index 47 | include_ff_layer: True 48 | # head_dim: ${arch.hidden_size} 49 | nonlin: Tanh 50 | # classifier_dropout: ${arch.hidden_dropout_prob} 51 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/mnli.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: adam 3 | - tasks: 4 | - mnli 5 | 6 | optim: 7 | weight_decay: 0.01 8 | 9 | evaluation_set: validation # always keep this at validation except for the final run 10 | 11 | checkpoint: latest 12 | path: ~/data/ # Path for caches of datasets and tokenizers 13 | max_seq_length: 128 14 | 15 | # Default options: 16 | # These can be overwritten by specific tasks 17 | batch_size: 32 18 | batch_size_ramp: 0 19 | 20 | gradient_clipping: 21 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 22 | scheduler: linear 23 | optim_mod: 24 | name: none 25 | 26 | epochs: 10 27 | 28 | # These options are only used for scheduling: 29 | warmup_steps: 0.1 30 | cooldown_steps: 0 31 | steps: 10_000 32 | 33 | arch_modifications: 34 | classification_head: 35 | pooler: zero_index 36 | include_ff_layer: True 37 | # head_dim: ${arch.hidden_size} 38 | nonlin: Tanh 39 | # classifier_dropout: ${arch.hidden_dropout_prob} 40 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/optim/adam.yaml: -------------------------------------------------------------------------------- 1 | type: AdamW 2 | 3 | lr: 2e-5 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.00 # no wd in finetuning?? 
9 | amsgrad: False 10 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/save_losses_rho_loss.yaml: -------------------------------------------------------------------------------- 1 | arch_modifications: null 2 | evaluation_set: validation # always keep this at validation except for the final run 3 | 4 | checkpoint: latest 5 | path: ~/data/ # Path for caches of datasets and tokenizers 6 | max_seq_length: 128 7 | model_pth: null 8 | 9 | # Default options: 10 | # These can be overwritten by specific tasks 11 | batch_size: 96 12 | batch_size_ramp: 0 13 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/boolq.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | boolq: 3 | collection: super_glue 4 | regression: False 5 | structure: [question, passage] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/cb.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | cb: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/cola.yaml: -------------------------------------------------------------------------------- 1 | # COLA-specific settings 2 | cola: 3 | collection: glue 4 | regression: False 5 | structure: [sentence, null] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/copa.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | copa: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, choice1, choice2, question] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/mnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | mnli: 3 | collection: glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/mrpc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | mrpc: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/multirc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | multirc: 3 | collection: super_glue 4 | regression: False 5 | structure: [paragraph, question, answer] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/qnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | qnli: 3 | collection: glue 4 | regression: False 5 | structure: [question, sentence] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/qqp.yaml: -------------------------------------------------------------------------------- 1 | # 
dataset-specific settings 2 | qqp: 3 | collection: glue 4 | regression: False 5 | structure: [question1, question2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/record.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | record: 3 | collection: super_glue 4 | regression: False 5 | structure: [passage, query] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/rte.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | rte: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/rte_superglue.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | rte: 3 | collection: super_glue 4 | regression: False 5 | structure: [premise, hypothesis] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/sst2.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | sst2: 3 | collection: glue 4 | regression: False 5 | structure: [sentence, null] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/stsb.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | stsb: 3 | collection: glue 4 | regression: True 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wic.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | wic: 3 | collection: super_glue 4 | regression: False 5 | structure: [word, sentence1, sentence2] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wnli.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | wnli: 3 | collection: glue 4 | regression: False 5 | structure: [sentence1, sentence2] 6 | -------------------------------------------------------------------------------- /bert/cramming/config/eval/tasks/wsc.yaml: -------------------------------------------------------------------------------- 1 | # dataset-specific settings 2 | "wsc.fixed": 3 | collection: super_glue 4 | regression: False 5 | structure: [text, span1_text, span2_text] 6 | label: -------------------------------------------------------------------------------- /bert/cramming/config/hydra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/hydra/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | # python logging configuration for tasks 2 | version: 1 3 | formatters: 4 | simple: 5 | format: "[%(asctime)s] %(message)s" 6 | handlers: 7 | 
console: 8 | class: logging.StreamHandler 9 | formatter: simple 10 | stream: ext://sys.stdout 11 | file: 12 | class: logging.FileHandler 13 | formatter: simple 14 | # relative to the job log directory 15 | filename: ${name}_${hydra.job.name}.log 16 | root: 17 | level: INFO 18 | handlers: [console, file] 19 | 20 | disable_existing_loggers: false 21 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/impl/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/impl/_default.yaml: -------------------------------------------------------------------------------- 1 | # Settings for implementation details 2 | # These settings "should" not influence the outcome of the computation in major ways, only its speed. 3 | 4 | # This is the main folder where data will be stored (such as caches of datasets and tokenizers): 5 | # This can be an absolute path (which will be honored) or a relative path 6 | # The relative path will be executed relative to the cfg.base_dir 7 | # This behavior is controlled in the main_launcher 8 | path: data 9 | 10 | # data implementation: 11 | defaults: 12 | - data_structure: from-disk # can be LMDB or RAM or None to load directly from disk 13 | local_staging_dir: # Optionally copy a preprocessed dataset into this folder before loading it for training 14 | forbid_dataset_preprocessing: False 15 | temporary_corpus: False # Save data directly into local staging dir, forget after use 16 | max_raw_chunk_size: 1e14 17 | 18 | # validation 19 | validate_every_hours: 6 20 | 21 | # checkpointing and logging: 22 | print_loss_every_nth_step: 1000 23 | save_intermediate_checkpoints: False 24 | save_every_nth_step: 10000000 25 | 26 | # early termination, cancel runs that do not meet this loss threshold early. 27 | early_termination: 28 | enabled: False 29 | budget: 3 # budget in hours 30 | loss_threshold: 6.0 # modify this for non-xent losses 31 | 32 | # Batch size settings: 33 | # batch_size: This is handled in train after commit 982a4d33cd7f79a48b691114ae78f6ad1cdbee69 34 | microbatch_size: 128 # dont make it larger than batch_size... 35 | 36 | # Basic pytorch settings 37 | threads: 8 # maximal number of cpu dataloader workers used per GPU, this value will never exceed num_gpus * num_physical threads 38 | benchmark: True # CUDNN benchmarking 39 | deterministic: False # This option will disable non-deterministic ops 40 | non_blocking: True # unblocked .to(device) handles 41 | tf32_allowed: True 42 | 43 | # JIT: 44 | jit: # Global JIT. Can be "script" (but this doesnt work for huggingface models) or "trace" (but trace does not work with AMP) 45 | jit_instruction_type: nvfuser-profiler 46 | trace_shape: 47 | # If jit=trace, then this is the traced shape 48 | # - ${impl.microbatch_size} 49 | # - ${data.seq_length} 50 | no_jit_compilation: False # Optionaly disable all torch.jit calls 51 | 52 | # Dataloader multiprocessing 53 | pad_to_multiple_of: 8 # padding in dataloader during downstream 54 | shuffle_in_dataloader: False # There is still shuffling in the preprocessing pipeline. 
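# Note: pad_to_multiple_of=8 above keeps padded sequence lengths on tensor-core-friendly multiples, which generally helps fp16/AMP throughput.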
55 | pin_memory: True 56 | prefetch_factor: 2 57 | persistent_workers: True # this clashes with pin_memory in pytorch<1.7.1 58 | 59 | # Default floating point precision: 60 | default_precision: float # needs to be a pytorch datatype 61 | 62 | # Distributed training 63 | backend: nccl 64 | sharing_strategy: file_descriptor 65 | 66 | # Misc: 67 | enable_huggingface_offline_mode: False 68 | local_rank: # This is set automatically by the system_startup 69 | 70 | push_to_huggingface_hub: False 71 | hf_directoy_name: "test-crammedBERT-c5" # set a clever name here! 72 | 73 | # Other constants: 74 | # OMP_NUM_THREADS:[number_of_physical_cores] 75 | # OMP_SCHEDULE: # STATIC 76 | # OMP_PROC_BIND: # CLOSE 77 | # GOMP_CPU_AFFINITY: # "N-M" 78 | # KMP_AFFINITY: # "granularity=fine,compact,1,0" 79 | # KMP_BLOCKTIME: # 1 80 | # optional_ld_preloads: 81 | # - libiomp5.so 82 | # - jemalloc.so 83 | 84 | # 85 | # ### jemalloc 86 | # export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 87 | # export LD_PRELOAD=/home/mingfeim/packages/jemalloc-5.2.1/lib/libjemalloc.so 88 | # 89 | # ### tcmalloc 90 | # export LD_PRELOAD=/home/mingfeim/packages/gperftools-2.8/install/lib/libtcmalloc.so 91 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/LMDB.yaml: -------------------------------------------------------------------------------- 1 | # This configuration caches the dataset in an LMDB 2 | name: LMDB 3 | draw_cache_directly: False 4 | 5 | # writing: 6 | rebuild_existing_database: False 7 | write_frequency: 50_000 # how often to flush during database creation 8 | shuffle_while_writing: False 9 | 10 | # reading: 11 | max_readers: 128 12 | readahead: True # this should be beneficial for long sequential reads 13 | meminit: True 14 | max_spare_txns: 128 15 | 16 | access: get # cursor or get 17 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/RAM.yaml: -------------------------------------------------------------------------------- 1 | # This configuration caches the dataset in RAM 2 | name: RAM 3 | draw_cache_directly: False 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/from-disk.yaml: -------------------------------------------------------------------------------- 1 | # Here the data is just read from disk on the fly 2 | name: from-disk 3 | draw_cache_directly: False 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/data_structure/none.yaml: -------------------------------------------------------------------------------- 1 | # Here the data is just read from disk on the fly 2 | name: none 3 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/deepspeed-hf.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is a subset of the deepspeed hyperparameters. 
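# Selected through Hydra's config groups, e.g. (illustrative override; the launcher script name is assumed): python pretrain.py impl=deepspeed-hf data=the-pile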
2 | name: deepspeed 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | # Dynamo 8 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 9 | 10 | train_batch_size: ${train.batch_size} # can be "auto" 11 | train_micro_batch_size_per_gpu: 128 # can be "auto" 12 | 13 | optimizer: ${train.optim} 14 | gradient_clipping: ${train.gradient_clipping} 15 | # DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, 16 | # and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. 17 | 18 | # This scheduler is not quite the same as the schedulers called via huggingface. YMMV 19 | scheduler: 20 | type: WarmupDecayLR 21 | params: 22 | warmup_min_lr: 0 23 | warmup_max_lr: ${train.optim.lr} 24 | warmup_num_steps: ${train.warmup_steps} 25 | warmup_type: linear 26 | total_num_steps: ${train.steps} 27 | 28 | # communication_data_type: # this should be good in the default setting 29 | # prescale_gradients: False # this should be good in the default setting 30 | # gradient_predivide_factor: 1.0 31 | 32 | # Do not combine these with AMP: 33 | fp16: 34 | enabled: False # can be "auto" 35 | loss_scale: 0 36 | initial_scale_power: 16 37 | loss_scale_window: 1000 38 | hysteresis: 2 39 | min_loss_scale: 1 40 | 41 | zero_optimization: 42 | # stage 0, 1, 2, and 3 refer to 43 | # 0) disabled 44 | # 1) optimizer state partitioning 45 | # 2) optimizer+gradient state partitioning 46 | # 3) optimizer+gradient+parameter partitioning 47 | stage: 3 # [0|1|2|3] 48 | overlap_comm: True # Attempts to overlap the reduction of the gradients with backward computation 49 | reduce_scatter: True # Uses reduce or reduce scatter instead of allreduce to average gradients 50 | reduce_bucket_size: 1e6 # Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes 51 | contiguous_gradients: True # Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. 52 | 53 | # Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. 54 | offload_param: 55 | device: cpu 56 | pin_memory: True 57 | 58 | # Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. 59 | # This frees up GPU memory for larger models or batch sizes. Valid only with stage 2 and 60 | # Only include these options if stage=2 or higher: 61 | offload_optimizer: 62 | device: cpu 63 | pin_memory: True 64 | 65 | stage3_max_live_parameters: 1e9 # The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 66 | stage3_max_reuse_distance: 1e9 # Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. 67 | stage3_prefetch_bucket_size: 0.94e6 # can be "auto" # The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. 68 | stage3_param_persistence_threshold: 1e4 # can be "auto" # Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 
69 | 70 | sub_group_size: 1e9 71 | stage3_gather_16bit_weights_on_model_save: True # [true|false] 72 | 73 | steps_per_print: ${impl.print_loss_every_nth_step} 74 | wall_clock_breakdown: False 75 | dump_state: False 76 | 77 | flops_profiler: 78 | enabled: False 79 | profile_step: 1 80 | module_depth: -1 81 | top_modules: 1 82 | detailed: True 83 | output_file: # If None, the profiler prints to stdout.. 84 | 85 | # activation_checkpointing: 86 | # partition_activations: False 87 | # cpu_checkpointing: False 88 | # contiguous_memory_optimization: False 89 | # number_checkpoints: 90 | # synchronize_checkpoint_boundary: False 91 | # profile: False 92 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/deepspeed.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is a subset of the deepspeed hyperparameters. 2 | name: deepspeed 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | # Dynamo 8 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 9 | 10 | train_batch_size: ${train.batch_size} # can be "auto" 11 | train_micro_batch_size_per_gpu: 128 # can be "auto" 12 | 13 | optimizer: ${train.optim} 14 | gradient_clipping: 100 15 | # DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, 16 | # and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. 17 | 18 | # This scheduler is not quite the same as the schedulers called via huggingface. YMMV 19 | scheduler: 20 | type: WarmupDecayLR 21 | params: 22 | warmup_min_lr: 0 23 | warmup_max_lr: ${train.optim.lr} 24 | warmup_num_steps: ${train.warmup_steps} 25 | warmup_type: linear 26 | total_num_steps: ${train.steps} 27 | 28 | # communication_data_type: # this should be good in the default setting 29 | # prescale_gradients: False # this should be good in the default setting 30 | # gradient_predivide_factor: 1.0 31 | 32 | # Do not combine these with AMP: 33 | fp16: 34 | enabled: False # can be "auto" 35 | loss_scale: 0 36 | initial_scale_power: 32 37 | loss_scale_window: 1000 38 | hysteresis: 2 39 | min_loss_scale: 1 40 | 41 | # Do not combine this with fp16 or zero: 42 | # bf16: 43 | # enabled: False 44 | # amp: 45 | # enabled: False 46 | # opt_level: O1 47 | # # can draw more args from https://nvidia.github.io/apex/amp.html#apex.amp.initialize 48 | # 49 | 50 | zero_optimization: 51 | # stage 0, 1, 2, and 3 refer to 52 | # 0) disabled 53 | # 1) optimizer state partitioning 54 | # 2) optimizer+gradient state partitioning 55 | # 3) optimizer+gradient+parameter partitioning 56 | stage: 0 # [0|1|2|3] 57 | allgather_partitions: True # [true|false] # Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step 58 | allgather_bucket_size: 5e8 59 | overlap_comm: False # Attempts to overlap the reduction of the gradients with backward computation 60 | reduce_scatter: True # Uses reduce or reduce scatter instead of allreduce to average gradients 61 | reduce_bucket_size: 5e8 # Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes 62 | contiguous_gradients: True # Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. 
63 | grad_hooks: True 64 | 65 | # huggingface default is 2e8 for both reduce and all_grather buckets 66 | # both reduce and all_grather buckets can also be can be "auto" 67 | 68 | # Stage 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. 69 | # Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism) 70 | round_robin_gradients: False # [true|false] 71 | 72 | # Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. 73 | offload_param: 74 | device: cpu 75 | # nvme_path: /nvme 76 | pin_memory: True 77 | buffer_count: 5 78 | buffer_size: 1e8 79 | max_in_cpu: 1e9 80 | 81 | # Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. 82 | # This frees up GPU memory for larger models or batch sizes. Valid only with stage 2 and 83 | # Only include these options if stage=2 or higher: 84 | # offload_optimizer: 85 | # device: cpu 86 | # # nvme_path: /nvme 87 | # pin_memory: True 88 | # buffer_count: 89 | # 4 # Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. 90 | # # For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). 91 | # buffer_size: 1e8 92 | # fast_init: False # Enable fast optimizer initialization when offloading to NVMe. 93 | 94 | stage3_max_live_parameters: 1e9 # The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 95 | stage3_max_reuse_distance: 1e9 # Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. 96 | stage3_prefetch_bucket_size: 5e8 # can be "auto" # The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. 97 | stage3_param_persistence_threshold: 1e6 # can be "auto" # Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). 98 | 99 | sub_group_size: 1e12 100 | elastic_checkpoint: True # [true|false] 101 | stage3_gather_16bit_weights_on_model_save: False # [true|false] 102 | ignore_unused_parameters: False # [true|false] 103 | 104 | # aio: 105 | # block_size: 1048576 106 | # queue_depth: 8 107 | # thread_count: 1 108 | # single_submit: False 109 | # overlap_events: True 110 | 111 | steps_per_print: ${impl.print_loss_every_nth_step} 112 | wall_clock_breakdown: False 113 | dump_state: False 114 | 115 | flops_profiler: 116 | enabled: False 117 | profile_step: 1 118 | module_depth: -1 119 | top_modules: 1 120 | detailed: True 121 | output_file: # If None, the profiler prints to stdout.. 
122 | 123 | # activation_checkpointing: 124 | # partition_activations: False 125 | # cpu_checkpointing: False 126 | # contiguous_memory_optimization: False 127 | # number_checkpoints: 128 | # synchronize_checkpoint_boundary: False 129 | # profile: False 130 | 131 | tensorboard: 132 | enabled: False 133 | output_path: tensorboard_logs 134 | job_name: ${name} 135 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/onnx.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | -------------------------------------------------------------------------------- /bert/cramming/config/impl/save_losses_rho_loss.yaml: -------------------------------------------------------------------------------- 1 | # singl(ish) GPU, sane pytorch stuff 2 | name: torch-default 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | mixed_precision: True # turns on AMP on GPUs/Intel devices. The default precision needs to be float 8 | grad_scaling: True # Only activates when mixed_precision=True 9 | mixed_precision_target_dtype: float16 10 | 11 | saving_interval: 1000 12 | 13 | # Distributed training: 14 | zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2 15 | broadcast_buffers: False 16 | bucket_cap_mb: 25 17 | gradient_as_bucket_view: True 18 | static_graph: True 19 | 20 | # Misc: 21 | foreach_optimizer: False 22 | 23 | # Dynamo 24 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 25 | 26 | microbatch_size: 256 # dont make it larger than batch_size... 27 | rho_loss: True -------------------------------------------------------------------------------- /bert/cramming/config/impl/torch-default.yaml: -------------------------------------------------------------------------------- 1 | # singl(ish) GPU, sane pytorch stuff 2 | name: torch-default 3 | defaults: 4 | - _default 5 | - _self_ 6 | 7 | mixed_precision: True # turns on AMP on GPUs/Intel devices. 
The default precision needs to be float 8 | grad_scaling: True # Only activates when mixed_precision=True 9 | mixed_precision_target_dtype: float16 10 | 11 | # Distributed training: 12 | zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2 13 | broadcast_buffers: False 14 | bucket_cap_mb: 25 15 | gradient_as_bucket_view: True 16 | static_graph: True 17 | 18 | # Misc: 19 | foreach_optimizer: False 20 | 21 | # Dynamo 22 | optimizer_context: # can be: aot_autograd_speedup, nvfuser_global, aot_nvfuser 23 | 24 | rho_loss: False -------------------------------------------------------------------------------- /bert/cramming/config/piotr/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - task: pt 4 | 5 | # Experiment args 6 | mode: 'pt' 7 | device: gpu 8 | eval_only: false 9 | predict_only: false 10 | seed: 2137 11 | budget: 24 12 | 13 | model: 14 | name: 'google/t5-v1_1-base' 15 | checkpoint_path: '' 16 | dropout: 0.0 17 | random_init: true 18 | compile: false # Pytorch 2.0 19 | num_active_layers: -1 20 | 21 | data: 22 | input_length: 512 23 | mlm_probability: 0.15 24 | mean_noise_span_length: 3.0 25 | num_workers: 8 26 | dataset_name: 'c4' 27 | config_name: 'en' 28 | streaming: true 29 | 30 | optim: 31 | name: adamwscale 32 | base_lr: 2e-2 33 | batch_size: 144 34 | total_steps: 65536 35 | epochs: -1 # If it's > 0 it overwrites total_steps 36 | warmup_steps: 10000 37 | lr_scheduler: cosine-budget 38 | weight_decay: 0.0 39 | grad_clip: 1.0 40 | grad_acc: 2 41 | final_cosine: 1e-5 42 | 43 | stacking: 44 | enabled: true 45 | num_initial_layers: 3 46 | num_layers_to_add: 12 47 | scheduler: manual 48 | adjust_lr: false 49 | freeze_bottom_layers: false 50 | manual_scheduler: 51 | function: manual 52 | balance_factor: 1.0 53 | T_max_factor: 0.75 54 | copy_optim_states: false 55 | step_fractions: [0.125,0.3] 56 | doubling: true 57 | doubling_interpolation: false 58 | reset_optim: true 59 | 60 | eval: 61 | every_steps: 5000 62 | steps: 500 63 | eval_stacked_model: false 64 | 65 | checkpoint: 66 | every_steps: 1000 67 | start: 65000 68 | 69 | logging: 70 | wandb: true 71 | wandb_creds: 72 | name: 't5' 73 | project: 't5' 74 | entity: '' # change this optionally 75 | tags: 'baseline' 76 | every_steps: 100 77 | grad_l2: true 78 | weights_l2: true 79 | 80 | hydra: 81 | job: 82 | chdir: True 83 | run: 84 | dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S} 85 | -------------------------------------------------------------------------------- /bert/cramming/config/piotr/task/ft.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | mode: 'ft' 4 | 5 | data: 6 | max_seq_len: 1024 7 | max_target_len: 128 8 | max_num_instances_per_task: 100 9 | add_task_name: False 10 | add_task_definition: True 11 | num_pos_examples: 2 12 | num_neg_examples: 0 13 | add_explanation: False 14 | tk_instruct: False 15 | exec_file_path: ./nanoT5/utils/ni_dataset.py 16 | data_dir: ./data/splits/default 17 | task_dir: ./data/tasks 18 | 19 | optim: 20 | name: adamw 21 | base_lr: 5e-5 22 | batch_size: 8 23 | epochs: 2 24 | warmup_steps: 0 25 | lr_scheduler: constant 26 | weight_decay: 0.0 27 | grad_clip: 0.0 28 | grad_acc: 1 29 | 30 | checkpoint: 31 | start: 430000 32 | 33 | eval: 34 | steps: 200 35 | every_steps: 5000 36 | -------------------------------------------------------------------------------- /bert/cramming/config/piotr/task/pt.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /bert/cramming/config/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/cramming/config/train/__init__.py -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-base.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-base 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 11 | 12 | # steps: 13 | warmup_steps: 30_000 14 | cooldown_steps: 0 15 | steps: 600_000 # these are microbatch steps 16 | scheduler: budget-cosine-decay 17 | 18 | # Training settting: 19 | batch_size: 1536 20 | batch_size_ramp: 0 21 | 22 | gradient_clipping: 23 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 24 | 25 | objective: 26 | name: masked-lm 27 | mlm_probability: 0.15 28 | use_80_20_rule: True 29 | disable_mlm: False 30 | token_drop: 0.0 31 | reverse_dataset_order: False 32 | 33 | budget: ${budget} 34 | 35 | gradinit: 36 | enabled: False 37 | # eta: 1.0 38 | # tau: 1e-3 # step size 39 | # steps: 50 40 | # min_scale: 1e-3 41 | # max_scale: 1e3 42 | # step_type: sign-grad # sign-grad or grad 43 | # second_order: False 44 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-izsak.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-base 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 11 | 12 | optim: 13 | lr: 2e-3 14 | eps: 1e-6 15 | weight_decay: 0.01 16 | betas: 17 | - 0.9 18 | - 0.98 19 | 20 | # steps: 21 | warmup_steps: 0.06 # in percentage points 22 | cooldown_steps: 0 23 | steps: 600_000 # these are microbatch steps 24 | scheduler: budget-linear 25 | 26 | # Training settting: 27 | batch_size: 4096 # for mbs=128 28 | batch_size_ramp: 0 29 | 30 | gradient_clipping: 31 | pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain 32 | 33 | objective: 34 | name: masked-lm 35 | mlm_probability: 0.15 36 | use_80_20_rule: True 37 | disable_mlm: False 38 | token_drop: 0.0 39 | reverse_dataset_order: False 40 | 41 | budget: ${budget} 42 | 43 | gradinit: 44 | enabled: False 45 | # eta: 1.0 46 | # tau: 1e-3 # step size 47 | # steps: 50 48 | # min_scale: 1e-3 49 | # max_scale: 1e3 50 | # step_type: sign-grad # sign-grad or grad 51 | # second_order: False 52 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o1.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to 
separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o1 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 7e-4 12 | eps: 1e-12 13 | weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # steps: 18 | warmup_steps: 0 19 | cooldown_steps: 0 20 | steps: 600_000 # these are microbatch steps 21 | scheduler: budget-cosine-decay 22 | 23 | # Training settting: 24 | batch_size: 1536 25 | batch_size_ramp: 0 26 | 27 | gradient_clipping: 28 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 29 | 30 | objective: 31 | name: masked-lm 32 | mlm_probability: 0.15 33 | use_80_20_rule: True 34 | disable_mlm: False 35 | token_drop: 0.0 36 | reverse_dataset_order: False 37 | 38 | budget: ${budget} 39 | 40 | gradinit: 41 | enabled: False 42 | # eta: 1.0 43 | # tau: 1e-3 # step size 44 | # steps: 50 45 | # min_scale: 1e-3 46 | # max_scale: 1e3 47 | # step_type: sign-grad # sign-grad or grad 48 | # second_order: False 49 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o2.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o2 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 1e-3 12 | eps: 1e-12 13 | weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # steps: 18 | warmup_steps: 0 19 | cooldown_steps: 0 20 | steps: 600000 # these are microbatch steps 21 | scheduler: budget-one-cycle 22 | 23 | # Training settting: 24 | batch_size: 1536 25 | batch_size_ramp: 300000 26 | 27 | gradient_clipping: 28 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 29 | 30 | objective: 31 | name: masked-lm 32 | mlm_probability: 0.15 33 | use_80_20_rule: True 34 | disable_mlm: False 35 | token_drop: 0.0 36 | reverse_dataset_order: False 37 | 38 | budget: ${budget} 39 | 40 | gradinit: 41 | enabled: False 42 | # eta: 1.0 43 | # tau: 1e-3 # step size 44 | # steps: 50 45 | # min_scale: 1e-3 46 | # max_scale: 1e3 47 | # step_type: sign-grad # sign-grad or grad 48 | # second_order: False 49 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-o3.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-o3 5 | 6 | defaults: 7 | - optim: adam 8 | - optim_mod: disabled 9 | 10 | #optim: 11 | # lr: 1e-3 12 | # eps: 1e-12 13 | # weight_decay: 0.01 14 | 15 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 16 | 17 | # data 18 | validation_set: 19 | enabled: True 20 | fraction: 0.005 21 | seed: 0 22 | il_model: False 23 | truncate_to: 10000 24 | 25 | # Set to integer value to truncate the dataset to this many sequences. 
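# e.g. (illustrative): truncate_train_dataset: 1_000_000 would keep only the first 1M tokenized sequences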
26 | truncate_train_dataset: 27 | 28 | # steps: 29 | warmup_steps: 0 30 | cooldown_steps: 0 31 | steps: 2000000 # these are microbatch steps 32 | scheduler: budget-one-cycle-seconds 33 | 34 | # Training settting: 35 | batch_size: 1536 36 | batch_size_ramp: 0 37 | gradient_clipping: 0.5 38 | pretrain_in_train_mode: False # default BERT trains with dropout layers enabled in pretrain 39 | 40 | objective: 41 | name: masked-lm 42 | mlm_probability: 0.15 43 | use_80_20_rule: True 44 | disable_mlm: False 45 | token_drop: 0.0 46 | reverse_dataset_order: False 47 | 48 | budget: ${budget} 49 | 50 | stacking: 51 | enabled: false 52 | num_initial_layers: 4 53 | num_layers_to_add: 12 54 | scheduler: manual 55 | adjust_lr: false 56 | freeze_bottom_layers: false 57 | manual_scheduler: 58 | function: manual 59 | balance_factor: 1.0 60 | T_max_factor: 0.75 61 | copy_optim_states: false 62 | step_fractions: [0.125, 0.3] 63 | doubling: true 64 | doubling_interpolation: false 65 | reset_optim: true 66 | 67 | track_forward_pass_only: true 68 | 69 | rho_loss: 70 | mega_batch_size: 15360 71 | il_losses_path: /home/jean/stackbert/outputs/examples_to_loss 72 | 73 | sb: 74 | scale: 1.0 75 | 76 | sophia: 77 | batch_size_hess_update: 768 78 | hess_update_frequency: 10 79 | free_updates: False 80 | 81 | gradinit: 82 | enabled: False 83 | # eta: 1.0 84 | # tau: 1e-3 # step size 85 | # steps: 50 86 | # min_scale: 1e-3 87 | # max_scale: 1e3 88 | # step_type: sign-grad # sign-grad or grad 89 | # second_order: False 90 | # sequence_curriculum: 91 | # lengths: [8,16,32,64,128] 92 | # triggers: [0.1,0.2,0.3,0.5,0.75] 93 | # unfold: False 94 | 95 | # weight_averaging: 96 | # type: EMA 97 | # frequency: 1 98 | # momentum: 0.995 # only for EMA 99 | # last_k: 10 100 | 101 | # CU1: +train.sequence_curriculum.lengths=[8,16,32,64,128] +train.sequence_curriculum.triggers=[0.1,0.2,0.3,0.5,0.75] +train.sequence_curriculum.unfold=False 102 | # CU2: +train.sequence_curriculum.lengths=[8,16,32,64,128] +train.sequence_curriculum.triggers=[0.1,0.2,0.3,0.5,0.75] +train.sequence_curriculum.unfold=True 103 | 104 | # LAWA: +train.weight_averaging.frequency=5000 +train.weight_averaging.type=LAWA +train.weight_averaging.last_k=10 105 | # EMA: +train.weight_averaging.frequency=1 +train.weight_averaging.type=EMA +train.weight_averaging.momentum=0.995 106 | -------------------------------------------------------------------------------- /bert/cramming/config/train/bert-original.yaml: -------------------------------------------------------------------------------- 1 | # Basic hyperparameter for normal BERT pretraining 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters 3 | 4 | name: bert-original 5 | 6 | defaults: 7 | - optim: adam_classic 8 | - optim_mod: disabled 9 | 10 | optim: 11 | lr: 1e-4 12 | 13 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight] # no weight decay for these layers 14 | 15 | # steps: 16 | warmup_steps: 80_000 # These are microbatch steps 17 | cooldown_steps: 0 18 | steps: 8_000_000 # These are microbatch steps at bs=64. 
The original 1mio steps for BERT are recovered with 512/64=8 19 | scheduler: polynomial-decay 20 | 21 | # Training settting: 22 | batch_size: 512 23 | batch_size_ramp: 0 24 | 25 | gradient_clipping: 26 | pretrain_in_train_mode: True # default BERT trains with dropout layers 27 | 28 | objective: 29 | name: masked-lm 30 | mlm_probability: 0.15 31 | use_80_20_rule: True 32 | disable_mlm: False 33 | token_drop: 0.0 34 | reverse_dataset_order: False 35 | 36 | budget: ${budget} 37 | 38 | gradinit: 39 | enabled: False 40 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adafactor.yaml: -------------------------------------------------------------------------------- 1 | type: Adafactor 2 | 3 | lr: 0.001 4 | eps: 5 | - 1e-30 6 | - 0.001 7 | clip_threshold: 1.0 8 | decay_rate: -0.8 9 | beta1: 10 | weight_decay: 0.0 11 | scale_parameter: False 12 | relative_step: False 13 | warmup_init: False 14 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adahessian.yaml: -------------------------------------------------------------------------------- 1 | type: AdaHessian 2 | 3 | lr: 0.15 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.01 9 | hessian_power: 1.0 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adam.yaml: -------------------------------------------------------------------------------- 1 | type: AdamW 2 | 3 | lr: 1e-3 4 | betas: 5 | - 0.9 6 | - 0.98 7 | weight_decay: 0.01 8 | amsgrad: False 9 | eps: 1e-12 -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/adam_classic.yaml: -------------------------------------------------------------------------------- 1 | type: Adam 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.999 7 | eps: 1e-8 8 | weight_decay: 0.01 9 | amsgrad: False 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/lion.yaml: -------------------------------------------------------------------------------- 1 | type: Lion 2 | 3 | lr: 1e-4 4 | betas: 5 | - 0.9 6 | - 0.99 7 | # use 0.95, 0.98 if unstable 8 | weight_decay: 0.1 9 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/radam.yaml: -------------------------------------------------------------------------------- 1 | type: RAdam 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.98 7 | eps: 1e-6 8 | weight_decay: 0.01 9 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/sgd.yaml: -------------------------------------------------------------------------------- 1 | type: SGD 2 | 3 | lr: 0.0005 4 | momentum: 0.9 5 | dampening: 0.0 6 | weight_decay: 0.01 7 | nesterov: True 8 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/shampoo.yaml: -------------------------------------------------------------------------------- 1 | type: Shampoo 2 | 3 | lr: 0.0005 4 | betas: 5 | - 0.9 6 | - 0.98 7 | epsilon: 1e-6 8 | use_bias_correction: True 9 | adam_w_mode: True 10 | weight_decay: 0.01 11 | grafting_type: 4 12 | grafting_epsilon: 1e-12 13 | grafting_beta2: 0.98 14 | 15 | root_inv_dist: False 16 | # update_freq (int): frequency for updating inverse preconditioner (Default: 100) 17 | # init_delay (int): 
initial delay before starting to compute root inverse (Default: 1000) 18 | # threshold (int): threshold for switching to diagonal preconditioner (Default: 1024) 19 | # preconditioner_dtype (torch.dtype): data type for preconditioner (Default: torch.float) 20 | # large_dim_method (LargeDimMethod): method for handling large scale tensors. (Default: LargeDimMethod.BLOCKING) 21 | # root_inv_dist (bool): distributes root inverse computation across multiple GPU workers (Default: True) 22 | # use_merge_dims (bool): merge dimensions if possible while respecting threshold. (Default: True) 23 | # grafting_type (GraftingType): Selects grafting method. (Default: GraftingType.ADAGRAD) 24 | # grafting_epsilon (float): Epsilon for grafting method. (Default: 1e-3) 25 | # grafting_beta2 (float): Exponential moving average factor for grafting method. (Default: 1.0) 26 | 27 | # class PreconditionerType(enum.Enum): 28 | # FULL = 0 29 | # DIAGONAL = 1 30 | # 31 | # 32 | # class GraftingType(enum.Enum): 33 | # NONE = 0 34 | # SGD = 1 35 | # ADAGRAD = 2 36 | # RMSPROP = 3 37 | # ADAM = 4 38 | # 39 | # 40 | # class LargeDimMethod(enum.Enum): 41 | # DIAGONAL = 0 42 | # ADAGRAD = 1 43 | # BLOCKING = 2 44 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim/sophiag.yaml: -------------------------------------------------------------------------------- 1 | type: SophiaG 2 | lr: 1e-3 3 | weight_decay: 0.01 4 | rho: 0.04 5 | bs: 196608 # 128 * 1536 -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/disabled.yaml: -------------------------------------------------------------------------------- 1 | name: none 2 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/larc.yaml: -------------------------------------------------------------------------------- 1 | name: LARC 2 | 3 | trust_coefficient: 0.02 4 | clip: True 5 | eps: 1e-8 6 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/lars.yaml: -------------------------------------------------------------------------------- 1 | name: LARS 2 | 3 | trust_coefficient: 0.02 4 | clip: False 5 | eps: 1e-8 6 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/progressive.yaml: -------------------------------------------------------------------------------- 1 | name: progressive-batching 2 | 3 | progress_rule: norm-based 4 | 5 | monotone: False 6 | theta: 0.9 7 | 8 | min_sample_guard: 2 9 | max_sample_guard: 128 10 | -------------------------------------------------------------------------------- /bert/cramming/config/train/optim_mod/sam.yaml: -------------------------------------------------------------------------------- 1 | name: SAM 2 | rho: 0.05 3 | -------------------------------------------------------------------------------- /bert/cramming/config/wandb/default.yaml: -------------------------------------------------------------------------------- 1 | enabled: True 2 | entity: "" # change this optionally 3 | project: cramming-pretrain 4 | tags: [] 5 | # If set, resume from the given wandb id. 
6 | resume: 7 | -------------------------------------------------------------------------------- /bert/cramming/config/wandb/none.yaml: -------------------------------------------------------------------------------- 1 | enabled: False 2 | entity: 3 | project: 4 | tags: [] 5 | -------------------------------------------------------------------------------- /bert/cramming/data/__init__.py: -------------------------------------------------------------------------------- 1 | """This module handles and hides the data away ;)""" 2 | 3 | from .downstream_task_preparation import prepare_task_dataloaders 4 | from .pretraining_preparation import load_pretraining_corpus 5 | -------------------------------------------------------------------------------- /bert/cramming/data/cached_datasets.py: -------------------------------------------------------------------------------- 1 | """Write a PyTorch dataset into RAM.""" 2 | 3 | import logging 4 | 5 | import torch 6 | import transformers 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def lookup_dtype(vocab_size): 12 | if vocab_size < 2**8: 13 | dtype = torch.uint8 14 | # would really be neat to have uint16 here between the BERT and GPT encoding sizes 15 | elif vocab_size < 2**16 // 2: 16 | dtype = torch.int16 17 | elif vocab_size < 2**32 // 2: 18 | dtype = torch.int32 19 | else: 20 | dtype = torch.int64 21 | return dtype 22 | 23 | 24 | class CachedDataset(torch.utils.data.Dataset): 25 | """Cache a given dataset into RAM or SDRAM (GPU memory). 26 | 27 | This is only a good idea if you have enough RAM, especially if mapping into SDRAM. 28 | """ 29 | 30 | def __init__(self, dataset, seq_length, vocab_size, num_workers=0, target_device=torch.device("cpu")): 31 | """Initialize with a given pytorch dataset. The setup dictionary determines cache location and storage type.""" 32 | self.dataset = dataset 33 | log.info("Caching started ...") 34 | batch_size = min(len(dataset), 2048) 35 | cacheloader = torch.utils.data.DataLoader( 36 | dataset, 37 | batch_size=batch_size, 38 | shuffle=False, 39 | drop_last=False, 40 | num_workers=num_workers, 41 | pin_memory=False, 42 | collate_fn=transformers.data.data_collator.torch_default_data_collator, 43 | ) 44 | self.dataset_keys = list(dataset[0].keys()) 45 | seq_lengths = [len(dataset[0][k]) for k in self.dataset_keys] 46 | assert all([length == seq_lengths[0] for length in seq_lengths]) 47 | 48 | # Allocate memory: 49 | pin = target_device == torch.device("cpu") and torch.cuda.is_available() 50 | cache_setup = dict(device=target_device, dtype=lookup_dtype(vocab_size), pin_memory=pin) 51 | self.cache = torch.empty((len(self.dataset), seq_length * 4), **cache_setup) 52 | 53 | pointer = 0 54 | for data in cacheloader: 55 | batch_length = data[self.dataset_keys[0]].shape[0] 56 | data_block = torch.cat([d.to(cache_setup["dtype"]) for d in data.values()], dim=1) 57 | self.cache[pointer : pointer + batch_length] = data_block 58 | pointer += batch_length 59 | 60 | self.cache = self.cache.contiguous() 61 | log.info(f'Dataset successfully cached into {"RAM" if target_device == torch.device("cpu") else "SDRAM"}.') 62 | 63 | def __getitem__(self, index): 64 | """Get sample, target from cache.""" 65 | sample_data_block = self.cache[index] 66 | sample_dict = dict(zip(self.dataset_keys, torch.chunk(sample_data_block, len(self.dataset_keys), dim=-1))) 67 | return sample_dict 68 | 69 | def __len__(self): 70 | """Length is length of self.dataset.""" 71 | return len(self.dataset) 72 | 73 | def __getattr__(self, name): 74 | """This 
is only called if all attributes of Subset are exhausted.""" 75 | return getattr(self.dataset, name) 76 | -------------------------------------------------------------------------------- /bert/cramming/data/curriculum_sorting.py: -------------------------------------------------------------------------------- 1 | """Baseline curricula.""" 2 | import logging 3 | 4 | import numpy as np 5 | import torch 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num_threads=1, ngram=1, reverse=False): 11 | # Force unigram counts per token: 12 | map_setup = dict( 13 | batched=True, 14 | batch_size=1024, 15 | # num_proc=None, # have to reimplement counting as in-out instead of side effects for this to work. Lets see how slow num_proc=0 is 16 | load_from_cache_file=False, 17 | # keep_in_memory=True, 18 | ) 19 | 20 | unigrams_counts_per_token = np.zeros(tokenizer.vocab_size, dtype=np.int64) 21 | 22 | def count_unigrams(examples): 23 | nonlocal unigrams_counts_per_token 24 | unigrams_counts_per_token += np.bincount(np.asarray(examples["input_ids"]).reshape(-1), minlength=tokenizer.vocab_size) 25 | 26 | tokenized_dataset.map(count_unigrams, desc="Counting token unigrams", **map_setup, num_proc=None) 27 | 28 | token_count = sum(unigrams_counts_per_token) 29 | k = 1 30 | k_smoothed_probs = (unigrams_counts_per_token + k) / (token_count + k * tokenizer.vocab_size) 31 | log2_probs = np.log2(k_smoothed_probs) 32 | 33 | def return_seq_prob(examples): 34 | # seq_counts = np.apply_along_axis(np.bincount, axis=1, arr=np.asarray(examples["input_ids"]), minlength=tokenizer.vocab_size) 35 | # seq_counts = (np.asarray(examples["input_ids"])[:, :,None] == np.arange(0, tokenizer.vocab_size)[None, None, :]).sum(axis=1) # slower so far 36 | # logprob_scores = (log2_probs * seq_counts).sum(axis=1) / tokenizer.model_max_length 37 | # why make hard when can do easy? 
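# Fancy indexing: log2_probs[input_ids] maps each token id in the (batch, seq_len) array to its smoothed log2-probability; summing over axis=1 and dividing by the fixed model_max_length gives a length-normalized log-probability per sequence (more negative = rarer tokens on average).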
38 | logprob_scores = log2_probs[np.asarray(examples["input_ids"])].sum(axis=1) / tokenizer.model_max_length 39 | return dict(scores=logprob_scores) 40 | 41 | dataset_probs = tokenized_dataset.map( 42 | return_seq_prob, 43 | desc="Computing log probs per sequence", 44 | remove_columns=tokenized_dataset.column_names, 45 | **map_setup, 46 | num_proc=num_threads if num_threads > 0 else None, 47 | ) 48 | 49 | new_order = np.argsort(np.asarray(dataset_probs["scores"])) 50 | 51 | if reverse: 52 | new_order = new_order[::-1] 53 | 54 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 55 | 56 | 57 | def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, target_token_id, num_threads=1): 58 | map_setup = dict( 59 | batched=True, 60 | batch_size=1024, 61 | num_proc=num_threads if num_threads > 0 else None, 62 | load_from_cache_file=False, 63 | # keep_in_memory=True, 64 | ) 65 | 66 | def count_token(examples): 67 | return dict(counts=(np.asarray(examples["input_ids"]) == target_token_id).sum(axis=1)) 68 | 69 | dataset_counts = tokenized_dataset.map( 70 | count_token, 71 | desc=f"Counting occurrences of token {tokenizer.decode(target_token_id)}", 72 | remove_columns=tokenized_dataset.column_names, 73 | **map_setup, 74 | ) 75 | 76 | new_order = np.argsort(np.asarray(dataset_counts["counts"]))[::-1] 77 | 78 | # Print sentence with most occurrences: 79 | sentence_idx = int(new_order[0]) 80 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 81 | dataset_size = len(tokenized_dataset) 82 | 83 | log.info("Sentence with most occurrences of token ...") 84 | log.info(tokenizer.batch_decode(input_data[None])[0]) 85 | 86 | sentence_idx = int(new_order[-1]) 87 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 88 | dataset_size = len(tokenized_dataset) 89 | 90 | log.info("Sentence with least occurrences of token ...") 91 | log.info(tokenizer.batch_decode(input_data[None])[0]) 92 | 93 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 94 | 95 | 96 | def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer, num_threads=1): 97 | map_setup = dict( 98 | batched=True, 99 | batch_size=1024, 100 | num_proc=num_threads if num_threads > 0 else None, 101 | load_from_cache_file=False, 102 | # keep_in_memory=True, 103 | ) 104 | 105 | def count_word_lengths(examples): 106 | return dict(lengths=[len(s) for s in tokenizer.batch_decode(torch.as_tensor(examples["input_ids"]))]) 107 | 108 | dataset_counts = tokenized_dataset.map( 109 | count_word_lengths, 110 | desc="Counting word lengths per sequence", 111 | remove_columns=tokenized_dataset.column_names, 112 | **map_setup, 113 | ) 114 | 115 | new_order = np.argsort(np.asarray(dataset_counts["lengths"])) # shortest sentences first 116 | 117 | # Print sentence with shortest length 118 | sentence_idx = int(new_order[0]) 119 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 120 | dataset_size = len(tokenized_dataset) 121 | 122 | log.info("Sentence with shortest length ...") 123 | log.info(tokenizer.batch_decode(input_data[None])[0]) 124 | 125 | sentence_idx = int(new_order[-1]) 126 | input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze() # squeeze because hf has leading dim 127 | dataset_size = len(tokenized_dataset) 128 | 129 | log.info("and longest ...") 130 | 
log.info(tokenizer.batch_decode(input_data[None])[0]) 131 | 132 | return tokenized_dataset.select(indices=new_order, writer_batch_size=1024) 133 | -------------------------------------------------------------------------------- /bert/cramming/data/downstream_task_preparation.py: -------------------------------------------------------------------------------- 1 | """Prepare downstream tasks evaluations.""" 2 | import logging 3 | import os 4 | from collections import defaultdict 5 | 6 | import datasets 7 | import torch 8 | from datasets import load_dataset 9 | 10 | from ..backend.utils import prepare_downstream_dataloader 11 | from .pretraining_preparation import main_process_first 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def get_sentences(examples, sentence_keys): 17 | return tuple(examples[key] for key in sentence_keys if key is not None) 18 | 19 | 20 | def prepare_task_dataloaders(tokenizer, cfg_eval, cfg_impl): 21 | """Load all datasets in eval.tasks for finetuning and testing.""" 22 | cfg_eval.path = os.path.expanduser(cfg_eval.path) 23 | datasets.enable_caching() # We can cache these 24 | max_seq_length = cfg_eval.max_seq_length 25 | tasks = defaultdict(dict) 26 | 27 | for task_name, task_details in cfg_eval.tasks.items(): 28 | log.info(f"Preparing data for task {task_name}.") 29 | tasks[task_name]["details"] = task_details 30 | raw_datasets = load_dataset(task_details.collection, task_name, cache_dir=cfg_impl.path) 31 | if not task_details.regression: 32 | if "label" in task_details and task_details.label is not None and len(task_details.label) > 0: 33 | label_list = [task_details.label] 34 | else: 35 | label_list = raw_datasets["train"].features["label"].names 36 | log.info(f"{task_name} has classes {label_list}.") 37 | tasks[task_name]["num_classes"] = len(label_list) 38 | else: 39 | tasks[task_name]["num_classes"] = 1 40 | label_list = None 41 | sentence_keys = task_details.structure 42 | 43 | def preprocess_function(examples): 44 | texts = get_sentences(examples, sentence_keys) 45 | result = tokenizer( 46 | *texts, 47 | max_length=max_seq_length, 48 | truncation=True, 49 | pad_to_multiple_of=cfg_impl.pad_to_multiple_of, 50 | ) 51 | 52 | if "label" in examples: 53 | result["labels"] = examples["label"] 54 | if task_name == "multirc": 55 | result["p_idx"] = [ex["paragraph"] for ex in examples["idx"]] 56 | result["q_idx"] = [ex["question"] for ex in examples["idx"]] 57 | result["a_idx"] = [ex["answer"] for ex in examples["idx"]] 58 | return result 59 | 60 | with main_process_first(): 61 | processed_datasets = raw_datasets.map( 62 | preprocess_function, 63 | batched=True, 64 | batch_size=1024, 65 | load_from_cache_file=True, 66 | remove_columns=raw_datasets["train"].column_names, 67 | desc="Running tokenizer on dataset", 68 | ) 69 | 70 | train_dataset = processed_datasets["train"] 71 | train_dataset.set_format("torch") 72 | assert cfg_eval.evaluation_set in ["validation", "test"] 73 | eval_dataset = processed_datasets[f"{cfg_eval.evaluation_set}_matched" if task_name == "mnli" else cfg_eval.evaluation_set] 74 | eval_dataset.set_format("torch") 75 | if task_name == "mnli": 76 | # Extra task loader for MNLI 77 | extra_eval_dataset = processed_datasets[f"{cfg_eval.evaluation_set}_mismatched"] 78 | extra_eval_dataset.set_format("torch") 79 | else: 80 | extra_eval_dataset = None 81 | 82 | train_dataloader, eval_dataloader, extra_eval_dataloader = _build_dataloaders( 83 | tokenizer, 84 | train_dataset, 85 | eval_dataset, 86 | extra_eval_dataset, 87 | cfg_impl, 88 | ) 89 | 
90 | tasks[task_name]["trainloader"] = train_dataloader 91 | tasks[task_name]["validloader"] = eval_dataloader 92 | tasks[task_name]["extra_validloader"] = extra_eval_dataloader 93 | 94 | # Log overviews so we always know what's going on with weird tokenization tricks 95 | random_sentence_idx = torch.randint(0, len(train_dataset), (1,)).item() 96 | input_data = train_dataset[random_sentence_idx]["input_ids"].squeeze() # squeeze because hf has leading dim 97 | 98 | log.info(f"Random sentence with seq_length {tokenizer.model_max_length} from trainset of size {len(train_dataset):,}: ...") 99 | log.info(tokenizer.batch_decode(input_data[None])[0]) 100 | log.info("... is tokenized into ...") 101 | log.info("_".join(tokenizer.decode(t) for t in input_data)) 102 | if label_list is not None: 103 | log.info(f"Correct Answer: {label_list[train_dataset[random_sentence_idx]['labels']]}") 104 | else: 105 | log.info(f"Correct Answer: {train_dataset[random_sentence_idx]['labels']}") 106 | random_sentence_idx = torch.randint(0, len(eval_dataset), (1,)).item() 107 | input_data = eval_dataset[random_sentence_idx]["input_ids"].squeeze() # squeeze because hf has leading dim 108 | 109 | log.info(f"Random sentence from validset of size {len(eval_dataset):,}: ...") 110 | log.info(tokenizer.batch_decode(input_data[None])[0]) 111 | if label_list is not None: 112 | log.info(f"Correct Answer: {label_list[eval_dataset[random_sentence_idx]['labels']]}") 113 | else: 114 | log.info(f"Correct Answer: {eval_dataset[random_sentence_idx]['labels']}") 115 | 116 | return tasks 117 | 118 | 119 | def _build_dataloaders(tokenizer, train_dataset, eval_dataset, extra_eval_dataset, cfg_impl): 120 | """Construct dataloaders according to cfg_impl settings. Validation samplers always repeat on all devices.""" 121 | train_dataloader = prepare_downstream_dataloader(train_dataset, tokenizer, "training", cfg_impl) 122 | eval_dataloader = prepare_downstream_dataloader(eval_dataset, tokenizer, "eval", cfg_impl) 123 | if extra_eval_dataset is not None: 124 | extra_eval_dataloader = prepare_downstream_dataloader(extra_eval_dataset, tokenizer, "eval", cfg_impl) 125 | else: 126 | extra_eval_dataloader = None 127 | return train_dataloader, eval_dataloader, extra_eval_dataloader 128 | -------------------------------------------------------------------------------- /bert/cramming/data/utils.py: -------------------------------------------------------------------------------- 1 | """Various utilities.""" 2 | import hashlib 3 | import json 4 | import logging 5 | import os 6 | import shutil 7 | import time 8 | 9 | import datasets 10 | from omegaconf import OmegaConf 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def checksum_config(cfg): 16 | """This is more annoying that I thought it would be. But a json-dump of the config file is hashed and used as checksum.""" 17 | bindump = json.dumps(OmegaConf.to_container(cfg, resolve=True), sort_keys=True).encode("utf-8") 18 | checksum_of_config = hashlib.md5(bindump).hexdigest() 19 | if "tokenizer" in cfg and "vocab_size" in cfg: 20 | checksum_of_config = f"{cfg.tokenizer}x{cfg.vocab_size}_{checksum_of_config}" 21 | return checksum_of_config 22 | 23 | 24 | def stage_dataset(data_directory_path, local_staging_dir): 25 | """This is a mess because our network drives are a mess. 
You might not need this.""" 26 | data_directory_name = os.path.basename(data_directory_path) 27 | new_path = os.path.join(local_staging_dir, data_directory_name) 28 | if os.path.isdir(data_directory_path): 29 | try: 30 | if not os.path.isdir(new_path): 31 | try: 32 | shutil.copytree(data_directory_path, new_path) 33 | log.info(f"Staging dataset to {new_path}...") 34 | except FileExistsError: 35 | log.info(f"Concurrent writing to {new_path} detected. Stopping staging in this run and waiting for 300 seconds.") 36 | time.sleep(300) 37 | else: 38 | log.info(f"Using staged dataset found at {new_path}...") 39 | 40 | for retries in range(15): 41 | _, _, free = shutil.disk_usage(new_path) 42 | used = _get_size(new_path) 43 | try: 44 | tokenized_dataset = datasets.load_from_disk(new_path) 45 | log.info(f"Staged dataset size is {used / 1024**3:,.3f}GB. {free/ 1024**3:,.3f}GB free in staging dir.") 46 | return new_path 47 | except FileNotFoundError: 48 | log.info( 49 | f"Staged dataset is incomplete. Size is {used / 1024**3:,.3f}GB. " 50 | f" Waiting for 60 more secs for staging race condition." 51 | ) 52 | time.sleep(60) 53 | log.info(f"Staging dataset corrupted. Falling back to network drive location {data_directory_path}") 54 | return data_directory_path 55 | 56 | except Exception as e: # noqa 57 | log.info(f"Staging failed with error {e}. Falling back to network drive location {data_directory_path}") 58 | return data_directory_path 59 | else: 60 | raise FileNotFoundError(f"Dataset not yet generated or not found at {data_directory_path}.") 61 | 62 | 63 | def _get_size(start_path="."): 64 | """Compute the size of a directory path. Why is this not in the standard library?""" 65 | """Stolen from https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python""" 66 | total_size = 0 67 | for dirpath, dirnames, filenames in os.walk(start_path): 68 | for f in filenames: 69 | fp = os.path.join(dirpath, f) 70 | # skip if it is symbolic link 71 | if not os.path.islink(fp): 72 | total_size += os.path.getsize(fp) 73 | return total_size 74 | -------------------------------------------------------------------------------- /bert/efficient_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/efficient_training/__init__.py -------------------------------------------------------------------------------- /bert/efficient_training/extract_il_losses.py: -------------------------------------------------------------------------------- 1 | """Extracts the RHO-Loss irreducible losses from a model.""" 2 | import logging 3 | import os 4 | import pickle 5 | import time 6 | from collections import defaultdict 7 | from typing import Optional 8 | 9 | import hydra 10 | import numpy as np 11 | import torch 12 | import wandb 13 | from tqdm import tqdm 14 | 15 | import cramming 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | def save_chunk(chunk_data: dict, chunk_number: int, path: str) -> None: 21 | with open(os.path.join(path, f"dict_chunk_{chunk_number}.pkl"), "wb") as file: 22 | pickle.dump(chunk_data, file) 23 | 24 | 25 | def get_example_ids_from_batch(examples_counter, len_batch: int, len_dataset: Optional[int] = None) -> list[int]: 26 | example_ids = examples_counter + np.arange(len_batch) 27 | example_ids = example_ids.tolist() 28 | if len_dataset is not None: 29 | example_ids = [example_id % len_dataset for example_id in example_ids] 30 
| return example_ids 31 | 32 | 33 | def save_losses_of_il_model(cfg, setup): 34 | """This function controls the central training loop.""" 35 | tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg) 36 | model = cramming.construct_model(cfg.arch, tokenizer.vocab_size) 37 | train_set, validation_set, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.train) 38 | model_engine, _, _, train_dataloader, validation_dataloader = cramming.load_backend( 39 | model, 40 | train_set, 41 | validation_set, 42 | tokenizer, 43 | cfg.train, 44 | cfg.impl, 45 | setup=setup, 46 | ) 47 | model_engine.load_checkpoint(cfg_arch, model_file) 48 | model_engine.eval() 49 | iterable_data = enumerate(tqdm(train_dataloader)) 50 | path = os.path.join(cfg.base_dir, cfg.name, "examples_to_loss") 51 | os.makedirs(path, exist_ok=True) 52 | log.info(f"Saving losses of IL model to {path}") 53 | train_time = time.time() # Crude time measurement for print_loss_every_nth_step 54 | stats = defaultdict(list) 55 | 56 | # Launch training 57 | examples_to_loss_dict = {} 58 | chunk_counter = 0 59 | example_ids = [] 60 | examples_counter = 0 61 | with torch.no_grad(): 62 | for step, batch in iterable_data: 63 | # Heavy lifting is moved to engines 64 | example_ids_in_batch = get_example_ids_from_batch(examples_counter, len(batch["input_ids"])) 65 | example_ids.extend(example_ids_in_batch) 66 | device_batch = model_engine.to_device(batch) 67 | examples_counter += len(batch["input_ids"]) 68 | with torch.autocast(**model_engine.amp_settings): 69 | losses = model_engine.model.forward_all_losses(**device_batch) 70 | examples_to_loss_dict.update(dict(zip(example_ids_in_batch, losses))) 71 | if step > 0 and step % cfg.impl.saving_interval == 0: 72 | examples_to_loss_dict = {k: v.cpu().tolist() for k, v in examples_to_loss_dict.items()} 73 | save_chunk(examples_to_loss_dict, chunk_counter, path) 74 | examples_to_loss_dict = {} # free up RAM 75 | chunk_counter += 1 76 | if step > 0 and step % cfg.impl.print_loss_every_nth_step == 0: 77 | stats["train_time"] += [(time.time() - train_time) / cfg.impl.print_loss_every_nth_step] 78 | stats["step"] += [step] 79 | stats["examples_counter"] += [examples_counter] 80 | train_time = time.time() 81 | wandb.log({k: v[-1] for k, v in stats.items()}, step=stats["step"][-1] if "step" in stats else None) 82 | examples_to_loss_dict = {k: v.cpu().tolist() for k, v in examples_to_loss_dict.items()} 83 | save_chunk(examples_to_loss_dict, chunk_counter, path) 84 | stats["train_time"] += [(time.time() - train_time) / cfg.impl.print_loss_every_nth_step] 85 | stats["step"] += [step] 86 | stats["examples_counter"] += [examples_counter] 87 | wandb.log({k: v[-1] for k, v in stats.items()}, step=stats["step"][-1] if "step" in stats else None) 88 | 89 | 90 | @hydra.main(config_path="../cramming/config", config_name="cfg_save_losses", version_base="1.1") 91 | def launch(cfg): 92 | cramming.utils.main_launcher(cfg, save_losses_of_il_model, job_name="save_losses") 93 | 94 | 95 | if __name__ == "__main__": 96 | launch() 97 | -------------------------------------------------------------------------------- /bert/efficient_training/layer_drop.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def sample_active_layers(seconds: int, step: int, cfg) -> tuple[list[int], float]: 7 | total_layers = cfg.arch.num_transformer_layers 8 | max_drop_prob = _get_drop_prob(seconds, step, cfg) 9 | 10 | active_layers: 
list[int] = [] 11 | for layer_i in range(0, total_layers): 12 | layer_drop_prob = max_drop_prob / total_layers * (layer_i + 1) 13 | if torch.bernoulli(torch.tensor(1.0 - layer_drop_prob)): 14 | active_layers.append(layer_i) 15 | return active_layers, max_drop_prob 16 | 17 | 18 | def _get_drop_prob(seconds: int, step: int, cfg) -> float: 19 | if cfg.budget == "steps": 20 | t = step 21 | T = cfg.train.steps 22 | else: 23 | budget_seconds = cfg.budget * 60 * 60 24 | t = seconds 25 | T = budget_seconds 26 | gamma = cfg.arch.layer_drop.gamma_factor / T 27 | min_theta = cfg.arch.layer_drop.max_theta 28 | return 1 - (min_theta + (1 - min_theta) * math.exp(-gamma * t)) 29 | -------------------------------------------------------------------------------- /bert/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "ntng-bert" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jean Kaddour"] 6 | readme = "README.md" 7 | packages = [ 8 | { include = "cramming" }, 9 | { include = "efficient_training" }, 10 | { include = "rst" }, 11 | ] 12 | 13 | [tool.poetry.dependencies] 14 | python = "~3.10" 15 | hydra-core = ">=1.1" 16 | torch = "~1.13+cu117" 17 | datasets = "^2.13.1" 18 | tokenizers = "^0.13.3" 19 | transformers = "^4.30.2" 20 | evaluate = "^0.4.0" 21 | psutil = "^5.9.5" 22 | einops = "^0.6.1" 23 | zstandard = "^0.21.0" 24 | wandb = "^0.15.5" 25 | scipy = "^1.11.1" 26 | scikit-learn = "^1.3.0" 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | black = "^23.7.0" 30 | mypy = "^1.4.1" 31 | isort = "^5.12.0" 32 | pytest = "^7.4.0" 33 | 34 | [[tool.poetry.source]] 35 | name = "PyPI" 36 | priority = "primary" 37 | 38 | [[tool.poetry.source]] 39 | name = "pytorch_cuda_117" 40 | url = "https://download.pytorch.org/whl/cu117" 41 | priority = "supplemental" 42 | 43 | [build-system] 44 | requires = ["poetry-core"] 45 | build-backend = "poetry.core.masonry.api" 46 | 47 | 48 | [tool.black] 49 | line-length = 140 50 | 51 | [tool.isort] 52 | known_third_party = ["wandb"] 53 | profile = "black" 54 | 55 | [tool.pytest.ini_options] 56 | pythonpath = ["."] 57 | -------------------------------------------------------------------------------- /bert/rst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/bert/rst/__init__.py -------------------------------------------------------------------------------- /bert/rst/get_RSTs_from_wandb.py: -------------------------------------------------------------------------------- 1 | """ 2 | With this script, you can extract RSTs directly from wandb. 3 | You may need to install wandb first and adjust the WANDB_PROJECT variable below. 
4 | """ 5 | import matplotlib.pyplot as plt 6 | import wandb 7 | 8 | WANDB_PROJECT = "" # add project name here 9 | 10 | api = wandb.Api(api_key="") 11 | runs = api.runs(path=WANDB_PROJECT) 12 | 13 | 14 | # %% 15 | times = {} 16 | for run in runs: 17 | bs = run.config["train"]["batch_size"] 18 | steps = [ 19 | 850, 20 | 2000, 21 | 2800, 22 | 3500, 23 | 4000, 24 | 4400, 25 | 4800, 26 | 5100, 27 | 5450, 28 | 5700, 29 | 6000, 30 | 6200, 31 | 6400, 32 | 6650, 33 | 6800, 34 | 7100, 35 | ] 36 | df = run.history(keys=["train_time"]) 37 | df = df.set_index("_step") 38 | values = df.loc[steps].values.reshape(-1).tolist() 39 | val_strs = [f"{x:.4f}" for x in values] 40 | print(f"{bs}: [{', '.join(val_strs)},],") 41 | times[bs] = values 42 | 43 | # As a test, plot the times to check it looks similar to the wandb plot. 44 | for bs, times in times.items(): 45 | xs = [] 46 | ys = [] 47 | for i, time in enumerate(times): 48 | xs.append(i + 0.5) 49 | xs.append(i + 1.5) 50 | ys.append(time) 51 | ys.append(time) 52 | print("", bs, len(times)) 53 | plt.plot(xs, ys, label=f"bs{bs}") 54 | plt.legend() 55 | plt.show() 56 | -------------------------------------------------------------------------------- /bert/rst/saved_rsts.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | 3 | Task = Literal["bert"] 4 | 5 | # BERT 16 layer on NVIDIA 3090 6 | # FORWARD AND BACKWARD; here, we track the time of a minibatch size 7 | NUM_LAYERS_AND_BATCH_TO_TIME_FB_BERT = { 8 | 1536: [ 9 | 0.02521, 10 | 0.03837, 11 | 0.05177, 12 | 0.06508, 13 | 0.07847, 14 | 0.09188, 15 | 0.01052, 16 | 0.1186, 17 | 0.1319, 18 | 0.1454, 19 | 0.1589, 20 | 0.1722, 21 | 0.1857, 22 | 0.1988, 23 | 0.212, 24 | 0.2253, 25 | ], 26 | } 27 | 28 | # BERT 16 layer on NVIDIA 3090 29 | # FORWARD ONLY (relevant for RhoLoss); here, we track the time of a microbatch size 30 | NUM_LAYERS_AND_BATCH_TO_TIME_F_BERT = {128: 0.09285} 31 | 32 | 33 | def get_time_per_step( 34 | batch_size: int, num_active_layers: int, task: Task = "bert", forward_only: bool = False, microbatch_size: Optional[int] = None 35 | ) -> float: 36 | if task == "bert": 37 | if forward_only: 38 | time = NUM_LAYERS_AND_BATCH_TO_TIME_F_BERT[batch_size] 39 | else: 40 | time = NUM_LAYERS_AND_BATCH_TO_TIME_FB_BERT[batch_size][num_active_layers - 1] 41 | 42 | if microbatch_size is None: 43 | return time 44 | else: 45 | if microbatch_size % 128 != 0 or microbatch_size <= 0: 46 | raise ValueError("Microbatch size must be multiple of 128") 47 | return time * microbatch_size / 128.0 48 | else: 49 | raise NotImplementedError("Only BERT is supported in this module.") 50 | -------------------------------------------------------------------------------- /bert/validate_bert.py: -------------------------------------------------------------------------------- 1 | """Evaluates a pretrained model on the pretraining validation set. 2 | 3 | Optionally updates the wandb run with the validation loss. 
4 | """ 5 | 6 | import logging 7 | import sys 8 | 9 | import hydra 10 | import torch 11 | import wandb 12 | from wandb.apis.public import Run 13 | 14 | import cramming 15 | 16 | log = logging.getLogger(__name__) 17 | from cramming.utils import validate 18 | 19 | 20 | def main_eval_process(cfg, setup): 21 | """This function controls the central training loop.""" 22 | 23 | tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg) 24 | model = cramming.construct_model(cfg_arch, tokenizer.vocab_size) 25 | train_dataset, validation_set, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.train) 26 | if cfg.truncate_dataset > 0: 27 | train_dataset = train_dataset.select(range(min(cfg.truncate_dataset, len(train_dataset)))) 28 | validation_set = validation_set.select(range(min(cfg.truncate_dataset, len(validation_set)))) 29 | log.info(f"Train dataset size: {len(train_dataset)}, validation set size: {len(validation_set)}") 30 | model_engine, _, _, _, validation_loader = cramming.load_backend( 31 | model, 32 | train_dataset, 33 | validation_set, 34 | tokenizer, 35 | cfg.train, 36 | cfg.impl, 37 | setup=setup, 38 | ) 39 | 40 | model_engine.load_checkpoint(cfg_arch, model_file) 41 | model_engine.eval() 42 | validation_loss = [validate(model_engine, validation_loader, model_engine.setup["device"])] 43 | 44 | log.info(f"Avg Validation loss: {validation_loss}") 45 | 46 | if cfg.wandb.resume is not None: 47 | print(f"Would you like to update existing run {cfg.wandb.resume}?") 48 | if _ask_yes_no(): 49 | logged_run: Run = wandb.Api().run(path=f"{cfg.wandb.entity}/{cfg.wandb.project}/{cfg.wandb.resume}") 50 | hour_to_log = logged_run.history(keys=["hours"])["hours"].values[-1] + 0.0001 51 | print(f"Logging at hour {hour_to_log:.3f}") 52 | wandb.log({"validation_loss": validation_loss, "hours": hour_to_log}) 53 | else: 54 | print("Not logging") 55 | 56 | 57 | def _ask_yes_no() -> bool: 58 | while True: 59 | sys.stdout.write("y/n:") 60 | response = input().lower() 61 | if response == "y": 62 | return True 63 | if response == "n": 64 | return False 65 | 66 | 67 | @hydra.main(config_path="cramming/config", config_name="cfg_eval_pt", version_base="1.1") 68 | def launch(cfg): 69 | cramming.utils.main_launcher(cfg, main_eval_process, job_name="eval_pt_task") 70 | 71 | 72 | if __name__ == "__main__": 73 | launch() 74 | -------------------------------------------------------------------------------- /t5/.gitignore: -------------------------------------------------------------------------------- 1 | .neptune/ 2 | data 3 | data/ 4 | .DS_Store 5 | .vscode/ -------------------------------------------------------------------------------- /t5/README.md: -------------------------------------------------------------------------------- 1 | # T5 experiments 2 | The T5 experiments are based off the excellent [nanoT5](https://github.com/PiotrNawrot/nanoT5) repository, see [LICENSE](LICENSE). 3 | 4 | ## Environment setup 5 | 6 | Following nanoT5's setup: 7 | 8 | ``` 9 | conda create -n ntng_t5 python=3.8 10 | conda activate ntng_t5 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | The following commands result in the following [pip freeze](assets/pip_freeze.txt) as of 24.07.2023. We also include our [lscpu](assets/lscpu.txt) and [nvidia-smi](assets/nvidia_smi.txt). 15 | 16 | ## Commands for each experiment 17 | 18 | By default the experiments are run for 24 hours. For more details check the default config with all hyperparameters [here](t5/configs/default.json). 
We include the RST measurements [here](t5/utils/train.py). 19 | 20 | ### Baseline 21 | 22 | ``` 23 | python -m t5.train stacking.typ=none 24 | ``` 25 | 26 | ### Stacking 27 | 28 | ``` 29 | python -m t5.train stacking.typ=stack 30 | ``` 31 | 32 | ### Layer Dropping 33 | 34 | ``` 35 | python -m t5.train stacking.typ=drop optim.base_lr=1e-2 stacking.gamma_factor=20 36 | ``` 37 | 38 | ### Sophia 39 | 40 | ``` 41 | python -m t5.train stacking.typ=none optim.name=sophia optim.rho=1e-2 optim.base_lr=1e-3 sophia_freq=10 42 | ``` 43 | 44 | ### Lion 45 | 46 | ``` 47 | python -m t5.train stacking.typ=none optim.name=lion optim.base_lr=7.5e-4 48 | ``` 49 | 50 | ### Fine-Tuning 51 | 52 | We fine-tune the models in the original [nanoT5 repository](https://github.com/PiotrNawrot/nanoT5) using the following command: 53 | 54 | ``` 55 | 56 | python -m nanoT5.main task=ft google/t5-v1_1-base model.random_init=false model.checkpoint_path="/path/to/pytorch_model.bin 57 | ``` 58 | 59 | All our models do not modify the original T5 architecture, so all checkpoints trained in this repository are compabible with the original nanoT5 repository. -------------------------------------------------------------------------------- /t5/assets/lscpu.txt: -------------------------------------------------------------------------------- 1 | Architecture: x86_64 2 | CPU op-mode(s): 32-bit, 64-bit 3 | Byte Order: Little Endian 4 | CPU(s): 128 5 | On-line CPU(s) list: 0-127 6 | Thread(s) per core: 1 7 | Core(s) per socket: 64 8 | Socket(s): 2 9 | NUMA node(s): 8 10 | Vendor ID: AuthenticAMD 11 | CPU family: 25 12 | Model: 1 13 | Model name: AMD EPYC 7763 64-Core Processor 14 | Stepping: 1 15 | CPU MHz: 2445.534 16 | BogoMIPS: 4891.06 17 | Virtualization: AMD-V 18 | L1d cache: 32K 19 | L1i cache: 32K 20 | L2 cache: 512K 21 | L3 cache: 32768K 22 | NUMA node0 CPU(s): 0-15 23 | NUMA node1 CPU(s): 16-31 24 | NUMA node2 CPU(s): 32-47 25 | NUMA node3 CPU(s): 48-63 26 | NUMA node4 CPU(s): 64-79 27 | NUMA node5 CPU(s): 80-95 28 | NUMA node6 CPU(s): 96-111 29 | NUMA node7 CPU(s): 112-127 30 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca 31 | -------------------------------------------------------------------------------- /t5/assets/nvidia_smi.txt: -------------------------------------------------------------------------------- 1 | Mon Jul 24 11:04:03 2023 2 | +-----------------------------------------------------------------------------+ 3 | | NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 | 4 | |-------------------------------+----------------------+----------------------+ 5 | | GPU Name Persistence-M| Bus-Id 
Disp.A | Volatile Uncorr. ECC | 6 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 7 | | | | MIG M. | 8 | |===============================+======================+======================| 9 | | 0 NVIDIA A100-SXM... On | 00000000:01:00.0 Off | 0 | 10 | | N/A 40C P0 79W / 500W | 0MiB / 81920MiB | 0% Default | 11 | | | | Disabled | 12 | +-------------------------------+----------------------+----------------------+ 13 | 14 | +-----------------------------------------------------------------------------+ 15 | | Processes: | 16 | | GPU GI CI PID Type Process name GPU Memory | 17 | | ID ID Usage | 18 | |=============================================================================| 19 | | No running processes found | 20 | +-----------------------------------------------------------------------------+ 21 | -------------------------------------------------------------------------------- /t5/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | transformers 5 | neptune 6 | pdbpp 7 | notebook 8 | protobuf==3.20.* 9 | pyyaml 10 | pynvml 11 | hydra-core 12 | evaluate 13 | nltk 14 | absl-py 15 | rouge_score 16 | torch>=1.13.1,<=2.0.1 17 | hydra_colorlog 18 | wandb 19 | -------------------------------------------------------------------------------- /t5/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/t5/t5/__init__.py -------------------------------------------------------------------------------- /t5/t5/configs/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - task: pt 4 | - local_env: default 5 | 6 | # Experiment args 7 | mode: 'pt' 8 | device: gpu 9 | precision: 'no' 10 | gpus: 1 11 | eval_only: false 12 | predict_only: false 13 | seed: 2137 14 | fine_tune: false 15 | debug: false 16 | 17 | # NTNG args 18 | budget: 24 19 | every_seconds: 86400 20 | sophia_freq: 10 21 | 22 | stacking: 23 | typ: none # {stack, drop} 24 | num_initial_layers: 3 25 | num_layers_to_add: 12 26 | step_fractions: [0.125, 0.3] 27 | doubling: true 28 | gamma_factor: 20 29 | 30 | # Rest of nanoT5 args 31 | model: 32 | klass: my_t5 33 | name: 'google/t5-v1_1-base' 34 | overwrite: 35 | dropout_rate: 0.0 36 | add_config: 37 | share_positional_bias: False 38 | checkpoint_path: '' 39 | random_init: true 40 | compile: true # Pytorch 2.0 41 | 42 | data: 43 | input_length: 512 44 | mlm_probability: 0.15 45 | mean_noise_span_length: 3.0 46 | num_workers: 8 47 | shuffle_buffer_size: 1000 48 | 49 | optim: 50 | name: adamwscale # {sophia, lion} 51 | base_lr: 2e-2 52 | batch_size: 144 53 | total_steps: 65536 54 | epochs: -1 # If it's > 0 it overwrites total_steps 55 | warmup_steps: 10000 56 | lr_scheduler: cosine-budget 57 | weight_decay: 0.0 58 | grad_clip: 1.0 59 | grad_acc: 2 60 | final_cosine: 1e-5 61 | rho: 2e-2 62 | 63 | eval: 64 | every_steps: 500000 # Checkpoint in the end 65 | steps: 500 66 | 67 | checkpoint: 68 | every_steps: 500000 # Checkpoint in the end 69 | 70 | logging: 71 | neptune: false 72 | neptune_creds: 73 | project: 74 | api_token: 75 | tags: 76 | wandb: false 77 | wandb_creds: 78 | name: 79 | project: 80 | entity: 81 | prefix: '' 82 | every_steps: 100 83 | grad_l2: true 84 | weights_l2: true 85 | 86 | hydra: 87 | job: 88 | chdir: True 
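The Hydra defaults above are what the CLI overrides listed in t5/README.md (for example `stacking.typ=drop optim.base_lr=1e-2`) act on. As a rough illustration only, the hypothetical snippet below merges such dot-list overrides into this file with OmegaConf; in the actual experiments `python -m t5.train ...` lets Hydra perform the full composition (including the `task` and `local_env` defaults), so this sketch is just a way to inspect the resulting values.

```python
# Hedged sketch (not part of the repository): preview how dot-list overrides from the README
# combine with the defaults in t5/t5/configs/default.yaml. The path is relative to the repo
# root and assumed; Hydra, not this snippet, does the real composition at launch time.
from omegaconf import OmegaConf

defaults = OmegaConf.load("t5/t5/configs/default.yaml")
overrides = OmegaConf.from_dotlist(["stacking.typ=drop", "optim.base_lr=1e-2", "stacking.gamma_factor=20"])
args = OmegaConf.merge(defaults, overrides)

print(args.stacking.typ)           # drop
print(args.optim.base_lr)          # 1e-2
print(args.stacking.gamma_factor)  # 20
```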
-------------------------------------------------------------------------------- /t5/t5/configs/local_env/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | run: 5 | dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags} -------------------------------------------------------------------------------- /t5/t5/configs/task/debug.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | debug: true 4 | 5 | logging: 6 | every_steps: 2 7 | 8 | stacking: 9 | step_fractions: [0.0001, 0.0002] 10 | 11 | eval: 12 | steps: 5 13 | -------------------------------------------------------------------------------- /t5/t5/configs/task/ft.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | mode: 'ft' 4 | 5 | logging: 6 | prefix: 'ft/' 7 | 8 | data: 9 | max_seq_len: 1024 10 | max_target_len: 128 11 | max_num_instances_per_task: 100 12 | add_task_name: False 13 | add_task_definition: True 14 | num_pos_examples: 2 15 | num_neg_examples: 0 16 | add_explanation: False 17 | tk_instruct: False 18 | exec_file_path: ./adaptive_moe/utils/sni_dataset.py 19 | data_dir: /home/hpcnawr1/data/natural-instructions/splits/default 20 | task_dir: /home/hpcnawr1/data/natural-instructions/tasks 21 | 22 | optim: 23 | name: adamw 24 | base_lr: 5e-5 25 | batch_size: 8 26 | epochs: 2 27 | warmup_steps: 0 28 | lr_scheduler: constant 29 | weight_decay: 0.0 30 | grad_clip: 0.0 31 | grad_acc: 1 32 | 33 | eval: 34 | steps: 200 -------------------------------------------------------------------------------- /t5/t5/configs/task/pt.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /t5/t5/models/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omegaconf import open_dict 3 | from transformers import AutoConfig, AutoTokenizer, T5ForConditionalGeneration 4 | 5 | from .t5 import MyT5 6 | 7 | 8 | def get_model(args, config): 9 | klass = { 10 | "t5": T5ForConditionalGeneration, 11 | "my_t5": MyT5, 12 | }[args.model.klass] 13 | 14 | if args.model.checkpoint_path: 15 | model = klass(config) 16 | model.load_state_dict(torch.load(args.model.checkpoint_path)) 17 | elif args.model.random_init: 18 | model = klass(config) 19 | else: 20 | model = klass.from_pretrained( 21 | args.model.name, 22 | config=config, 23 | ) 24 | 25 | with open_dict(args): 26 | args.n_all_param = sum([p.nelement() for p in model.parameters()]) 27 | 28 | return model 29 | 30 | 31 | def get_config(args): 32 | config = AutoConfig.from_pretrained( 33 | args.model.name, 34 | ) 35 | 36 | if hasattr(args.model, "overwrite"): 37 | for k, v in args.model.overwrite.items(): 38 | assert hasattr(config, k), f"config does not have attribute {k}" 39 | setattr(config, k, v) 40 | 41 | if hasattr(args.model, "add_config"): 42 | for k, v in args.model.add_config.items(): 43 | assert not hasattr(config, k), f"config already has attribute {k}" 44 | setattr(config, k, v) 45 | 46 | return config 47 | 48 | 49 | def get_tokenizer(args): 50 | tokenizer = AutoTokenizer.from_pretrained(args.model.name, use_fast=True) 51 | tokenizer.model_max_length = int(1e9) 52 | 53 | return tokenizer 54 | 
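The helpers above encode a small contract: `model.overwrite` may only touch attributes that already exist on the Hugging Face config, while `model.add_config` may only introduce new ones. The hypothetical snippet below replays that contract by hand for the two defaults set in `t5/t5/configs/default.yaml`; it is a sketch for intuition, not repository code, and it assumes network access to fetch the config.

```python
# Hedged sketch (not part of the repository): the overwrite/add_config contract from get_config.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("google/t5-v1_1-base")

# model.overwrite -> the attribute must already exist (dropout_rate is a standard T5Config field).
assert hasattr(config, "dropout_rate")
setattr(config, "dropout_rate", 0.0)

# model.add_config -> the attribute must NOT exist yet (share_positional_bias is custom here,
# presumably consumed later by MyT5 in t5/t5/models/t5.py).
assert not hasattr(config, "share_positional_bias")
setattr(config, "share_positional_bias", False)
```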
-------------------------------------------------------------------------------- /t5/t5/train.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import hydra 4 | from accelerate import Accelerator 5 | from omegaconf import open_dict 6 | from torch import compile, no_grad 7 | 8 | from .models import get_config, get_model, get_tokenizer 9 | from .utils.data import get_dataloaders 10 | from .utils.general import setup_basics 11 | from .utils.optim import get_lr_scheduler, get_optimizer 12 | from .utils.train import eval, predict, train 13 | 14 | 15 | @hydra.main(config_path="configs", config_name="default", version_base="1.1") 16 | def main(args): 17 | accelerator = Accelerator( 18 | cpu=args.device == "cpu", 19 | mixed_precision=args.precision, 20 | ) 21 | logger = setup_basics(accelerator, args) 22 | config = get_config(args) 23 | model = get_model(args, config) 24 | tokenizer = get_tokenizer(args) 25 | optimizer = get_optimizer(model, args) 26 | lr_scheduler = get_lr_scheduler(optimizer, args, logger) 27 | train_dataloader, test_dataloader = get_dataloaders(tokenizer, config, args) 28 | 29 | logger.log_args(args) 30 | 31 | ( 32 | model, 33 | optimizer, 34 | lr_scheduler, 35 | train_dataloader, 36 | test_dataloader, 37 | ) = accelerator.prepare( 38 | model, optimizer, lr_scheduler, train_dataloader, test_dataloader 39 | ) 40 | 41 | if args.model.compile: 42 | if args.stacking.typ == "none": 43 | model = compile(model) 44 | else: 45 | model.lm_head = compile(model.lm_head) 46 | model.shared = compile(model.shared) 47 | model.encoder.embed_tokens = compile(model.encoder.embed_tokens) 48 | model.decoder.embed_tokens = compile(model.decoder.embed_tokens) 49 | model.decoder.final_layer_norm = compile(model.decoder.final_layer_norm) 50 | for i in range(len(model.encoder.block)): 51 | model.encoder.block[i] = compile(model.encoder.block[i]) 52 | 53 | for i in range(len(model.decoder.block)): 54 | model.decoder.block[i] = compile(model.decoder.block[i]) 55 | 56 | with open_dict(args): 57 | args.start_time = time.time() 58 | args.current_train_step = 1 59 | args.current_epoch = 1 60 | args.last_log = time.time() 61 | args.seconds_counter = 0.0 62 | args.fake_step = 0 63 | args.eval_cou = 1 64 | args.check_cou = 1 65 | 66 | if args.eval_only: 67 | model.eval() 68 | with no_grad(): 69 | eval(model, test_dataloader, logger, args, tokenizer) 70 | elif args.predict_only: 71 | model.eval() 72 | with no_grad(): 73 | predict(model, test_dataloader, logger, args, tokenizer) 74 | else: 75 | train( 76 | model, 77 | train_dataloader, 78 | test_dataloader, 79 | accelerator, 80 | lr_scheduler, 81 | optimizer, 82 | logger, 83 | args, 84 | tokenizer, 85 | ) 86 | 87 | logger.finish() 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /t5/t5/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeanKaddour/NoTrainNoGain/a7e65998c897865c17ac37482295ca9560113adc/t5/t5/utils/__init__.py -------------------------------------------------------------------------------- /t5/t5/utils/data.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from datasets.iterable_dataset import IterableDataset 3 | from omegaconf import open_dict 4 | from torch.utils.data import DataLoader 5 | 6 | from .copied import ( 7 | DataCollatorForNI, 8 | 
DataCollatorForT5MLM, 9 | compute_input_and_target_lengths, 10 | tokenize_function, 11 | ) 12 | 13 | 14 | def load_dataset_splits(args): 15 | if args.mode == "pt": 16 | dataset = datasets.load_dataset( 17 | "c4", 18 | "en", 19 | streaming=True, 20 | ) 21 | 22 | dataset = dataset.remove_columns(["timestamp", "url"]) 23 | 24 | dataset_splits = { 25 | "train": dataset["train"], 26 | "test": dataset["validation"], 27 | } 28 | 29 | assert ( 30 | dataset["train"].n_shards == 1024 31 | ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader" 32 | elif args.mode == "ft": 33 | dataset_splits = datasets.load_dataset( 34 | args.data.exec_file_path, 35 | data_dir=args.data.data_dir, 36 | task_dir=args.data.task_dir, 37 | max_num_instances_per_task=args.data.max_num_instances_per_task, 38 | max_num_instances_per_eval_task=args.data.max_num_instances_per_task, 39 | ) 40 | else: 41 | raise NotImplementedError 42 | 43 | return dataset_splits 44 | 45 | 46 | def process_dataset(dataset_splits, args, tokenizer): 47 | if args.mode == "pt": 48 | final_datasets = {} 49 | 50 | for split, dataset_split in dataset_splits.items(): 51 | 52 | # We increase the input_length, because instead of masking tokens T5 replaces 53 | # masked spans with a single token, therefore to avoid padding we need to have 54 | # longer sequences at the start, before masking 55 | before_mask_input_length, target_length = compute_input_and_target_lengths( 56 | inputs_length=args.data.input_length, 57 | noise_density=args.data.mlm_probability, 58 | mean_noise_span_length=args.data.mean_noise_span_length, 59 | ) 60 | 61 | with open_dict(args): 62 | args.data.before_mask_input_length = before_mask_input_length 63 | args.data.target_length = target_length 64 | 65 | dataset_split = dataset_split.map( 66 | tokenize_function, 67 | batched=True, 68 | fn_kwargs={ 69 | "tokenizer": tokenizer, 70 | "in_length": before_mask_input_length, 71 | }, 72 | remove_columns=["text"], 73 | ) 74 | 75 | dataset_split = dataset_split.shuffle( 76 | seed=args.seed, buffer_size=args.data.shuffle_buffer_size 77 | ) 78 | final_datasets[split] = dataset_split 79 | elif args.mode == "ft": 80 | final_datasets = dataset_splits 81 | else: 82 | raise NotImplementedError 83 | 84 | return final_datasets 85 | 86 | 87 | def get_data_collator(tokenizer, config, args): 88 | if args.mode == "pt": 89 | data_collator = DataCollatorForT5MLM( 90 | tokenizer=tokenizer, 91 | noise_density=args.data.mlm_probability, 92 | mean_noise_span_length=args.data.mean_noise_span_length, 93 | input_length=args.data.input_length, 94 | target_length=args.data.target_length, 95 | pad_token_id=config.pad_token_id, 96 | ) 97 | elif args.mode == "ft": 98 | data_collator = DataCollatorForNI( 99 | tokenizer, 100 | padding="longest", 101 | max_source_length=args.data.max_seq_len, 102 | max_target_length=args.data.max_target_len, 103 | label_pad_token_id=-100, 104 | pad_to_multiple_of=1, 105 | add_task_name=args.data.add_task_name, 106 | add_task_definition=args.data.add_task_definition, 107 | num_pos_examples=args.data.num_pos_examples, 108 | num_neg_examples=args.data.num_neg_examples, 109 | add_explanation=args.data.add_explanation, 110 | tk_instruct=args.data.tk_instruct, 111 | ) 112 | else: 113 | raise NotImplementedError 114 | 115 | return data_collator 116 | 117 | 118 | def get_dataloaders(tokenizer, config, args): 119 | dataset_splits = load_dataset_splits(args) 120 | dataset = process_dataset( 121 | dataset_splits=dataset_splits, args=args, tokenizer=tokenizer 
122 | ) 123 | data_collator = get_data_collator(tokenizer=tokenizer, config=config, args=args) 124 | 125 | is_iterable = isinstance(dataset["train"], IterableDataset) 126 | 127 | dataloaders = {} 128 | 129 | for split in ["train", "test"]: 130 | batch_size = args.optim.batch_size // args.optim.grad_acc 131 | 132 | if split in ["test"]: 133 | batch_size *= 2 134 | 135 | shuffle = (split == "train") and not is_iterable 136 | 137 | if args.mode == "ft" and split == "train": 138 | assert shuffle is True 139 | else: 140 | assert shuffle is False 141 | 142 | dataloaders[split] = DataLoader( 143 | dataset[split], 144 | shuffle=shuffle, 145 | collate_fn=data_collator, 146 | batch_size=batch_size, 147 | num_workers=args.data.num_workers, 148 | pin_memory=True, 149 | drop_last=False, 150 | ) 151 | 152 | # Add & Check args about data loaders 153 | with open_dict(args): 154 | if not is_iterable: 155 | args.data.train_batches = len(dataloaders["train"]) 156 | args.data.test_batches = len(dataloaders["test"]) 157 | 158 | if args.optim.epochs > 0: 159 | assert not is_iterable 160 | args.optim.total_steps = ( 161 | len(dataloaders["train"]) // args.optim.grad_acc 162 | ) * args.optim.epochs 163 | 164 | # We increase eval BS by 2, so decrease number of eval steps 165 | args.eval.corrected_steps = args.eval.steps / 2 166 | 167 | return dataloaders["train"], dataloaders["test"] 168 | -------------------------------------------------------------------------------- /t5/t5/utils/general.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from accelerate.utils import set_seed 5 | from hydra.utils import to_absolute_path 6 | from omegaconf import open_dict 7 | 8 | from .logging import Logger 9 | 10 | 11 | def check_args_and_env(args): 12 | assert args.optim.batch_size % args.optim.grad_acc == 0 13 | 14 | # Train log must happen before eval log 15 | assert args.eval.every_steps % args.logging.every_steps == 0 16 | 17 | if args.device == "gpu": 18 | assert torch.cuda.is_available(), "We use GPU to train/eval the model" 19 | 20 | assert not (args.eval_only and args.predict_only) 21 | 22 | if args.predict_only: 23 | assert args.mode == "ft" 24 | 25 | 26 | def opti_flags(args): 27 | # This lines reduce training step by 2.4x 28 | torch.backends.cuda.matmul.allow_tf32 = True 29 | torch.backends.cudnn.allow_tf32 = True 30 | 31 | 32 | def update_args_with_env_info(args): 33 | with open_dict(args): 34 | slurm_id = os.getenv("SLURM_JOB_ID") 35 | 36 | if slurm_id is not None: 37 | args.slurm_id = slurm_id 38 | else: 39 | args.slurm_id = "none" 40 | 41 | args.working_dir = os.getcwd() 42 | 43 | 44 | def update_paths(args): 45 | if args.mode == "ft": 46 | args.data.exec_file_path = to_absolute_path(args.data.exec_file_path) 47 | args.data.data_dir = to_absolute_path(args.data.data_dir) 48 | args.data.task_dir = to_absolute_path(args.data.task_dir) 49 | 50 | 51 | def setup_basics(accelerator, args): 52 | check_args_and_env(args) 53 | update_args_with_env_info(args) 54 | update_paths(args) 55 | opti_flags(args) 56 | 57 | # To skip scientific notation 58 | torch.set_printoptions( 59 | precision=3, 60 | sci_mode=False, 61 | ) 62 | 63 | if args.seed is not None: 64 | set_seed(args.seed) 65 | 66 | logger = Logger(args=args, accelerator=accelerator) 67 | 68 | return logger 69 | -------------------------------------------------------------------------------- /t5/t5/utils/lion.py: -------------------------------------------------------------------------------- 1 | 
# Copied from https://github.com/lucidrains/lion-pytorch/blob/main/lion_pytorch/lion_pytorch.py 2 | 3 | from typing import Callable, Optional, Tuple 4 | 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | # functions 9 | 10 | 11 | def _rms(tensor): 12 | return tensor.norm(2) / (tensor.numel() ** 0.5) 13 | 14 | 15 | def exists(val): 16 | return val is not None 17 | 18 | 19 | # update functions 20 | 21 | 22 | def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2): 23 | # stepweight decay 24 | 25 | # Adafactor RMS 26 | lr = lr * max(1e-3, _rms(p.data)) 27 | 28 | p.data.mul_(1 - lr * wd) 29 | 30 | # weight update 31 | 32 | update = exp_avg.clone().mul_(beta1).add(grad, alpha=1 - beta1).sign_() 33 | p.add_(update, alpha=-lr) 34 | 35 | # decay the momentum running average coefficient 36 | 37 | exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2) 38 | 39 | 40 | # class 41 | 42 | 43 | class Lion(Optimizer): 44 | def __init__( 45 | self, 46 | params, 47 | lr: float = 1e-4, 48 | betas: Tuple[float, float] = (0.9, 0.99), 49 | weight_decay: float = 0.0, 50 | ): 51 | assert lr > 0.0 52 | assert all([0.0 <= beta <= 1.0 for beta in betas]) 53 | 54 | defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) 55 | 56 | super().__init__(params, defaults) 57 | 58 | self.update_fn = update_fn 59 | 60 | @torch.no_grad() 61 | def step(self, closure: Optional[Callable] = None): 62 | 63 | loss = None 64 | if exists(closure): 65 | with torch.enable_grad(): 66 | loss = closure() 67 | 68 | for group in self.param_groups: 69 | for p in filter(lambda p: exists(p.grad), group["params"]): 70 | 71 | grad, lr, wd, beta1, beta2, state = ( 72 | p.grad, 73 | group["lr"], 74 | group["weight_decay"], 75 | *group["betas"], 76 | self.state[p], 77 | ) 78 | 79 | # init state - exponential moving average of gradient values 80 | 81 | if len(state) == 0: 82 | state["exp_avg"] = torch.zeros_like(p) 83 | 84 | exp_avg = state["exp_avg"] 85 | 86 | self.update_fn(p, grad, exp_avg, lr, wd, beta1, beta2) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /t5/t5/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import defaultdict 4 | 5 | import datasets 6 | import neptune 7 | import transformers 8 | import wandb 9 | from accelerate.logging import get_logger 10 | from neptune.utils import stringify_unsupported 11 | from omegaconf import OmegaConf, open_dict 12 | 13 | 14 | class Averager: 15 | def __init__(self, weight: float = 1): 16 | self.weight = weight 17 | self.reset() 18 | 19 | def reset(self): 20 | self.total = defaultdict(float) 21 | self.counter = defaultdict(float) 22 | 23 | def update(self, stats): 24 | for key, value in stats.items(): 25 | self.total[key] = self.total[key] * self.weight + value * self.weight 26 | self.counter[key] = self.counter[key] * self.weight + self.weight 27 | 28 | def average(self): 29 | averaged_stats = { 30 | key: tot / self.counter[key] for key, tot in self.total.items() 31 | } 32 | self.reset() 33 | 34 | return averaged_stats 35 | 36 | 37 | class Logger: 38 | def __init__(self, args, accelerator): 39 | self.logger = get_logger("Main") 40 | 41 | # Make one log on every process with the configuration for debugging. 
42 | logging.basicConfig( 43 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 44 | datefmt="%m/%d/%Y %H:%M:%S", 45 | level=logging.INFO, 46 | ) 47 | self.logger.info(accelerator.state, main_process_only=False) 48 | self.logger.info(f"Working directory is {os.getcwd()}") 49 | 50 | if accelerator.is_local_main_process: 51 | datasets.utils.logging.set_verbosity_warning() 52 | transformers.utils.logging.set_verbosity_info() 53 | else: 54 | datasets.utils.logging.set_verbosity_error() 55 | transformers.utils.logging.set_verbosity_error() 56 | 57 | self.setup_neptune(args) 58 | self.setup_wandb(args) 59 | 60 | def setup_wandb(self, args): 61 | if args.logging.wandb: 62 | wandb.init( 63 | name=args.logging.wandb_creds.name, 64 | project=args.logging.wandb_creds.project, 65 | entity=args.logging.wandb_creds.entity, 66 | ) 67 | else: 68 | self.wandb_run = None 69 | 70 | self.wandb_run = wandb.run 71 | 72 | with open_dict(args): 73 | if self.wandb_run is not None: 74 | args.wandb_id = self.wandb_run.id 75 | 76 | def setup_neptune(self, args): 77 | if args.logging.neptune: 78 | tags = [str(item) for item in args.logging.neptune_creds.tags.split(",")] 79 | if tags == [] or tags == [""]: 80 | tags = None 81 | 82 | neptune_logger = neptune.init_run( 83 | project=args.logging.neptune_creds.project, 84 | api_token=args.logging.neptune_creds.api_token, 85 | tags=tags, 86 | ) 87 | else: 88 | neptune_logger = None 89 | 90 | self.neptune_logger = neptune_logger 91 | 92 | with open_dict(args): 93 | if neptune_logger is not None: 94 | args.neptune_id = neptune_logger["sys/id"].fetch() 95 | 96 | def log_args(self, args): 97 | if self.wandb_run is not None: 98 | logging_args = OmegaConf.to_container(args, resolve=True) 99 | wandb.config.update(logging_args) 100 | 101 | if self.neptune_logger is not None: 102 | logging_args = OmegaConf.to_container(args, resolve=True) 103 | self.neptune_logger["args"] = stringify_unsupported(logging_args) 104 | 105 | def log_stats(self, stats, step, args, prefix=""): 106 | if self.neptune_logger is not None: 107 | for k, v in stats.items(): 108 | self.neptune_logger[f"{prefix}{k}"].log(v, step=step) 109 | 110 | if self.wandb_run is not None: 111 | for k, v in stats.items(): 112 | wandb.log({f"{prefix}{k}": v}, step=step) 113 | 114 | msg_start = ( 115 | f"[{prefix[:-1]}] Step {step} out of {args.optim.total_steps}" + " | " 116 | ) 117 | dict_msg = ( 118 | " | ".join([f"{k.capitalize()} --> {v:.3f}" for k, v in stats.items()]) 119 | + " | " 120 | ) 121 | 122 | msg = msg_start + dict_msg 123 | 124 | self.log_message(msg) 125 | 126 | def log_message(self, msg): 127 | self.logger.info(msg) 128 | 129 | def finish(self): 130 | if self.neptune_logger is not None: 131 | self.neptune_logger.stop() 132 | 133 | if self.wandb_run is not None: 134 | wandb.finish() 135 | -------------------------------------------------------------------------------- /t5/t5/utils/optim.py: -------------------------------------------------------------------------------- 1 | def get_optimizer(model, args): 2 | if args.optim.name == "adamwscale": 3 | from .copied import AdamWScale 4 | 5 | optimizer = AdamWScale( 6 | model.parameters(), 7 | lr=args.optim.base_lr, 8 | ) 9 | elif args.optim.name == "lion": 10 | from .lion import Lion 11 | 12 | optimizer = Lion( 13 | model.parameters(), 14 | weight_decay=args.optim.weight_decay, 15 | lr=args.optim.base_lr, 16 | ) 17 | elif args.optim.name == "sophia": 18 | from .sophia import SophiaG 19 | 20 | optimizer = SophiaG( 21 | model.parameters(), 22 | 
rho=args.optim.rho, 23 | weight_decay=args.optim.weight_decay, 24 | lr=args.optim.base_lr, 25 | ) 26 | else: 27 | raise NotImplementedError 28 | 29 | return optimizer 30 | 31 | 32 | def get_lr_scheduler(optimizer, args, logger): 33 | if args.optim.lr_scheduler == "cosine": 34 | from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR 35 | 36 | scheduler1 = LinearLR( 37 | optimizer, 38 | start_factor=0.5, 39 | end_factor=1, 40 | total_iters=args.optim.warmup_steps, 41 | last_epoch=-1, 42 | ) 43 | 44 | scheduler2 = CosineAnnealingLR( 45 | optimizer, 46 | T_max=args.optim.total_steps - args.optim.warmup_steps, 47 | eta_min=args.optim.final_cosine, 48 | ) 49 | 50 | lr_scheduler = SequentialLR( 51 | optimizer, 52 | schedulers=[scheduler1, scheduler2], 53 | milestones=[args.optim.warmup_steps], 54 | ) 55 | elif args.optim.lr_scheduler == "legacy": 56 | import math 57 | 58 | from torch.optim.lr_scheduler import LambdaLR, LinearLR, SequentialLR 59 | 60 | msg = "You are using T5 legacy LR Schedule, it's independent from the optim.base_lr" 61 | logger.log_message(msg) 62 | 63 | num_steps_optimizer1 = math.ceil(args.optim.total_steps * 0.9) 64 | iters_left_for_optimizer2 = args.optim.total_steps - num_steps_optimizer1 65 | 66 | scheduler1 = LambdaLR( 67 | optimizer, 68 | lambda step: min(1e-2, 1.0 / math.sqrt(step)) / args.optim.base_lr 69 | if step 70 | else 1e-2 / args.optim.base_lr, 71 | ) 72 | 73 | scheduler2 = LinearLR( 74 | optimizer, 75 | start_factor=( 76 | min(1e-2, 1.0 / math.sqrt(num_steps_optimizer1)) / args.optim.base_lr 77 | ), 78 | end_factor=0, 79 | total_iters=iters_left_for_optimizer2, 80 | last_epoch=-1, 81 | ) 82 | 83 | lr_scheduler = SequentialLR( 84 | optimizer, 85 | schedulers=[scheduler1, scheduler2], 86 | milestones=[num_steps_optimizer1], 87 | ) 88 | elif args.optim.lr_scheduler == "constant": 89 | from transformers import get_scheduler 90 | 91 | lr_scheduler = get_scheduler( 92 | name=args.optim.lr_scheduler, 93 | optimizer=optimizer, 94 | ) 95 | elif args.optim.lr_scheduler == "cosine-budget": 96 | import math 97 | 98 | from torch.optim.lr_scheduler import LambdaLR 99 | 100 | num_warmup_steps = args.optim.warmup_steps 101 | num_training_steps = args.optim.total_steps 102 | num_cycles = 0.5 103 | 104 | def lr_lambda(current_step): 105 | fake_step = current_step 106 | 107 | if fake_step < num_warmup_steps: 108 | return ( 109 | (float(fake_step) / float(max(1, num_warmup_steps))) * 0.5 110 | ) + 0.5 111 | 112 | progress = float(fake_step - num_warmup_steps) / float( 113 | max(1, num_training_steps - num_warmup_steps) 114 | ) 115 | return max( 116 | 1e-5, 117 | 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)), 118 | ) 119 | 120 | return LambdaLR(optimizer, lr_lambda, -1) 121 | else: 122 | raise NotImplementedError 123 | 124 | return lr_scheduler 125 | --------------------------------------------------------------------------------
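As a closing reference, here is a standalone sketch (not repository code) of the multiplier that the `cosine-budget` branch of `get_lr_scheduler` above hands to `LambdaLR`, evaluated with the defaults `warmup_steps: 10000` and `total_steps: 65536` from `t5/t5/configs/default.yaml`; the actual learning rate at a step is this multiplier times `optim.base_lr`.

```python
# Hedged sketch: the "cosine-budget" LR multiplier, mirroring the lr_lambda in get_lr_scheduler.
import math

num_warmup_steps, num_training_steps, num_cycles = 10_000, 65_536, 0.5


def lr_multiplier(step: int) -> float:
    if step < num_warmup_steps:
        # Warmup ramps the multiplier linearly from 0.5 up to 1.0.
        return (float(step) / float(max(1, num_warmup_steps))) * 0.5 + 0.5
    progress = float(step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    # Cosine decay from 1.0 down to a floor of 1e-5 at the end of the budget.
    return max(1e-5, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))


for step in (0, 5_000, 10_000, 40_000, 65_536):
    print(f"step {step:>6}: multiplier {lr_multiplier(step):.4f}")
```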