├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── abacus.py
├── arithmetic_eval_quicker.py
├── cramming
    ├── __init__.py
    ├── architectures
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── components.py
    │   ├── construction.py
    │   ├── crammed_depthrecurrent.py
    │   ├── crammed_transformer.py
    │   ├── embeddings.py
    │   ├── huggingface_interface.py
    │   ├── losses.py
    │   └── sanity_check.py
    ├── backend
    │   ├── __init__.py
    │   ├── optimizers
    │   │   ├── __init__.py
    │   │   ├── optimizer_modifiers.py
    │   │   ├── progressive_batching.py
    │   │   └── schedulers.py
    │   ├── prepare_backend.py
    │   ├── torch_default.py
    │   └── utils.py
    ├── config
    │   ├── __init__.py
    │   ├── arch
    │   │   ├── __init__.py
    │   │   ├── albert.yaml
    │   │   ├── crammed-depthrecurrent.yaml
    │   │   ├── crammed-fakeRNN.yaml
    │   │   ├── crammed-janus.yaml
    │   │   ├── crammed-rnn.yaml
    │   │   ├── crammed-stack-janus.yaml
    │   │   ├── crammed-tiny.yaml
    │   │   ├── crammed-transformer.yaml
    │   │   ├── gpt2-base.yaml
    │   │   ├── hf-gpt2.yaml
    │   │   └── sanitycheck.yaml
    │   ├── cfg_eval.yaml
    │   ├── cfg_pretrain.yaml
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── arithmetic.yaml
    │   │   ├── c4-subset-processed.yaml
    │   │   ├── openweb.yaml
    │   │   ├── proofpile.yaml
    │   │   ├── sanity-check-1.yaml
    │   │   ├── sanity-check-2.yaml
    │   │   └── sources
    │   │   │   ├── ag_news.yaml
    │   │   │   ├── arithmetic.yaml
    │   │   │   ├── bookcorpus.yaml
    │   │   │   ├── c4.yaml
    │   │   │   ├── dash_books.yaml
    │   │   │   ├── fake.yaml
    │   │   │   ├── iwslt.yaml
    │   │   │   ├── local.yaml
    │   │   │   ├── no_code_stackexchange.yaml
    │   │   │   ├── openwebtext.yaml
    │   │   │   ├── oscar.yaml
    │   │   │   ├── proofpiledata.yaml
    │   │   │   ├── the_pile.yaml
    │   │   │   ├── the_pileCC.yaml
    │   │   │   ├── the_pile_dedup.yaml
    │   │   │   ├── the_pile_natural.yaml
    │   │   │   ├── the_pile_stream.yaml
    │   │   │   ├── uncorpus.yaml
    │   │   │   ├── uspto.yaml
    │   │   │   ├── wikibooks.yaml
    │   │   │   ├── wikinews.yaml
    │   │   │   ├── wikipedia.yaml
    │   │   │   ├── wikiquote.yaml
    │   │   │   ├── wikiversity.yaml
    │   │   │   └── wikivoyage.yaml
    │   ├── eval
    │   │   ├── __init__.py
    │   │   ├── pythia.yaml
    │   │   └── tasks
    │   │   │   ├── lambada_openai.yaml
    │   │   │   └── winogrande.yaml
    │   ├── hydra
    │   │   ├── __init__.py
    │   │   └── job_logging
    │   │   │   └── custom.yaml
    │   ├── impl
    │   │   ├── __init__.py
    │   │   ├── _default.yaml
    │   │   └── torch-default.yaml
    │   ├── train
    │   │   ├── __init__.py
    │   │   ├── common.yaml
    │   │   ├── cramming.yaml
    │   │   ├── janus-regime.yaml
    │   │   ├── optim
    │   │   │   ├── adafactor.yaml
    │   │   │   ├── adahessian.yaml
    │   │   │   ├── adam.yaml
    │   │   │   ├── adam8bit.yaml
    │   │   │   ├── adam_classic.yaml
    │   │   │   ├── adamscale.yaml
    │   │   │   ├── agd.yaml
    │   │   │   ├── lion.yaml
    │   │   │   ├── radam.yaml
    │   │   │   ├── sgd.yaml
    │   │   │   └── shampoo.yaml
    │   │   └── optim_mod
    │   │   │   ├── disabled.yaml
    │   │   │   ├── larc.yaml
    │   │   │   ├── lars.yaml
    │   │   │   ├── progressive.yaml
    │   │   │   └── sam.yaml
    │   └── wandb
    │   │   ├── default.yaml
    │   │   └── none.yaml
    ├── data
    │   ├── __init__.py
    │   ├── arithmetic_tokenizers.py
    │   ├── curriculum_sorting.py
    │   ├── deduplicate.py
    │   ├── pretraining_preparation.py
    │   ├── tokenizer_preparation.py
    │   └── utils.py
    └── utils.py
├── create_data_split.py
├── create_pos_or_variants.py
├── dataset_analysis.py
├── gen_eval_script.py
├── load_local_model.py
├── pretrain.py
├── pretty_plotter.py
├── pretty_plotter_big.py
├── pretty_plotter_sort.py
├── pyproject.toml
├── setup.cfg
├── shells
    ├── addition_ff.sh
    ├── addition_lt.sh
    ├── bitwise_or.sh
    ├── evaluation.sh
    ├── generate_and_tokenize_data.sh
    ├── multiplication.sh
    └── sorting.sh
├── sort_eval.py
└── upload_processed_dataset.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | outputs
  2 | tables/*/*.csv
  3 | tables/*/*.csv#
  4 | tables/*.csv
  5 | tables/*.csv#
  6 | tables/*.ods
  7 | *.png
  8 | *.pdf
  9 | 
 10 | # torchdynamo debug
 11 | isolate
 12 | repro.py
 13 | 
 14 | checkpoints
 15 | wandb-metadata.json
 16 | 
 17 | torch_compile_debug/
 18 | 
 19 | dedup
 20 | 
 21 | .vs/
 22 | 
 23 | *.pdf
 24 | images
 25 | 
 26 | *.temp.sh
 27 | 
 28 | # Byte-compiled / optimized / DLL files
 29 | __pycache__/
 30 | *.py[cod]
 31 | *$py.class
 32 | 
 33 | # C extensions
 34 | *.so
 35 | 
 36 | # Distribution / packaging
 37 | .Python
 38 | build/
 39 | develop-eggs/
 40 | dist/
 41 | downloads/
 42 | eggs/
 43 | .eggs/
 44 | lib/
 45 | lib64/
 46 | parts/
 47 | sdist/
 48 | var/
 49 | wheels/
 50 | pip-wheel-metadata/
 51 | share/python-wheels/
 52 | *.egg-info/
 53 | .installed.cfg
 54 | *.egg
 55 | MANIFEST
 56 | 
 57 | # PyInstaller
 58 | #  Usually these files are written by a python script from a template
 59 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 60 | *.manifest
 61 | *.spec
 62 | 
 63 | # Installer logs
 64 | pip-log.txt
 65 | pip-delete-this-directory.txt
 66 | 
 67 | # Unit test / coverage reports
 68 | htmlcov/
 69 | .tox/
 70 | .nox/
 71 | .coverage
 72 | .coverage.*
 73 | .cache
 74 | nosetests.xml
 75 | coverage.xml
 76 | *.cover
 77 | *.py,cover
 78 | .hypothesis/
 79 | .pytest_cache/
 80 | 
 81 | # Translations
 82 | *.mo
 83 | *.pot
 84 | 
 85 | # Django stuff:
 86 | *.log
 87 | local_settings.py
 88 | db.sqlite3
 89 | db.sqlite3-journal
 90 | 
 91 | # Flask stuff:
 92 | instance/
 93 | .webassets-cache
 94 | 
 95 | # Scrapy stuff:
 96 | .scrapy
 97 | 
 98 | # Sphinx documentation
 99 | docs/_build/
100 | 
101 | # PyBuilder
102 | target/
103 | 
104 | # Jupyter Notebook
105 | .ipynb_checkpoints
106 | 
107 | # IPython
108 | profile_default/
109 | ipython_config.py
110 | 
111 | # pyenv
112 | .python-version
113 | 
114 | # pipenv
115 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
117 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
118 | #   install all needed dependencies.
119 | #Pipfile.lock
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 | 
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 | 
144 | # Rope project settings
145 | .ropeproject
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 | 
155 | # Pyre type checker
156 | .pyre/
157 | 
158 | *.csv
159 | *.txt
160 | *.pth
161 | 
162 | cramming-data/
163 | sanity.sh
164 | log/
165 | del.sh
166 | del.py
167 | sort_plots/


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # precommit hooks from https://github.com/ashleve/lightning-hydra-template
 2 | repos:
 3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 4 |     rev: v3.4.0
 5 |     hooks:
 6 |       # list of supported hooks: https://pre-commit.com/hooks.html
 7 |       - id: trailing-whitespace
 8 |       - id: end-of-file-fixer
 9 |       - id: check-yaml
10 |       - id: check-added-large-files
11 |       - id: debug-statements
12 |       - id: detect-private-key
13 | 
14 |   # python code formatting
15 |   - repo: https://github.com/psf/black
16 |     rev: 22.3.0
17 |     hooks:
18 |       - id: black
19 |         args: [--line-length, "140", "--fast"] # ;>
20 | 
21 |   # yaml formatting
22 |   - repo: https://github.com/pre-commit/mirrors-prettier
23 |     rev: v2.3.0
24 |     hooks:
25 |       - id: prettier
26 |         types: [yaml]
27 | 
28 |   # python code analysis
29 |   - repo: https://github.com/PyCQA/flake8
30 |     rev: 4.0.1
31 |     hooks:
32 |       - id: flake8
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Sean McLeish, Jonas Geiping
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # added by check-manifest
2 | include *.py
3 | include *.yaml
4 | recursive-include cramming *.md
5 | recursive-include cramming *.yaml
6 | global-exclude *.pyc
7 | global-exclude __pycache__
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Transformers Can Do Arithmetic with the Right Embeddings! [Link to arXiv paper](https://arxiv.org/abs/2405.17399)
  2 | 
  3 | A joint project by: Sean McLeish, Arpit Bansal, Alex Stein,  Neel Jain, John Kirchenbauer, Brian R. Bartoldson, Bhavya Kailkhura, Abhinav Bhatele, Jonas Geiping, Avi Schwarzschild and Tom Goldstein
  4 | 
  5 | 
  6 | 
  7 | This repository contains code to replicate our research. It is a fork of the language model training framework [cramming](https://github.com/JonasGeiping/cramming), edited for a next-token prediction objective.
  8 | 
  9 | We provide a standalone implementation of Abacus Embeddings in [abacus.py](abacus.py).
 10 | 
 11 | ## Citing Our Work
 12 | To cite our work, please use this bibtex.
 13 | ```
 14 | @article{mcleish2024transformers,
 15 |     title={Transformers Can Do Arithmetic with the Right Embeddings}, 
 16 |     author={Sean McLeish and Arpit Bansal and Alex Stein and Neel Jain and John Kirchenbauer and Brian R. Bartoldson and Bhavya Kailkhura and Abhinav Bhatele and Jonas Geiping and Avi Schwarzschild and Tom Goldstein},
 17 |     journal={arXiv preprint arXiv:2405.17399},
 18 |     year={2024}
 19 | }
 20 | ```
 21 | 
 22 | # Getting Started
 23 | We developed in Python 3.10.4, to install run:
 24 | ```
 25 | git clone git@github.com:mcleish7/arithmetic.git
 26 | cd arithmetic
 27 | pip install .
 28 | ```
 29 | 
 30 | On some machines you will need to run:
 31 | 1. `pip install multiprocess -U`
 32 | 2. `pip install dill -U`
 33 | 3. `pip install apache-beam -U`
 34 | 
 35 | # Arithmetic
 36 | ## Datasets
 37 | We release our datasets on [Google Drive](https://drive.google.com/drive/folders/1DqjCrUM1cNV7069Zl25_qBw2Px2xAw9j?usp=sharing) both in zipped format. We recommend you work with the zipped version until it is correctly placed in your file system.
 38 | 
 39 | Alternatively, you can make your own datasets using [create_data_split.py](create_data_split.py) using the commands from [shells/generate_and_tokenize_data.sh](shells/generate_and_tokenize_data.sh).
 40 | 
 41 | ## File Structure
 42 | We recommend creating another directory `cramming-data` inside of arithmetic. This is where the models, logs and data will be stored.
 43 | 
 44 | You can either export your cramming base directory path in your `.bashrc` or you can replace `$cramming_base_dir` manually in the provided shells.
 45 | ```
 46 | cd arithmetic
 47 | mkdir cramming-data
 48 | echo 'export cramming_base_dir=MY_BASE_DIR' >> ~/.bashrc
 49 | source ~/.bashrc
 50 | ```
 51 | For example, this may look like: `echo 'export cramming_base_dir=~/arithmetic/cramming-data' >> ~/.bashrc`
 52 | 
 53 | For example our file system looks like:
 54 | ```
 55 | cramming-generative
 56 | └── cramming-data
 57 |     ├── addition-train-one
 58 |     │    ├── pretrain/<DATE>/<TIME>
 59 |     │    │    ├── .hydra
 60 |     │    │    │   ├── config.yaml
 61 |     │    │    │   ├── hydra.yaml
 62 |     │    │    │   └── overrides.yaml
 63 |     │    │    └── addition-train-one_pretrain.log
 64 |     │    ├── checkpoints/FINAL_<LOSS_VAL>
 65 |     │    │    ├── model_config.json
 66 |     │    │    ├── model.safetensors
 67 |     │    │    └── state_dict.pth
 68 |     │    └── downstream
 69 |     └── data
 70 |         └── arithmetic_data
 71 |             ├── +_grid_eval_dataset_reverse_all_tokenized
 72 |             └── ... other datasets ...
 73 | ```
 74 | 
 75 | ## Training
 76 | Example commands are in the [shells](shells) directory, organised by task.
 77 | 
 78 | ### Explanation of Some Commands
 79 | 1. Give samples instead of tokens equal importance in loss: `arch.loss_reduction=none`
 80 | 2. Divide the gradients in the recurrent block by the number of recurrences: `arch.throttle=True`
 81 | 3. Mask before the equals sign: `arch.mask_before_equals=True`
 82 | 4. Skip connections inside of the recurrent block: `arch.forward_only_model_with_skip=True`
 83 | 5. Multi-GPU: `python` -> `torchrun --nproc_per_node=<NUM GPUS> --standalone ` and add `impl.fullgraph=false`
 84 | 
 85 | ### Positional Embeddings:
 86 | #### Absolute
 87 | 1. Learned: `arch.embedding.pos_embedding=learned`
 88 | 2. Abacus: `arch.embedding.pos_embedding=abacus`
 89 | * If you want the maximum k in abacus to be larger: `arch.embedding.max_abacus_len=100`, by default this value is 100. Abacus is also implemented in a standalone manner in [abacus.py](abacus.py).
 90 | 
 91 | #### Relative
 92 | 1. NoPE: `arch.embedding.pos_embedding=None`
 93 | 2. FIRE: `arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"`
 94 | 3. FIRE randomised: e.g:`arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.attention.max_length=128` by default `arch.attention.max_length=0` so setting this longer than the max sequence length gives some randomness in the embedding.
 95 | 4. RoPE: `arch.attention.type="self-attention" arch.attention.rotary_embedding=true`
 96 | 
 97 | ### Checkpointing
 98 | We have implemented *single* GPU training checkpointing, to do this use:
 99 | `impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'`
100 | This saves a checkpoint every 60 minutes under the name 'last'
101 | 
102 | Caution: This feature is not fully tested for multi-GPU cases. We also cannot currently train models which have used their full budget for longer.
103 | 
104 | ### WandB
105 | You can log runs to your weights&biases account. To do so, simply modify `wandb.entity` and `wandb.project` on the command line or at [cramming/config/wandb/default.yaml](cramming/config/wandb/default.yaml).
106 | 
107 | ## Testing
108 | We show examples in [shells/evaluation.sh](shells/evaluation.sh). 
109 | 
110 | We provide a very basic automation in [gen_eval_script.py](gen_eval_script.py); it prints the basic commands, which you may need to edit further.
111 | 
112 | ### Addition
113 | For addition we have a very large possible evaluation set, we do a grid search over a 100x100 grid which we split into 20 pieces with the aim of balancing the number of forward calls across all 20 pieces.
114 | We then have a further eval for operand lengths 100->160.
115 | 
116 | ### Multiplication
117 | We only evaluate up to 25x25, which we do in a single job.
118 | 
119 | ### Sorting
120 | Sorting uses a separate evaluation file [sort_eval.py](sort_eval.py), this is because the evaluation calls cannot be parallelised, making evaluation much longer.
121 | The evaluation cannot be parallelised because the place of the equals sign is not fixed for a batch.
122 | We currently evaluate across 30 jobs for a 30x30 grid but this can be reduced to a smaller number of jobs using these flags: `max_size_given, start_ind_1_given, start_ind_2_given`
123 | 
124 | ### Bitwise OR
125 | We use the same framework as for addition but the process is quicker as some of the batches do not contain 100 samples as there are not 100 possibilities for some batches. Unlike addition we do not sample with replacement for this task.
126 | 
127 | # Analysis
128 | 1. We provide [pretty_plotter.py](pretty_plotter.py) to combine the small evaluation grids together into one plot.
129 | Use this by putting the model name into the string at the top of the `main` function.
130 | 2. For the large 100x100 grids we provide [pretty_plotter_big.py](pretty_plotter_big.py).
131 | These are designed to be as flexible as possible but may need to be edited to fit your file set up.
132 | 3. For sorting, we provide [pretty_plotter_sort.py](pretty_plotter_sort.py), this allows us to read the individual `.txt` files created during testing and merge them all together into a nice plot.
133 | 
134 | # Contact
135 | Please feel free to contact us with any questions, or open an issue on GitHub.


--------------------------------------------------------------------------------
/abacus.py:
--------------------------------------------------------------------------------
 1 | """Implementation of abacus embeddings"""
 2 | # Example of how to extract digit tokens to pass into constructor
 3 | # digit_tokens = tokenizer.convert_tokens_to_ids(['0','1','2','3','4','5','6','7','8','9'])
 4 | 
 5 | class Abacus(torch.nn.Module):
 6 |     """
 7 |     Abacus Embeddings, learned emebddings resued for each digit.
 8 |     Integers must be reversed for this to work correctly.
 9 |     Transformers Can Do Arithmetic with the Right Embeddings, McLeish et al. (2024)
10 |     """
11 |     def __init__(self, digit_tokens, embedding_dim, max_seq_length=1024, max_k=99):
12 |         """
13 |         digit_tokens (list): list of the tokens for each of the 10 digits, `digit_tokens = tokenizer.convert_tokens_to_ids(['0','1','2','3','4','5','6','7','8','9'])`
14 |         embedding_dim (int): dimension to embed into
15 |         max_seq_length (int): maximum number of embeddings that can be trained
16 |         max_k (int): maximum k value which we randomly shift by during training
17 |         """
18 |         super().__init__()
19 |         self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
20 |         self.register_buffer("digits", torch.tensor(digit_tokens), persistent=False)
21 | 
22 |         self.max_k = max_k
23 | 
24 |     def helper(self, mask, device):
25 |         """
26 |         Converts a binary mask of digit locations into spans of consecutive digits
27 |         """
28 |         mask_shape = mask.shape
29 |         
30 |         # Create a shifted version of the mask to detect changes from 0 to 1
31 |         shifted_mask = torch.cat([torch.zeros((mask_shape[0], 1), device=device, dtype=mask.dtype), mask[:, :-1]], dim=1)
32 |         starts = (shifted_mask != mask) & mask
33 |         
34 |         # Generate IDs for each segment of 1s, processing row-wise
35 |         segment_ids = torch.cumsum(starts, dim=1)
36 |         
37 |         # Generate an index array row-wise
38 |         index = torch.arange(mask.size(1)).repeat(mask.size(0), 1).to(device)
39 |         
40 |         # Reset index at the start of each segment
41 |         reset_index = torch.zeros_like(mask).long()
42 |         second_term = index * starts.long()
43 |         reset_index = reset_index.scatter_add(1, segment_ids, second_term)
44 |         
45 |         # Calculate positions in segment
46 |         positions = index - reset_index.gather(1, segment_ids) + 1
47 |         
48 |         # Ensure only values within 1-segments are non-zero
49 |         result = positions * mask
50 | 
51 |         return result
52 | 
53 |     def forward(self, input_ids):
54 |         """
55 |         input_ids (tensor): a batch of inputs, each row is a sample
56 |         """
57 |         mask = torch.isin(input_ids, self.digits)
58 |         output = self.helper(mask, input_ids.device)
59 | 
60 |         k=0
61 |         if self.training:
62 |             k = random.randint(0, self.max_k)
63 |             output[output>0] += k # as we already have ones in the tensor, the tensor values will be k+1
64 | 
65 |         return self.embedding(output)
66 | 


--------------------------------------------------------------------------------
/cramming/__init__.py:
--------------------------------------------------------------------------------
 1 | """Initialize cramming"""
 2 | 
 3 | from cramming import utils
 4 | from cramming.architectures import construct_model
 5 | from cramming.backend import load_backend
 6 | from cramming.data import load_pretraining_corpus, prepare_dataloaders
 7 | 
 8 | 
 9 | __all__ = [
10 |     "construct_model",
11 |     "load_backend",
12 |     "prepare_dataloaders",
13 |     "load_pretraining_corpus",
14 |     "utils",
15 | ]
16 | 
17 | 
18 | import hydra
19 | 
20 | """Construct interfaces to some cfg folders for use in packaged installations:"""
21 | 
22 | 
def get_config(overrides=None):
    """Return default hydra config.

    Args:
        overrides: Optional list of hydra override strings forwarded to
            ``hydra.compose``. ``None`` (the default) means no overrides.

    Returns:
        The config composed from the ``config`` folder under the name ``cfg``.
    """
    # Fresh list per call instead of a shared mutable default argument.
    overrides = [] if overrides is None else overrides
    # NOTE(review): the visible tree only lists cfg_eval.yaml / cfg_pretrain.yaml,
    # no cfg.yaml -- confirm that config_name="cfg" still resolves.
    with hydra.initialize(config_path="config"):
        cfg = hydra.compose(config_name="cfg", overrides=overrides)
        print(f"Loading default config {cfg.name}.")
    return cfg
29 | 
30 | 
def get_model_config(arch="hf-bert-tiny", overrides=None):
    """Return default hydra config for a given model architecture.

    Args:
        arch: Name of the config file (without extension) inside ``config/arch``.
        overrides: Optional list of hydra override strings forwarded to
            ``hydra.compose``. ``None`` (the default) means no overrides.

    Returns:
        The composed architecture config.
    """
    # Fresh list per call instead of a shared mutable default argument.
    overrides = [] if overrides is None else overrides
    # NOTE(review): the visible config/arch folder has no hf-bert-tiny.yaml, so the
    # default `arch` presumably fails to compose -- confirm or pass `arch` explicitly.
    with hydra.initialize(config_path="config/arch"):
        cfg = hydra.compose(config_name=arch, overrides=overrides)
        print(f"Loading model configuration {cfg.architecture}.")
    return cfg
37 | 
38 | 
def get_backend_config(backend="torch-default", overrides=None):
    """Return default hydra config for a given backend implementation.

    Args:
        backend: Name of the config file (without extension) inside ``config/impl``.
        overrides: Optional list of hydra override strings forwarded to
            ``hydra.compose``. ``None`` (the default) means no overrides.

    Returns:
        The composed backend config.
    """
    # Fresh list per call instead of a shared mutable default argument.
    overrides = [] if overrides is None else overrides
    with hydra.initialize(config_path="config/impl"):
        cfg = hydra.compose(config_name=backend, overrides=overrides)
        print(f"Loading backend {cfg.name}.")
    return cfg
45 | 


--------------------------------------------------------------------------------
/cramming/architectures/__init__.py:
--------------------------------------------------------------------------------
1 | """This module handles all questions of model architecture."""
2 | 
3 | from .construction import construct_model
4 | 
5 | __all__ = ["construct_model"]
6 | 


--------------------------------------------------------------------------------
/cramming/architectures/construction.py:
--------------------------------------------------------------------------------
 1 | """Interface to construct models."""
 2 | 
 3 | from .huggingface_interface import construct_huggingface_model
 4 | from .sanity_check import SanityCheckforPreTraining
 5 | from .crammed_transformer import construct_crammed_transformer
 6 | from .crammed_depthrecurrent import construct_crammed_recurrent
 7 | 
 8 | import logging
 9 | from ..utils import is_main_process
10 | 
11 | log = logging.getLogger(__name__)
12 | 
13 | 
def construct_model(cfg_arch, tokenizer):
    """Construct the model described by ``cfg_arch``.

    Local architectures are selected by substring match on ``cfg_arch.model_type``;
    if none matches (or ``model_type`` is absent) the config is handed to
    ``construct_huggingface_model``.

    Args:
        cfg_arch: Architecture config; read via ``in`` and attribute access.
        tokenizer: Tokenizer providing ``vocab_size`` and, for the depth-recurrent
            model, a ``vocab`` mapping that contains ``"="``.

    Returns:
        The constructed model.

    Raises:
        ValueError: If the Hugging Face fallback fails; the original exception is
            attached as ``__cause__``.
    """
    model = None
    if "model_type" in cfg_arch:
        # attempt to solve locally
        if "SanityCheckLM" in cfg_arch.model_type:
            model = SanityCheckforPreTraining(cfg_arch.width, tokenizer.vocab_size)
        elif "ScriptableCrammedTransformer" in cfg_arch.model_type:
            model = construct_crammed_transformer(cfg_arch, tokenizer.vocab_size)
        elif "ScriptableCrammedDepthRecurrent" in cfg_arch.model_type:
            equals_token = tokenizer.vocab["="]
            model = construct_crammed_recurrent(cfg_arch, tokenizer.vocab_size, equals_token)

    if model is not None:  # Return local model arch
        num_params = sum(p.numel() for p in model.parameters())
        if is_main_process():
            log.info(f"Model with architecture {cfg_arch.model_type} loaded with {num_params:,} parameters.")
        return model

    try:  # else try on HF
        model = construct_huggingface_model(cfg_arch, tokenizer.vocab_size)
        num_params = sum(p.numel() for p in model.parameters())
        if is_main_process():
            log.info(f"Model with config {cfg_arch} loaded with {num_params:,} parameters.")
        return model
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise ValueError(f"Invalid model architecture {cfg_arch.model_type} given. Error: {e}") from e
41 | 


--------------------------------------------------------------------------------
/cramming/architectures/crammed_transformer.py:
--------------------------------------------------------------------------------
  1 | """Base file for modifications of the transformer architecture"""
  2 | import torch
  3 | from transformers import PretrainedConfig, PreTrainedModel
  4 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
  5 | 
  6 | from typing import Optional
  7 | from omegaconf import OmegaConf
  8 | 
  9 | from .components import (
 10 |     _get_norm_fn,
 11 |     _get_nonlin_fn,
 12 |     NormalizedResidualConnection,
 13 |     EmbeddingComponent,
 14 |     GLU,
 15 |     get_causal_attention_mask,
 16 |     _init_module,
 17 | )
 18 | from .attention import get_attention_mechanism
 19 | 
 20 | 
 21 | class crammedTransformerConfig(PretrainedConfig):
 22 |     model_type = "crammedTransformer"
 23 | 
 24 |     def __init__(self, cfg_arch_container: dict = {}, **kwargs):
 25 |         self.arch = cfg_arch_container
 26 |         super().__init__(**kwargs)
 27 | 
 28 | 
 29 | def construct_crammed_transformer(cfg_arch, vocab_size):
 30 |     """See the config file for details on what is possible."""
 31 |     cfg_arch.embedding.vocab_size = vocab_size
 32 | 
 33 |     config = crammedTransformerConfig(OmegaConf.to_container(cfg_arch, resolve=True))
 34 |     model = ScriptableLMForPreTraining(config)
 35 | 
 36 |     return model
 37 | 
 38 | 
 39 | class FFNComponent(torch.nn.Module):
 40 |     """Note: The FF layer is not auto-scaled when using a GLU type activation.
 41 |     Better do this manually and choose a sensible intermed_size that is nicely divisible.
 42 | 
 43 |     The neox suggestion for approx. equal parameter count is int(4 * 2 / 3 * hidden_size) * 2 [this is ~5.33]
 44 |     """
 45 | 
 46 |     def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=None):
 47 |         super().__init__()
 48 |         self.dense_in = torch.nn.Linear(hidden_size, intermed_size, bias=cfg_arch.use_bias)
 49 |         self.nonlin = _get_nonlin_fn(cfg_arch.nonlin)()
 50 |         if isinstance(self.nonlin, GLU):
 51 |             intermed_output_size = intermed_size // 2
 52 |         else:
 53 |             intermed_output_size = intermed_size
 54 |         if cfg_arch.sub_normalization:
 55 |             self.norm = _get_norm_fn(cfg_arch.norm)(intermed_output_size, eps=cfg_arch.norm_eps)
 56 |         else:
 57 |             self.norm = torch.nn.Identity()
 58 |         output_size = hidden_size if output_size is None else output_size
 59 |         self.dense_out = torch.nn.Linear(intermed_output_size, output_size, bias=cfg_arch.use_bias)
 60 | 
 61 |     def forward(self, hidden_states):
 62 |         return self.dense_out(self.norm(self.nonlin(self.dense_in(hidden_states))))
 63 | 
 64 | 
 65 | class TransformerLayer(torch.nn.Module):
 66 |     """A transformer structure based on the components from above."""
 67 | 
 68 |     def __init__(self, idx, cfg_arch):
 69 |         super().__init__()
 70 |         self.residual1 = NormalizedResidualConnection(cfg_arch.hidden_size, cfg_arch)
 71 |         self.residual2 = NormalizedResidualConnection(cfg_arch.hidden_size, cfg_arch)
 72 |         if cfg_arch.attention.sub_normalization:
 73 |             sub_norm_fn = lambda: get_norm_fn(cfg_arch.norm)(cfg_arch.hidden_size, eps=cfg_arch.norm_eps)  # noqa
 74 |         else:
 75 |             sub_norm_fn = torch.nn.Identity
 76 |         self.attn = get_attention_mechanism(idx, cfg_arch.hidden_size, cfg_arch.attention, sub_norm_fn)
 77 |         self.ffn = FFNComponent(cfg_arch.hidden_size, cfg_arch.intermed_size, cfg_arch)
 78 |         self.LAYOUT = self.attn.LAYOUT
 79 | 
 80 |     def forward(self, states, attention_mask: Optional[torch.Tensor] = None):
 81 |         states = self.residual1(states, self.attn, states, attention_mask)
 82 |         states = self.residual2(states, self.ffn, states)
 83 |         return states
 84 | 
 85 | 
 86 | class ScriptableLM(PreTrainedModel):
 87 |     """Simplified transformer wrapper."""
 88 | 
 89 |     config_class = crammedTransformerConfig
 90 | 
 91 |     def __init__(self, config):
 92 |         super().__init__(config)
 93 |         self.cfg = OmegaConf.create(config.arch)
 94 | 
 95 |         self.embedding = EmbeddingComponent(self.cfg.embedding, self.cfg.norm, self.cfg.norm_eps)
 96 |         self.layers = torch.nn.ModuleList([TransformerLayer(idx, self.cfg) for idx in range(self.cfg.num_transformer_layers)])
 97 |         self.seq_first = self.layers[0].LAYOUT == "[S B H]" if len(self.layers) > 0 else False
 98 | 
 99 |         if self.cfg.final_norm:
100 |             self.final_norm = _get_norm_fn(self.cfg.norm)(self.cfg.hidden_size, eps=self.cfg.norm_eps)
101 |         else:
102 |             self.final_norm = torch.nn.Identity()
103 | 
104 |         self.register_buffer("attention_mask", torch.ones([0, 0, 0, 0], dtype=torch.bool), persistent=False)
105 | 
106 |     def forward(self, input_ids: torch.Tensor):
107 |         if input_ids.shape[1] != self.attention_mask.shape[1]:
108 |             self.attention_mask = get_causal_attention_mask(input_ids)
109 |         hidden_states = self.embedding(input_ids)
110 | 
111 |         if self.seq_first:
112 |             hidden_states = hidden_states.transpose(0, 1).contiguous()
113 | 
114 |         for i, layer_module in enumerate(self.layers):
115 |             hidden_states = layer_module(hidden_states, self.attention_mask)
116 | 
117 |         # if self.seq_first:
118 |         #     hidden_states = hidden_states.transpose(0, 1).contiguous()
119 |         # this happens only in the output if necessary
120 | 
121 |         return self.final_norm(hidden_states)
122 | 
123 | 
class ScriptableLMForPreTraining(PreTrainedModel):
    """Pretraining version: encoder plus a tied linear decoder and a next-token cross-entropy loss."""

    config_class = crammedTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)

        self.encoder = ScriptableLM(config)

        embedding_cfg = self.cfg.embedding
        self.decoder = torch.nn.Linear(embedding_cfg.embedding_dim, embedding_cfg.vocab_size, bias=self.cfg.decoder_bias)
        # Tie the output projection to the input word embedding.
        self.decoder.weight = self.encoder.embedding.word_embedding.weight

        self.loss_fn = torch.nn.CrossEntropyLoss()
        self._init_weights()

    def _init_weights(self, module=None):
        """Initialise ``module``, or every submodule of this model when ``module`` is None."""
        targets = [module] if module is not None else self.modules()
        for target in targets:
            _init_module(
                target,
                self.cfg.init.type,
                self.cfg.init.std,
                self.cfg.hidden_size,
                self.cfg.num_transformer_layers,
            )

    def forward(self, input_ids: torch.Tensor, *args, **kwargs):
        """Return the shifted next-token loss, the last-position logits and the detached loss."""
        outputs = self.decoder(self.encoder(input_ids))

        if not self.encoder.seq_first:
            shifted_outputs = outputs[..., :-1, :].contiguous()
            shifted_labels = input_ids[..., 1:].contiguous()
            outputs = outputs.detach()
        else:
            # Sequence-first layout: shift along dim 0 and bring the labels into the same layout.
            shifted_outputs = outputs[:-1]
            shifted_labels = input_ids.transpose(0, 1)[1:].contiguous()
            outputs = outputs.detach().transpose(0, 1)

        # Flatten the tokens and compute loss
        vocab_dim = shifted_outputs.shape[-1]
        loss = self.loss_fn(shifted_outputs.view(-1, vocab_dim), shifted_labels.view(-1))

        return {"loss": loss, "logits": outputs[:, -1, :], "log_perplexity": loss.clone().detach()}
167 | 
168 | 
# ###### HF registry here? ############### #
# Register the config class under the "crammedTransformer" key, then map that config class
# to the headless model (AutoModel) and to the pretraining model (AutoModelForCausalLM).

AutoConfig.register("crammedTransformer", crammedTransformerConfig)
AutoModel.register(crammedTransformerConfig, ScriptableLM)
AutoModelForCausalLM.register(crammedTransformerConfig, ScriptableLMForPreTraining)
174 | 


--------------------------------------------------------------------------------
/cramming/architectures/huggingface_interface.py:
--------------------------------------------------------------------------------
 1 | """HF model variations based on reconfiguring their huggingface implementations."""
 2 | 
 3 | import transformers
 4 | 
 5 | 
 6 | def construct_huggingface_model(cfg_arch, vocab_size):
 7 |     """construct model from given configuration. Only works if this arch exists on the hub."""
 8 | 
 9 |     if isinstance(cfg_arch, transformers.PretrainedConfig):
10 |         configuration = cfg_arch
11 |     else:
12 |         model_type = cfg_arch["model_type"]
13 |         configuration = transformers.AutoConfig.from_pretrained(pretrained_model_name_or_path=model_type, **cfg_arch)
14 |     configuration.vocab_size = vocab_size
15 |     model = transformers.AutoModelForPreTraining.from_config(configuration)
16 |     model.vocab_size = model.config.vocab_size
17 | 
18 |     old_forward = model.forward
19 | 
20 |     def modified_forward(input_ids, attention_mask=None, **kwargs):
21 |         return old_forward(input_ids=input_ids, labels=input_ids, attention_mask=attention_mask)
22 | 
23 |     model.forward = modified_forward
24 | 
25 |     return model
26 | 


--------------------------------------------------------------------------------
/cramming/architectures/losses.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import math
  3 | 
  4 | 
  5 | class CosineLoss(torch.nn.Module):
  6 |     __constants__ = ["reduction"]
  7 |     reduction: str
  8 | 
  9 |     def __init__(self, reduction: str = "mean", dim=-1, eps=1e-8) -> None:
 10 |         super().__init__()
 11 |         self.reduction = reduction
 12 |         assert self.reduction == "mean"
 13 |         self.dim = dim
 14 |         self.eps = eps
 15 | 
 16 |     def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
 17 |         return 1 - torch.nn.functional.cosine_similarity(x1, x2, self.dim, self.eps).mean()
 18 | 
 19 | 
 20 | class CrossEntropyWithZLoss(torch.nn.Module):
 21 |     """Cross Entropy plus logit regularization via z_loss."""
 22 | 
 23 |     __constants__ = ["ignore_index", "z_loss_factor"]
 24 |     ignore_index: int
 25 |     z_loss_factor: float
 26 | 
 27 |     def __init__(self, ignore_index=-100, z_loss_factor=1e-4):
 28 |         super().__init__()
 29 |         self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
 30 |         self.z_loss_factor = z_loss_factor
 31 |         self.ignore_index = ignore_index
 32 | 
 33 |     def forward(self, inputs, labels):
 34 |         """Is this is the optimal implementation? Is this even what is meant?
 35 |         I wish there were more answers or code for PaLM
 36 | 
 37 |         This implementation assumes that log(Z) is log(sum(exp(logits))).
 38 |         The usage of log2 here is also a bit wild...
 39 |         """
 40 |         z_reg = inputs.exp().sum(dim=-1).log2().sum() * self.z_loss_factor
 41 |         return self.loss_fn(inputs, labels) + z_reg
 42 | 
 43 | 
 44 | class MSELoss(torch.nn.Module):
 45 |     """MSE Loss as a drop-in replacement for Cross Entropy Loss.
 46 | 
 47 |     This implementation includes a mean reduction in batch dimension and a 1/num_classes/M reduction in classes."""
 48 | 
 49 |     def __init__(self, ignore_index=-100):
 50 |         """Parameters as in Hui&Belkin, 2021, but k=1, and M=sqrt(C) (so maybe not really Hui&Belkin?)"""
 51 |         super().__init__()
 52 |         self.ignore_index = ignore_index
 53 | 
 54 |     def forward(self, inputs, labels):
 55 |         """Is this is the optimal implementation? Could also do an index_select variation..."""
 56 |         num_classes = inputs.shape[-1]
 57 |         valid_mask = labels != self.ignore_index
 58 |         M = math.sqrt(num_classes)
 59 |         onehot_labels = self._label_to_onehot(labels[valid_mask], M, num_classes=num_classes)
 60 |         return 1 / (2 * M * num_classes) * (inputs[valid_mask] - onehot_labels).pow(2).sum()
 61 | 
 62 |     @staticmethod
 63 |     @torch.jit.script
 64 |     def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
 65 |         onehot_target = torch.zeros(target.shape[0], num_classes, device=target.device)
 66 |         onehot_target.scatter_(1, target.view(-1, 1), M)
 67 |         return onehot_target
 68 | 
 69 | 
 70 | class MSELossFast(torch.nn.Module):
 71 |     """MSE Loss as a drop-in replacement for Cross Entropy Loss. Only for 2dim inputs and 1dim labels
 72 | 
 73 |     This implementation includes a mean reduction in batch dimension and a 1/num_classes/M reduction in classes."""
 74 | 
 75 |     def __init__(self, ignore_index=-100):
 76 |         """Parameters as in Hui&Belkin, 2021, but k=1, and M=sqrt(C) (so maybe not really Hui&Belkin?)"""
 77 |         super().__init__()
 78 |         self.ignore_index = ignore_index
 79 | 
 80 |     def forward(self, inputs, labels):
 81 |         """Is this is the optimal implementation? This at least circumvents literal 1-hot labels"""
 82 |         num_examples, num_classes = inputs.shape
 83 |         valid_mask = labels != self.ignore_index
 84 |         M = math.sqrt(num_classes)
 85 | 
 86 |         inputs = inputs[valid_mask]
 87 |         labels = labels[valid_mask]
 88 | 
 89 |         x_i = inputs.pow(2).sum()
 90 |         x_j = inputs[torch.arange(labels.shape[-1]), labels].sum()
 91 |         return 1 / (2 * M * num_classes) * (x_i - 2 * M * x_j + labels.shape[-1] * M**2)
 92 | 
 93 | 
 94 | class L1Loss(torch.nn.Module):
 95 |     """L1 Loss as a drop-in replacement for Cross Entropy Loss. Only for 2dim inputs and 1dim labels
 96 | 
 97 |     This implementation includes a mean reduction in batch dimension and a 1/num_classes reduction in classes."""
 98 | 
 99 |     def __init__(self, ignore_index=-100):
100 |         """."""
101 |         super().__init__()
102 |         self.ignore_index = ignore_index
103 | 
104 |     def forward(self, inputs, labels):
105 |         """Optimal scaling is less clear for L1"""
106 |         num_classes = inputs.shape[-1]
107 |         valid_mask = labels != self.ignore_index
108 |         M = math.sqrt(num_classes)
109 |         onehot_labels = self._label_to_onehot(labels[valid_mask], float(num_classes), num_classes=num_classes)
110 |         return 1 / inputs.shape[0] / M * (inputs[valid_mask] - onehot_labels).abs().sum()
111 | 
112 |     @staticmethod
113 |     @torch.jit.script
114 |     def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
115 |         onehot_target = torch.zeros(target.shape[0], num_classes, device=target.device)
116 |         onehot_target.scatter_(1, target.view(-1, 1), M)
117 |         return onehot_target
118 | 
119 | 
class SzegedyLoss(torch.nn.Module):
    """Regression directly back to input embedding. Remove the decoding layer if using this loss.

    As mentioned at https://twitter.com/ChrSzegedy/status/1533322132368728064?t=xz00T1YT3-WiE0id-h3MEA&s=19
    """

    def __init__(self, embedding_layer, ignore_index=-100, overrelaxation=2.0):
        """Overrelax parameter is quite a bit speculative...

        Args:
            embedding_layer: Callable module mapping label ids to embedding vectors.
            ignore_index: Label value whose positions are excluded from the loss.
            overrelaxation: Factor applied to the embedded labels before the regression.
        """
        super().__init__()
        self.embedding = embedding_layer
        self.ignore_index = ignore_index
        self.overrelaxation = overrelaxation

    def forward(self, inputs, labels):
        """This really just does L2(DNN(embed(x[:,:-1]), 2.0 * stop_gradient(embed(x[:,1:]))) as quoted above

        ``inputs`` must be two-dimensional. The summed squared error over valid positions
        is divided by ``labels.shape[-1]`` (which counts ignored positions too) and by
        ``inputs.shape[1]``.
        """
        # Unpacking doubles as a check that inputs has exactly two dimensions.
        _, num_classes = inputs.shape
        valid_mask = labels != self.ignore_index

        inputs = inputs[valid_mask]
        with torch.no_grad():
            # Ignored positions carry ignore_index (e.g. -100), which is not a valid id for
            # the lookup. Multiplying by the boolean mask turns them into id 0; their
            # embeddings are discarded by the mask right after the lookup.
            lookup_ids = labels * valid_mask
            embedded_labels = self.overrelaxation * self.embedding(lookup_ids)[valid_mask]

        return (inputs - embedded_labels).pow(2).sum() / labels.shape[-1] / num_classes
144 | 
145 | 
146 | """Focal Loss from https://github.com/clcarwin/focal_loss_pytorch (minimally modernized into pytorch 1.12)"""
147 | 
148 | """
149 | MIT License
150 | 
151 | Copyright (c) 2017 carwin
152 | 
153 | Permission is hereby granted, free of charge, to any person obtaining a copy
154 | of this software and associated documentation files (the "Software"), to deal
155 | in the Software without restriction, including without limitation the rights
156 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
157 | copies of the Software, and to permit persons to whom the Software is
158 | furnished to do so, subject to the following conditions:
159 | 
160 | The above copyright notice and this permission notice shall be included in all
161 | copies or substantial portions of the Software.
162 | 
163 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
164 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
166 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
167 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
168 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
169 | SOFTWARE.
170 | """
171 | 
172 | 
class FocalLoss(torch.nn.Module):
    """Focal loss: cross entropy whose per-example terms are weighted by ``(1 - p_target) ** gamma``.

    Args:
        gamma: Focusing exponent; ``gamma = 0`` reduces the loss to plain cross entropy.
        size_average: Average over the valid examples if True, otherwise sum them.
        ignore_index: Target value whose positions are excluded from the loss.
    """

    def __init__(self, gamma: float = 5.0, size_average: bool = True, ignore_index: int = -100):
        super().__init__()
        self.register_buffer("gamma", torch.as_tensor(gamma, dtype=torch.float), persistent=False)
        self.size_average = size_average
        self.ignore_index = ignore_index

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Return the focal loss of logits ``input`` (classes on the last dimension) against ``target``."""
        valid_mask = target != self.ignore_index

        # Masking leaves one row of logits per valid target.
        all_log_probs = torch.nn.functional.log_softmax(input[valid_mask], dim=-1)
        # The index needs shape [N, 1] so that row i is read at column target[i]; a [1, N]
        # index would read every target from row 0.
        log_probs = all_log_probs.gather(1, target[valid_mask].unsqueeze(1)).squeeze(1)

        loss = -1 * (1 - log_probs.exp()) ** self.gamma * log_probs
        if self.size_average:
            return loss.mean()
        else:
            return loss.sum()
189 | 
190 | 
class IncorrectCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    """CrossEntropyLoss, but only on incorrectly classified examples."""

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Apply cross entropy to the rows whose argmax over the last dimension differs from ``target``."""
        # The selection mask is computed without tracking gradients.
        with torch.no_grad():
            misclassified = input.argmax(dim=-1) != target

        hard_inputs = input[misclassified]
        hard_targets = target[misclassified]
        return torch.nn.functional.cross_entropy(
            hard_inputs,
            hard_targets,
            weight=self.weight,
            ignore_index=self.ignore_index,
            reduction=self.reduction,
            label_smoothing=self.label_smoothing,
        )
205 | 


--------------------------------------------------------------------------------
/cramming/architectures/sanity_check.py:
--------------------------------------------------------------------------------
 1 | """Sanity Check architecture."""
 2 | import torch
 3 | from typing import Optional
 4 | 
 5 | 
 6 | class SanityCheckforPreTraining(torch.nn.Module):
 7 |     """Make big go fast."""
 8 | 
 9 |     def __init__(self, width, vocab_size):
10 |         super().__init__()
11 |         self.word_embedding = torch.nn.Embedding(vocab_size, width, padding_idx=0)
12 |         self.transform = torch.nn.Linear(width, width, bias=False)
13 | 
14 |     def forward(
15 |         self,
16 |         input_ids,
17 |         attention_mask: Optional[torch.Tensor] = None,
18 |         labels: Optional[torch.Tensor] = None,
19 |         token_type_ids: Optional[torch.Tensor] = None,
20 |     ) -> dict[str, torch.Tensor]:
21 | 
22 |         embeds = self.word_embedding(input_ids)
23 |         outputs = self.transform(embeds)
24 |         loss = outputs.mean()
25 | 
26 |         return {"logits": outputs, "loss": loss}
27 | 


--------------------------------------------------------------------------------
/cramming/backend/__init__.py:
--------------------------------------------------------------------------------
 1 | """This module implements interfaces to the various backends."""
 2 | 
 3 | from .prepare_backend import load_backend
 4 | from .utils import load_model_checkpoint, get_model_engine_tokenizer_dataloaders
 5 | 
# Names re-exported from the prepare_backend and utils submodules.
__all__ = [
    "load_backend",
    "load_model_checkpoint",
    "get_model_engine_tokenizer_dataloaders",
]
11 | 


--------------------------------------------------------------------------------
/cramming/backend/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .progressive_batching import ProgressiveBatching
2 | from .optimizer_modifiers import SAM, LARS
3 | from .schedulers import get_schedule_fn
4 | 


--------------------------------------------------------------------------------
/cramming/backend/optimizers/optimizer_modifiers.py:
--------------------------------------------------------------------------------
  1 | """This is the apex LARS implementation, from the apex repository.
  2 | 
  3 | It implements LARS + optional clipping
  4 | 
  5 | https://github.com/NVIDIA/apex/blob/d74fda260c403f775817470d87f810f816f3d615/apex/parallel/LARC.py
  6 | 
  7 | 
  8 | I did rename it to "LARS".
  9 | """
 10 | 
 11 | import torch
 12 | 
 13 | 
 14 | class MetaOptimizer(torch.optim.Optimizer):
 15 |     """base class for a meta optimizer that wraps and modifies an existing pytorch optimizer."""
 16 | 
 17 |     def __init__(self, optimizer):
 18 |         self.param_groups = optimizer.param_groups
 19 |         self.optim = optimizer
 20 | 
 21 |     def __getstate__(self):
 22 |         return self.optim.__getstate__()
 23 | 
 24 |     def __setstate__(self, state):
 25 |         self.optim.__setstate__(state)
 26 | 
 27 |     def __repr__(self):
 28 |         return self.__class__.__name__ + self.optim.__repr__()
 29 | 
 30 |     def __getattr__(self, name):
 31 |         """Call this only if all other attributes are exhausted."""
 32 |         return getattr(self.optim, name)
 33 | 
 34 |     @torch.no_grad()
 35 |     def step(self, closure=None):
 36 |         return self.optim.step(closure)
 37 | 
 38 | 
 39 | class LARS(MetaOptimizer):
 40 |     """
 41 |     :class:`LARS` [LARC in apex] is a pytorch implementation of both the scaling and clipping variants of LARS,
 42 |     in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive
 43 |     local learning rate for each individual parameter. The algorithm is designed to improve
 44 |     convergence of large batch training.
 45 | 
 46 |     See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate.
 47 | 
 48 |     In practice it modifies the gradients of parameters as a proxy for modifying the learning rate
 49 |     of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer.
 50 | 
 51 |     ```
 52 |     model = ...
 53 |     optim = torch.optim.Adam(model.parameters(), lr=...)
 54 |     optim = LARS(optim)
 55 |     ```
 56 | 
 57 |     Args:
 58 |         optimizer: Pytorch optimizer to wrap and modify learning rate for.
 59 |         trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888
 60 |         clip: Decides between clipping or scaling mode of LARC [LARS + clip].
 61 |               If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter.
 62 |               If `clip=False` the learning rate is set to `local_lr*optimizer_lr`.
 63 |         eps: epsilon kludge to help with numerical stability while calculating adaptive_lr
 64 |     """
 65 | 
 66 |     def __init__(self, optimizer, trust_coefficient=0.02, clip=False, eps=1e-8):
 67 |         self.param_groups = optimizer.param_groups
 68 |         self.optim = optimizer
 69 |         self.trust_coefficient = trust_coefficient
 70 |         self.eps = eps
 71 |         self.clip = clip
 72 | 
 73 |     def step(self, closure=None):
 74 |         loss = None
 75 |         with torch.no_grad():
 76 |             weight_decays = []
 77 |             for group in self.optim.param_groups:
 78 |                 # absorb weight decay control from optimizer
 79 |                 weight_decay = group["weight_decay"] if "weight_decay" in group else 0
 80 |                 weight_decays.append(weight_decay)
 81 |                 group["weight_decay"] = 0
 82 |                 for p in group["params"]:
 83 |                     if p.grad is None:
 84 |                         continue
 85 |                     param_norm = torch.norm(p.data)
 86 |                     grad_norm = torch.norm(p.grad.data)
 87 | 
 88 |                     if param_norm != 0 and grad_norm != 0:
 89 |                         # calculate adaptive lr + weight decay
 90 |                         adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps)
 91 | 
 92 |                         # clip learning rate for LARC
 93 |                         if self.clip:
 94 |                             # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
 95 |                             adaptive_lr = min(adaptive_lr / group["lr"], 1)
 96 | 
 97 |                         p.grad.data += weight_decay * p.data
 98 |                         p.grad.data *= adaptive_lr
 99 | 
100 |         loss = self.optim.step(closure)
101 |         # return weight decay control to optimizer
102 |         for i, group in enumerate(self.optim.param_groups):
103 |             group["weight_decay"] = weight_decays[i]
104 | 
105 |         return loss
106 | 
107 | 
108 | """This the SAM pytorch implementation from https://github.com/davda54/sam
109 | with a minor modification """
110 | 
111 | """
112 | MIT License
113 | Copyright (c) 2021 David Samuel
114 | Permission is hereby granted, free of charge, to any person obtaining a copy
115 | of this software and associated documentation files (the "Software"), to deal
116 | in the Software without restriction, including without limitation the rights
117 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
118 | copies of the Software, and to permit persons to whom the Software is
119 | furnished to do so, subject to the following conditions:
120 | The above copyright notice and this permission notice shall be included in all
121 | copies or substantial portions of the Software.
122 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
123 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
124 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
125 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
126 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
127 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
128 | SOFTWARE.
129 | """
130 | 
131 | 
class SAM(MetaOptimizer):
    """Sharpness-aware minimization wrapper around an existing optimizer instance.

    A step evaluates the closure twice: once at the current weights to find the ascent
    direction, and once at the perturbed weights to obtain the gradient that the wrapped
    optimizer finally applies.
    """

    def __init__(self, base_optimizer_instance, rho=0.05):
        """Wrap ``base_optimizer_instance``; ``rho`` scales the ascent step and must be non-negative."""
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"
        self.rho = rho

        self.optim = base_optimizer_instance
        self.param_groups = base_optimizer_instance.param_groups

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Move every parameter with a gradient by ``rho * grad / ||grad||`` and remember the offset."""
        scale = self.rho / (self._grad_norm() + 1e-12)

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    perturbation = p.grad * scale.to(p)
                    p.add_(perturbation)  # climb to the local maximum "w + e(w)"
                    self.state[p]["e_w"] = perturbation

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Undo the stored perturbation, then let the wrapped optimizer apply the current gradients."""
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    p.sub_(self.state[p]["e_w"])  # get back to "w" from "w + e(w)"

        self.optim.step()  # do the actual "sharpness-aware" update

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def step(self, closure=None):
        """Run both SAM phases; returns the value of the second closure call."""
        assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided"
        # the closure should do a full forward-backward pass, so gradients are re-enabled for it
        grad_closure = torch.enable_grad()(closure)

        grad_closure()
        self.first_step(zero_grad=True)
        loss = grad_closure()
        self.second_step()
        return loss

    def _grad_norm(self):
        """Return the 2-norm over the per-parameter gradient 2-norms."""
        # put everything on the same device, in case of model parallelism
        shared_device = self.param_groups[0]["params"][0].device
        per_param_norms = []
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    per_param_norms.append(p.grad.norm(p=2).to(shared_device))
        return torch.norm(torch.stack(per_param_norms), p=2)
188 | 


--------------------------------------------------------------------------------
/cramming/backend/optimizers/progressive_batching.py:
--------------------------------------------------------------------------------
  1 | """Implementation of a progressive batching meta optimizer.
  2 | The optimizer may defer an optimization step until gradient variance is small enough
  3 | """
  4 | 
  5 | import torch
  6 | 
  7 | from collections import defaultdict
  8 | from .optimizer_modifiers import MetaOptimizer
  9 | 
 10 | 
 11 | import logging
 12 | 
 13 | log = logging.getLogger(__name__)
 14 | DEBUG = False
 15 | 
 16 | 
class ProgressiveBatching(MetaOptimizer):
    """Meta optimizer that may defer the wrapped optimizer's step until a gradient test passes.

    Every call to ``step`` folds the current ``p.grad`` of all parameters into a running
    mean and a running sum of squared deviations. The wrapped optimizer is only stepped,
    using the running mean as gradient, once the configured test accepts or once more than
    ``max_sample_guard`` calls have been accumulated.
    """

    def __init__(self, optimizer, progress_rule="norm-based", theta=0.9, monotone=False, min_sample_guard=2, max_sample_guard=128):
        """Wrap ``optimizer``.

        Args:
            optimizer: The optimizer instance to wrap.
            progress_rule: One of "norm-based", "inner-product", "cov" or "cosine".
            theta: Threshold used by the selected test.
            monotone: If True, ``min_sample_guard`` is overwritten with the number of
                accumulated calls each time a real step is taken.
            min_sample_guard: No test is run while fewer calls than this have been accumulated.
            max_sample_guard: A real step is forced once more calls than this have been accumulated.
        """
        super().__init__(optimizer)

        self.progress_rule = progress_rule
        self.theta = theta
        self.monotone = monotone

        self.min_sample_guard = min_sample_guard
        self.max_sample_guard = max_sample_guard

        # Per-parameter statistics, kept separately from the wrapped optimizer's own state.
        self.progress_state = defaultdict(dict)
        self.accumulated_steps = 0
        self.reset_sample_statistics()

    @torch.no_grad()
    def step(self):
        """(Maybe) performs a single optimization step.

        Raises:
            ValueError: if ``progress_rule`` is not one of the supported names.
        """
        self.update_sample_statistics()
        if self.accumulated_steps < self.min_sample_guard:
            rule_check = False
        else:
            if self.accumulated_steps > self.max_sample_guard:
                rule_check = True
            else:
                if self.progress_rule == "norm-based":
                    rule_check = self.norm_test()
                elif self.progress_rule == "inner-product":
                    rule_check = self.inner_product_test()
                elif self.progress_rule == "cov":
                    rule_check = self.coefficient_of_variation()
                elif self.progress_rule == "cosine":
                    rule_check = self.cosine_test()
                else:
                    raise ValueError(f"Invalid progress rules {self.progress_rule} given.")

        if rule_check:
            self.copy_mean_grad()  # reference running mean in p.grad attributes
            if self.monotone:
                self.min_sample_guard = self.accumulated_steps  # raise lower limit if forcing monotone batch sizes
            self.reset_sample_statistics()  # reset running mean
            super().step()
        else:
            # otherwise defer the step and accumulate more gradients
            pass

    def inner_product_test(self):
        """Inner product similar to description in Bollapragada,Byrd,Nocedal, "Adaptive Sampling Strategies for Stochastic Optimization".

        This is only a zero-memory inner product test.
        """

        global_inner_product, global_variance = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                # Remove the current gradient's contribution from the running mean, i.e. form
                # the mean over the other (n - 1) accumulated gradients.
                ndivn1 = self.accumulated_steps / (self.accumulated_steps - 1)
                corrected_mean = (state["running_mean"] - p.grad / self.accumulated_steps) * ndivn1
                global_inner_product += (p.grad * corrected_mean).sum()
                # Despite its name this accumulates the squared norm of the corrected mean.
                global_variance += corrected_mean.pow(2).sum()
        final_v = (global_inner_product - global_variance).pow(2)

        if DEBUG:
            inequality_repr = f"{final_v / (self.accumulated_steps - 1):10.2f} < {self.theta * global_variance**2:10.2f}"
            log.info(f"{self.accumulated_steps} - {inequality_repr}")

        return final_v / (self.accumulated_steps - 1) < self.theta * global_variance**2

    def norm_test(self):
        """Norm test (original note: "Sohams version").

        Accepts when the Bessel-corrected sample variance, divided by the number of
        accumulated calls, is below ``theta`` times the squared norm of the running mean.
        """

        sample_var, mean_norm = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                sample_var += state["running_variance"].sum() / (self.accumulated_steps - 1)  # bessel-corrected variance
                mean_norm += state["running_mean"].pow(2).sum()

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {sample_var / self.accumulated_steps:10.2f} < {self.theta * mean_norm:10.2f}")

        return sample_var / self.accumulated_steps < self.theta * mean_norm  # divide by |B| as in bigbatch, original version is theta=1

    def cosine_test(self):
        """Experimental.

        Accepts when the cosine between the current gradient and the mean of the other
        accumulated gradients, averaged over parameter tensors, exceeds ``theta``.
        """

        total_angles, num_params = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                # Mean over the other (n - 1) accumulated gradients, as in inner_product_test.
                ndivn1 = self.accumulated_steps / (self.accumulated_steps - 1)
                corrected_mean = (state["running_mean"] - p.grad / self.accumulated_steps) * ndivn1
                total_angles += (p.grad * corrected_mean).sum() / corrected_mean.norm() / p.grad.norm()
                num_params += 1

        average_angle = total_angles / num_params  # rather the average cosine, this not (yet) the angle

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {average_angle:10.2f} > {self.theta:10.2f}")

        return average_angle > self.theta

    def coefficient_of_variation(self):
        """unbiased cov test.

        Accepts when the averaged, bias-corrected ratio computed below is smaller than
        ``theta * 100``.
        """
        cov, mean_norm, num_params = 0, 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                # Sample standard deviation divided by the squared norm of the mean; the
                # 1e-6 term keeps the denominator away from zero.
                cov += (state["running_variance"].sum() / (self.accumulated_steps - 1)).sqrt() / (state["running_mean"].pow(2).sum() + 1e-6)
                # mean_norm is accumulated but not used in the returned comparison.
                mean_norm += state["running_mean"].pow(2).sum()
                num_params += 1

        unbiased_avg_cov = (1 + 1 / (4 * self.accumulated_steps)) * cov / num_params / self.accumulated_steps

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {unbiased_avg_cov:10.2f} < {self.theta * 100:10.2f}")

        return unbiased_avg_cov < self.theta * 100

    def update_sample_statistics(self):
        """Update sample statistics based on welford accumulation. At any step variance can be finalized via running_variance / count

        Reads ``p.grad`` of every parameter without a ``None`` check.
        """
        self.accumulated_steps += 1
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                # Welford update: the deviation is taken once before and once after the
                # mean is moved, and their product is added to the running sum.
                current_delta = p.grad - state["running_mean"]
                state["running_mean"] += current_delta / self.accumulated_steps
                corrected_delta = p.grad - state["running_mean"]
                state["running_variance"] += current_delta * corrected_delta

    def reset_sample_statistics(self):
        """Allocate new tensors, old references are still required for the optimizer step."""
        # Stores the previous accumulation count plus one before the counter is cleared.
        self.last_full_step_accumulation = self.accumulated_steps + 1
        self.accumulated_steps = 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                state["running_mean"] = torch.zeros_like(p, memory_format=torch.preserve_format)
                state["running_variance"] = torch.zeros_like(p, memory_format=torch.preserve_format)

    def copy_mean_grad(self):
        """Point every ``p.grad`` at that parameter's running-mean tensor (no copy is made)."""
        for group in self.param_groups:
            for p in group["params"]:
                p.grad = self.progress_state[p]["running_mean"]
161 | 


--------------------------------------------------------------------------------
/cramming/backend/prepare_backend.py:
--------------------------------------------------------------------------------
 1 | """Instantiate backend objects in a congruent format."""
 2 | import torch
 3 | 
 4 | from .torch_default import initialize_torch
 5 | 
 6 | _default_setup = dict(device=torch.device("cpu"), dtype=torch.float)
 7 | 
 8 | 
 9 | def load_backend(model, tokenizer, cfg_train, cfg_impl, setup=_default_setup, init_compile_and_distribute=True):
10 |     if cfg_impl.name == "torch-default":
11 |         return initialize_torch(model, tokenizer, cfg_train, cfg_impl, setup=setup, init_compile_and_distribute=init_compile_and_distribute)
12 |     else:
13 |         raise ValueError(f"Invalid backend {cfg_impl.name} given.")
14 | 


--------------------------------------------------------------------------------
/cramming/backend/utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import torch
  4 | 
  5 | import logging
  6 | 
  7 | from safetensors.torch import load_file, save_file
  8 | import cramming
  9 | 
 10 | log = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | """Utilities common to several backends."""
 14 | def group_parameters(model, cfg_train):
 15 |     model_parameters = list(model.named_parameters())
 16 |     if len(cfg_train.limited_decay_keys) > 0:
 17 |         grouped_parameters = optimizer_grouped_parameters = [
 18 |             {
 19 |                 "params": [p for n, p in model_parameters if not any(nd in n for nd in cfg_train.limited_decay_keys)],
 20 |                 "weight_decay": cfg_train.optim.weight_decay,
 21 |             },
 22 |             {
 23 |                 "params": [p for n, p in model_parameters if any(nd in n for nd in cfg_train.limited_decay_keys)],
 24 |                 "weight_decay": 0.0,
 25 |             },
 26 |         ]
 27 |     else:
 28 |         grouped_parameters = [p for n, p in model_parameters]
 29 |     return grouped_parameters
 30 | 
 31 | 
 32 | def get_model_engine_tokenizer_dataloaders(cfg, setup, train_eval: bool = True):
 33 |     """This function gets the model, model engine (if needed), tokenizer, and data"""
 34 |     if train_eval:
 35 |         train_eval_cfg = cfg.train
 36 |     else:
 37 |         train_eval_cfg = cfg.eval
 38 | 
 39 |     tokenizer_model = None
 40 |     cfg_arch = cfg.arch  # if not loading from checkpoint, need architecture config
 41 |     checkpoint_path = None
 42 |     try:
 43 |         # attempt to load latest in case of preemption
 44 |         prev_checkpoint_path = os.path.join(cfg.model_dir, cfg.name, "checkpoints")
 45 |         tokenizer_model, cfg_arch, checkpoint_path = cramming.utils.find_pretrained_checkpoint(
 46 |             "latest",
 47 |             local_checkpoint_folder=str(prev_checkpoint_path),
 48 |             arch_modifications=train_eval_cfg.arch_modifications
 49 |         )
 50 |         log.info(f"Getting latest checkpoint at {prev_checkpoint_path}")
 51 | 
 52 |     except:
 53 |         # no previous checkpoint saved.  Checking separate model directory
 54 |         if train_eval_cfg.checkpoint is not None:
 55 |             try:
 56 |                 tokenizer_model, cfg_arch, checkpoint_path = cramming.utils.find_pretrained_checkpoint(
 57 |                     train_eval_cfg.checkpoint,
 58 |                     local_checkpoint_folder=cfg.model_dir,
 59 |                     arch_modifications=train_eval_cfg.arch_modifications
 60 |                 )
 61 |                 log.info(f"Found checkpoint at {cfg.model_dir} or {train_eval_cfg.checkpoint}")
 62 |                 # importantly, if checkpoint is found, we will use that model arch, modifications doesnt seem to work.
 63 |             except Exception as e:
 64 |                 log.info(f"Unable to load checkpoint {train_eval_cfg.checkpoint} or in directory {cfg.model_dir}."
 65 |                          f"  Initializing model from scratch!")
 66 | 
 67 |     log.info(f"Loading Data")
 68 |     datasets, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.data_dir)
 69 | 
 70 |     real_dataset_sample_length = len(datasets['train'][0]['input_ids']) # for arithmetic datasets
 71 | 
 72 |     if tokenizer_model is not None:
 73 |         # todo consider if we even need to return the tokenizer with the checkpoint (only HF?)
 74 |         tokenizer = tokenizer_model
 75 |     dataloaders = cramming.prepare_dataloaders(datasets, tokenizer, train_eval_cfg, cfg.impl)
 76 | 
 77 |     log.info(f"Constructing Model")
 78 |     model = cramming.construct_model(cfg_arch, tokenizer)
 79 | 
 80 |     metadata = {}
 81 | 
 82 |     if train_eval:
 83 |         # if in train mode, need engine
 84 |         fully_init_model_to_begin = False if checkpoint_path is not None else True
 85 |         model_engine = cramming.load_backend(
 86 |             model,
 87 |             tokenizer,
 88 |             cfg.train,
 89 |             cfg.impl,
 90 |             setup=setup,
 91 |             init_compile_and_distribute=fully_init_model_to_begin, # false if we are planning to load a checkpoint in later
 92 |         )
 93 | 
 94 |         if checkpoint_path is not None:
 95 |             # load checkpoint, engine handles loaded model
 96 |             metadata = model_engine.load_checkpoint(cfg_arch, checkpoint_path)
 97 |             for k, v in dataloaders.items():
 98 |                 try:
 99 |                     # for dataloaders with epochs (RuntimeInfiniteDataLoader) set that epoch to start here
100 |                     v.set_epoch(metadata.get("epoch", 0))
101 |                 except:
102 |                     pass
103 | 
104 |         model_engine.train(train_eval_cfg.pretrain_in_train_mode)
105 |         model_engine.current_seq_length = real_dataset_sample_length # setting the number of tokens seen correctly for arithmetic data
106 |     else:
107 |         if checkpoint_path is not None:
108 |             model = load_model_checkpoint(model, checkpoint_path)
109 |         model_engine = None
110 |     return model, model_engine, tokenizer, dataloaders, metadata
111 | 
112 | 
def load_model_checkpoint(model, model_dir, forward_only_model_with_skip=False):
    """Load model weights from ``model_dir`` into ``model`` and return the model.

    ``<model_dir>/model.safetensors`` is tried first; if reading it fails, the
    ``"model_state"`` entry of ``<model_dir>/state_dict.pth`` is used instead.

    Args:
        model: module whose ``load_state_dict`` is called with ``strict=True``.
        model_dir: directory containing one of the two checkpoint files.
        forward_only_model_with_skip: when True, ``"._orig_mod"`` is removed from
            every key that contains ``"_orig_mod"``.

    Returns:
        The same ``model`` object with the weights loaded.

    Raises:
        ValueError: if the ``.pth`` fallback holds no ``"model_state"`` entry.
        SystemExit: (status 1) if ``load_state_dict`` raises ``RuntimeError``; the
            mismatch is logged first.
    """
    try:
        model_file = os.path.join(model_dir, "model.safetensors")
        model_state = load_file(model_file)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit are not swallowed.
        model_file = os.path.join(model_dir, "state_dict.pth")
        # NOTE(review): torch.load unpickles the file; only use with trusted checkpoints.
        loaded = torch.load(model_file)
        model_state = loaded.get("model_state", None)

    if model_state is None:
        raise ValueError(f"No model found in directory {model_dir} (in '/state_dict.pth' or '/model.safetensors')")
    log.info(f"Loading Model from {model_file}")

    if "encoder.embedding.word_embedding.weight" not in model_state:
        # Hack to save space when saving the model, more clever though would be save the right one in the first place
        model_state["encoder.embedding.word_embedding.weight"] = model_state["decoder.weight"]

    module_prefix = "module."
    sanitized_state = {}
    try:
        for key, value in model_state.items():
            if key.startswith(module_prefix):
                key = key[len(module_prefix):]
            if forward_only_model_with_skip and "_orig_mod" in key:
                # we load in original model to here so we can drop this
                key = key.replace("._orig_mod", "")
            sanitized_state[key] = value

        model.load_state_dict(sanitized_state, strict=True)
        log.info("finished loading state dict")
    except RuntimeError as e:
        # Show only the part after the standard header when it is present; otherwise
        # keep the full message instead of failing with an IndexError.
        _, marker, detail = str(e).partition("Error(s) in loading state_dict for")
        difference = detail if marker else str(e)
        log.info(f"State dict difference is {difference}... Ok?")
        # Non-zero status: a failed load must not look like a successful run.
        raise SystemExit(1) from e

    return model
149 | 


--------------------------------------------------------------------------------
/cramming/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/__init__.py


--------------------------------------------------------------------------------
/cramming/config/arch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/arch/__init__.py


--------------------------------------------------------------------------------
/cramming/config/arch/albert.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable decoder-based LM
 2 | # This is set up to be as close to ALBERT-large (Lan et al.) as reasonable for a decoder-based model
 3 | 
 4 | model_type: ScriptableCrammedDepthRecurrent
 5 | 
 6 | layers_in_recurrent_block: 1
 7 | maximal_recurrence: 24
 8 | max_backprop: # use half of maximal_recurrence if not given, minimal is 1 # only valid for TBPTT
 9 | maximal_recurrence_in_eval: 24
10 | 
11 | hidden_size: 1024
12 | intermed_size: 4096
13 | input_injection_type: none
14 | initial_hidden_randomized: False
15 | state_init:
16 | 
17 | norm: LayerNorm
18 | norm_eps: 1e-12
19 | norm_scheme: post # can be "pre", "post"
20 | nonlin: GELU
21 | sub_normalization: False
22 | 
23 | tie_weights: True # Tie input/output embedding
24 | decoder_bias: True # Whether to include a bias in the decoding step
25 | use_bias: True # Whether to learn biases on all dense layers
26 | final_norm: False # Add a final norm layer before the end
27 | head: identity
28 | 
29 | objective_layout: fixed
30 | 
31 | embedding:
32 |   vocab_size: # will be populated automatically
33 |   pos_embedding: learned
34 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
35 |   embedding_dim: 128
36 |   normalization: True
37 |   stable_low_precision: False
38 | 
39 | attention:
40 |   type: pytorch # also works with "pytorch"
41 |   num_attention_heads: 16 # for flash
42 |   skip_output_projection: False
43 |   qkv_bias: True
44 |   bias_in_proj: True
45 | 
46 |   rotary_embedding: False
47 |   seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn)
48 |   sequence_op: torch-softmax # Can be normalization
49 |   sub_normalization: False # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)
50 | 
51 | init:
52 |   type: normal
53 |   std: 0.02
54 | 
55 | throttle: False # only active during TBPTT
56 | local_compilation: True # Try to compile the static block, no matter what the global compile setting is set to
57 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-depthrecurrent.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable decoder-based LM
 2 | # This inherits architecture changes from the crammed-bert project
 3 | 
 4 | model_type: ScriptableCrammedDepthRecurrent
 5 | 
 6 | layers_in_recurrent_block: 4
 7 | maximal_recurrence: 4
 8 | max_backprop: # use half of maximal_recurrence if not given, minimal is 1
 9 | maximal_recurrence_in_eval: ${arch.maximal_recurrence} # could be set to think longer
10 | 
11 | hidden_size: 768
12 | intermed_size: 3072
13 | input_injection_type: add
14 | initial_hidden_randomized: True
15 | state_init: embed # initialized random like embedding
16 | 
17 | 
18 | norm: LayerNorm
19 | norm_eps: 1e-12
20 | norm_scheme: post # can be "pre", "post"
21 | 
22 | nonlin: GELUglu
23 | sub_normalization: False # Sub-normalization in attn and ffn blocks
24 | 
25 | tie_weights: False # Tie input/output embedding
26 | decoder_bias: False # Whether to include a bias in the decoding step
27 | use_bias: False # Whether to learn biases on all dense layers
28 | final_norm: True # Add a final norm layer before the end
29 | head: ffn
30 | 
31 | objective_layout: TBPTT
32 | 
33 | embedding:
34 |   vocab_size: # will be populated automatically
35 |   pos_embedding: learned
36 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
37 |   embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
38 |   normalization: True
39 |   stable_low_precision: False
40 |   max_abacus_len: 100
41 | 
42 | attention:
43 |   type: pytorch # also works with "pytorch"
44 |   num_attention_heads: 16 # for flash
45 |   skip_output_projection: False
46 |   qkv_bias: False
47 |   bias_in_proj: False
48 |   max_length: 0 # for randomised PE's (NOT IMPLEMENTED FOR ALL)
49 | 
50 |   rotary_embedding: False
51 |   seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn)
52 |   sequence_op: torch-softmax # Can be normalization
53 |   sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)
54 | 
55 | init:
56 |   type: deepnorm-straight
57 |   std: 0.02 # only used if type=normal
58 | 
59 | throttle: False # only active during TBPTT
60 | alpha: 1.0 # only active during TBPTT
61 | mask_before_equals: False
62 | local_compilation: True # Try to compile the static block, no matter what the global compile setting is set to
63 | loss_reduction: mean
64 | forward_only_model_with_skip: False # forward only model with skip


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-fakeRNN.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline
 2 | # Modernized version of bert-c5
 3 | 
 4 | # These are the huggingface bert parameters
 5 | model_type: ScriptableFakeRNN
 6 | 
 7 | n_blocks: 5
 8 | state_size: 512
 9 | hidden_size: 512
10 | bottle_size: 256
11 | block_type: resnet
12 | 
13 | tie_weights: True # Tie input/output embedding
14 | decoder_bias: False # Whether to include a bias in the decoding step
15 | 
16 | loss: cross-entropy
17 | objective_layout: autoregressive
18 | 
19 | embedding:
20 |   vocab_size: # will be populated automatically
21 |   pos_embedding: None
22 |   dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT
23 |   pad_token_id: 0
24 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
25 |   embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
26 |   normalization: False
27 |   stable_low_precision: False
28 | 
29 | init:
30 |   type: normal
31 |   std: 0.02
32 | 
33 | # Set dynamically:
34 | eos_token_id:
35 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-janus.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable janus-type RNN, right now with all tested bells-and-whistles
 2 | 
 3 | # These are the huggingface bert parameters
 4 | model_type: ScriptableCrammedJanus
 5 | 
 6 | num_transformer_layers: 8
 7 | state_dim: 1024
 8 | 
 9 | norm_scheme: shaped
10 | norm: LayerNorm
11 | norm_eps: 1e-12
12 | 
13 | nonlin: GELUglu
14 | sub_normalization: False # Sub-normalization in attn and ffn blocks
15 | 
16 | tie_weights: True # Tie input/output embedding
17 | decoder_bias: False # Whether to include a bias in the decoding step
18 | use_bias: True # Whether to learn biases on all dense layers
19 | final_norm: True # crashes without this improvement to stability
20 | force_normalized_state: False # last normalization learnable?
21 | 
22 | loss: cross-entropy
23 | objective_layout: autoregressive # nothing else implemented so far
24 | 
25 | ffn_block:
26 |   structure: joined-injection # state-branch-embedding-injection
27 | 
28 |   intermed_multiplier: 4
29 |   hidden_dropout_prob: 0.0
30 | 
31 |   num_chunks_in_sequence: 16 # only necessary if head.structure=chunked
32 | 
33 | head:
34 |   structure: ffn # dense-nonlin-norm
35 |   nonlin: GELU
36 |   norm: LayerNorm
37 |   norm_eps: 1e-12
38 |   use_bias: True
39 |   include_attn_in_chunked_heads: False # only valid for chunked heads
40 |   num_chunked_heads: 4 # only valid for chunked heads
41 |   intermed_multiplier: 4
42 | 
43 | objective:
44 |   historian_weight: 1.0
45 |   predictor_weight: 1.0
46 |   present_historian_weight: 1.0
47 |   present_predictor_weight: 1.0
48 |   rscale_correction: False
49 | 
50 |   antiquarian_weight: 0.0 #
51 |   antiquarian_range: ${data.seq_length} # maximal range a previous state may be looked up with # set to -1 to encompass all previous states
52 |   historian_loss_fn: MSE # can also be cosine
53 | 
54 | embedding:
55 |   vocab_size: # will be populated automatically
56 |   pos_embedding:
57 |   embedding_dim: 512
58 |   normalization: True
59 |   stable_low_precision: False
60 |   max_seq_length: ${data.seq_length} # legacy position, do not use
61 | 
62 | 
63 | max_seq_length: ${data.seq_length} # max seq length during training (not always used)
64 | position_information: learned # none learned or simple
65 | 
66 | init:
67 |   type: megatron
68 |   std: 0.02 # only used if type=normal
69 | 
70 | # Experimental options:
71 | state_corruption: 0.0
72 | state_init: unit
73 | eos_state_reset: True
74 | 
75 | # Set dynamically:
76 | eos_token_id:
77 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-rnn.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline
 2 | # Modernized version of bert-c5
 3 | 
 4 | # These are the huggingface bert parameters
 5 | model_type: ScriptableCrammedRNN
 6 | 
 7 | # PyTorch LSTM settings:
 8 | input_size: 512
 9 | hidden_size: 512
10 | num_layers: 2
11 | bias: True
12 | seq_first: True
13 | dropout: 0.1
14 | bidirectional: False
15 | proj_size: 0
16 | 
17 | norm: LayerNorm
18 | norm_eps: 1e-12
19 | final_norm: True # Add a final norm layer before the end
20 | skip_head_transform: True # This is only possible if embedding_dim=hidden_size
21 | use_bias: False # Whether to learn biases on all dense layers
22 | 
23 | tie_weights: True # Tie input/output embedding
24 | decoder_bias: False # Whether to include a bias in the decoding step
25 | 
26 | loss: cross-entropy
27 | objective_layout: autoregressive
28 | 
29 | embedding:
30 |   vocab_size: # will be populated automatically
31 |   pos_embedding: scaled-sinusoidal
32 |   dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT
33 |   pad_token_id: 0
34 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
35 |   embedding_dim: ${arch.input_size} # has to be this value for crammedBERT
36 |   normalization: True
37 |   stable_low_precision: False
38 | 
39 | # Set dynamically:
40 | eos_token_id:
41 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-stack-janus.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable janus-type RNN, right now with all tested bells-and-whistles
 2 | 
 3 | # These are the huggingface bert parameters
 4 | model_type: ScriptableCrammedJanus
 5 | 
 6 | num_transformer_layers: 8
 7 | state_dim: 3584
 8 | 
 9 | norm_scheme: shaped
10 | norm: LayerNorm
11 | norm_eps: 1e-12
12 | 
13 | nonlin: GELUglu
14 | sub_normalization: False # Sub-normalization in attn and ffn blocks
15 | 
16 | tie_weights: True # Tie input/output embedding
17 | decoder_bias: False # Whether to include a bias in the decoding step
18 | use_bias: True # Whether to learn biases on all dense layers
19 | final_norm: True # crashes without this improvement to stability
20 | force_normalized_state: True # last normalization learnable?
21 | 
22 | loss: cross-entropy
23 | objective_layout: autoregressive # nothing else implemented so far
24 | 
25 | ffn_block:
26 |   structure: stack-sideways-transformer
27 |   intermed_multiplier: 4
28 |   hidden_dropout_prob: 0.0
29 | 
30 |   # settings only relevant for structure=state-attention:
31 |   qkv_bias: True
32 |   proj_bias: True
33 |   num_chunks_in_sequence: 16
34 |   num_read_write_heads: 8
35 |   run_causal_heads: False
36 |   positional_info: True
37 |   garbage_collect_state: False
38 |   num_blocks_to_accumulate: 0 # Can be any number of embedding chunks that will be added to state, this is N^2 attention again :>
39 |   gradient_checkpointing: False
40 |   workspace: ${arch.ffn_block.num_chunks_in_sequence} # only used if block in structure, can be smaller than num_chunks_in_sequence
41 | 
42 | head:
43 |   structure: chunked # dense-nonlin-norm
44 |   nonlin: GELU
45 |   norm: LayerNorm
46 |   norm_eps: 1e-12
47 |   use_bias: True
48 |   include_attn_in_chunked_heads: True # only valid for chunked heads
49 |   num_chunked_heads: 4 # only valid for chunked heads
50 |   intermed_multiplier: 4
51 | 
52 | objective:
53 |   historian_weight: 1.0
54 |   predictor_weight: 1.0
55 |   present_historian_weight: 1.0
56 |   present_predictor_weight: 1.0
57 |   rscale_correction: False
58 | 
59 |   antiquarian_weight: 0.0 #
60 |   antiquarian_range: ${data.seq_length} # maximal range a previous state may be looked up with # set to -1 to encompass all previous states
61 |   historian_loss_fn: MSE
62 | 
63 | embedding:
64 |   vocab_size: # will be populated automatically
65 |   pos_embedding:
66 |   embedding_dim: 512
67 |   normalization: True
68 |   stable_low_precision: False
69 |   max_seq_length: ${data.seq_length} # legacy position, do not use
70 | 
71 | 
72 | max_seq_length: ${data.seq_length} # max seq length during training (not always used)
73 | position_information: learned # none learned or simple
74 | 
75 | init:
76 |   type: deepnorm-straight
77 |   std: 0.02
78 | 
79 | # Experimental options:
80 | state_corruption: 0.0
81 | eos_state_reset: True
82 | state_init: unit
83 | 
84 | # Set dynamically:
85 | eos_token_id:
86 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-tiny.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable decoder-based LM
 2 | # This is the tiny setting, modified from bert-tiny with larger hidden and lower number of heads
 3 | 
 4 | model_type: ScriptableCrammedTransformer
 5 | 
 6 | num_transformer_layers: 4
 7 | hidden_size: 256
 8 | intermed_size: 1024
 9 | 
10 | norm: LayerNorm
11 | norm_eps: 1e-12
12 | norm_scheme: pre # can be "pre", "post", "sandwich"
13 | nonlin: GELUglu
14 | 
15 | tie_weights: True # Tie input/output embedding
16 | decoder_bias: False # Whether to include a bias in the decoding step
17 | use_bias: False # Whether to learn biases on all dense layers
18 | final_norm: True # Add a final norm layer before the end
19 | sub_normalization: False # Sub-normalization in attn and ffn blocks
20 | 
21 | loss: cross-entropy
22 | 
23 | embedding:
24 |   vocab_size: # will be populated automatically
25 |   pos_embedding: scaled-sinusoidal
26 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
27 |   embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
28 |   normalization: True
29 |   stable_low_precision: False
30 | 
31 | attention:
32 |   type: pytorch # also works with "pytorch"
33 |   num_attention_heads: 8
34 |   skip_output_projection: False
35 |   qkv_bias: False
36 |   bias_in_proj: False
37 | 
38 |   rotary_embedding: False
39 |   seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn)
40 |   sequence_op: torch-softmax # Can be normalization
41 |   sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)
42 | 
43 | init:
44 |   type: normal
45 |   std: 0.02
46 | 


--------------------------------------------------------------------------------
/cramming/config/arch/crammed-transformer.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable decoder-based LM
 2 | # This inherits architecture changes from the crammed-bert project
 3 | # How performant is this?
 4 | 
 5 | model_type: ScriptableCrammedTransformer
 6 | 
 7 | num_transformer_layers: 16
 8 | hidden_size: 768
 9 | intermed_size: 3072
10 | 
11 | norm: LayerNorm
12 | norm_eps: 1e-12
13 | norm_scheme: pre # can be "pre", "post"
14 | nonlin: GELUglu
15 | 
16 | tie_weights: True # Tie input/output embedding
17 | decoder_bias: False # Whether to include a bias in the decoding step
18 | use_bias: False # Whether to learn biases on all dense layers
19 | final_norm: True # Add a final norm layer before the end
20 | sub_normalization: False # Sub-normalization in attn and ffn blocks
21 | 
22 | embedding:
23 |   vocab_size: # will be populated automatically
24 |   pos_embedding: scaled-sinusoidal
25 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
26 |   embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
27 |   normalization: True
28 |   stable_low_precision: False
29 | 
30 | attention:
31 |   type: pytorch # also works with "pytorch"
32 |   num_attention_heads: 16 # for flash
33 |   skip_output_projection: False
34 |   qkv_bias: False
35 |   bias_in_proj: False
36 | 
37 |   rotary_embedding: False
38 |   seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn)
39 |   sequence_op: torch-softmax # Can be normalization
40 |   sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)
41 | 
42 | init:
43 |   type: normal
44 |   std: 0.02
45 | 


--------------------------------------------------------------------------------
/cramming/config/arch/gpt2-base.yaml:
--------------------------------------------------------------------------------
 1 | # Instantiates a (non-huggingface) scriptable decoder-based LM
 2 | # This matches the gpt2 settings in the custom implementation
 3 | # (minus dropout which I did not even implement)
 4 | 
 5 | model_type: ScriptableCrammedTransformer
 6 | 
 7 | num_transformer_layers: 12
 8 | hidden_size: 768
 9 | intermed_size: 3072
10 | 
11 | norm: LayerNorm
12 | norm_eps: 1e-05
13 | norm_scheme: post # can be "pre", "post"
14 | nonlin: GELU
15 | 
16 | tie_weights: True # Tie input/output embedding
17 | decoder_bias: False # Whether to include a bias in the decoding step
18 | use_bias: True # Whether to learn biases on all dense layers
19 | final_norm: True # Add a final norm layer before the end
20 | sub_normalization: False
21 | 
22 | embedding:
23 |   vocab_size: # will be populated automatically
24 |   pos_embedding: learned
25 |   max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
26 |   embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
27 |   normalization: True
28 |   stable_low_precision: False
29 | 
30 | attention:
31 |   type: pytorch # also works with "pytorch"
32 |   num_attention_heads: 12
33 |   skip_output_projection: False
34 |   qkv_bias: True
35 |   bias_in_proj: True
36 | 
37 |   rotary_embedding: False
38 |   seq_op_in_fp32: True # whether to always cast the operation over the sequence into fp32 (e.g.. the softmax in normal attn)
39 |   sequence_op: torch-softmax # Can be normalization
40 |   sub_normalization: False
41 | 
42 | init:
43 |   type: normal
44 |   std: 0.02
45 | 


--------------------------------------------------------------------------------
/cramming/config/arch/hf-gpt2.yaml:
--------------------------------------------------------------------------------
 1 | # These are the huggingface bert parameters
 2 | 
 3 | model_type: "gpt2"
 4 | 
 5 | n_ctx: 1024
 6 | n_embd: 768
 7 | n_head: 12
 8 | n_layer: 12
 9 | n_positions: ${data.seq_length} # max seq length that the positional embedding is instantiated for
10 | 
11 | 
12 | activation_function: "gelu_new"
13 | attn_pdrop: 0.1
14 | resid_pdrop: 0.1
15 | embd_pdrop: 0.1
16 | initializer_range: 0.02
17 | layer_norm_epsilon: 1e-05
18 | 
19 | 
20 | 
21 | 
22 | summary_activation: null
23 | summary_first_dropout: 0.1
24 | summary_proj_to_labels: true
25 | summary_type: "cls_index"
26 | summary_use_proj: true
27 | 
28 | bos_token_id: 50256
29 | eos_token_id: 50256
30 | 


--------------------------------------------------------------------------------
/cramming/config/arch/sanitycheck.yaml:
--------------------------------------------------------------------------------
1 | model_type: SanityCheckLM
2 | 
3 | width: 1024 # 8352
4 | 


--------------------------------------------------------------------------------
/cramming/config/cfg_eval.yaml:
--------------------------------------------------------------------------------
 1 | # Configuration defaults
 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams
 3 | defaults:
 4 |   - impl: torch-default
 5 |   - train: common
 6 |   - wandb: default
 7 |   - eval: pythia
 8 |   - data: arithmetic
 9 |   - _self_
10 |   - override hydra/job_logging: custom
11 | 
12 | reverse_inputs: True
13 | pad_zeros: 0
14 | extended_eval: False
15 | greedy: True
16 | temp: 1.0
17 | token_limit: 30 # number of tokens in 'thinking plot'
18 | max_rec: null # to give more or less recurrence at evaluation than during training
19 | 
20 | ## Addition
21 | remove_padding: True # used as our eval data has some padding in it that needs to be removed on the fly
22 | large: True
23 | ood_only: False
24 | up_to_40: False
25 | up_to_50: False
26 | 
27 | checkerboard: null
28 | big_eval_step_1: False
29 | big_eval_step_2: False
30 | big_eval_step_3: False
31 | big_eval_step_4: False
32 | big_eval_step_5: False
33 | big_eval_step_6: False
34 | big_eval_step_7: False
35 | big_eval_step_8: False
36 | big_eval_step_9: False
37 | big_eval_step_10: False
38 | 
39 | # for doing custom splits
40 | max_size_given: null
41 | start_ind_1_given: null
42 | start_ind_2_given: null
43 | 
44 | ## Multiplication
45 | mul: False
46 | 
47 | ## Pos arithmetic
48 | pos_arth: False
49 | pos_arth_ood: False
50 | 
51 | wandb:
52 |   project: generative-eval
53 | 
54 | # Total and central computation budget in hours:
55 | budget: 24
56 | overall_budget: ${budget}
57 | 
58 | base_dir: outputs
59 | model_dir:
60 | 
61 | hydra:
62 |   sweep:
63 |     dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S}
64 |   run:
65 |     dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S}
66 |   job:
67 |     chdir: True
68 | 
69 | seed: # Optional: Set initial seed
70 | 
71 | # A name for this run [will draw the checkpoint from runs with this name
72 | # and use this name for the summary table and outputs folder]
73 | name: default
74 | 
75 | # debug implementation by running every loop just once:
76 | dryrun: False
77 | 


--------------------------------------------------------------------------------
/cramming/config/cfg_pretrain.yaml:
--------------------------------------------------------------------------------
 1 | # Configuration defaults
 2 | # Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams
 3 | defaults:
 4 |   - arch: crammed-depthrecurrent
 5 |   - data: arithmetic
 6 |   - impl: torch-default
 7 |   - wandb: default
 8 |   - train: cramming
 9 |   - _self_
10 |   - override hydra/job_logging: custom
11 | 
12 | base_dir: outputs
13 | model_dir: ${base_dir}
14 | data_dir:
15 | 
16 | hydra:
17 |   sweep:
18 |     dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S}
19 |   run:
20 |     dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S}
21 |   job:
22 |     chdir: True
23 | 
24 | seed: # Optional: Set initial seed
25 | name: default # A name for this run [will be used for the summary table and outputs folder]
26 | 
27 | # Total and central computation budget in hours:
28 | budget: 4
29 | overall_budget: ${budget}
30 | 
31 | # debug implementation by running every loop just once:
32 | dryrun: False
33 | 


--------------------------------------------------------------------------------
/cramming/config/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/data/__init__.py


--------------------------------------------------------------------------------
/cramming/config/data/arithmetic.yaml:
--------------------------------------------------------------------------------
 1 | name: arithmetic
 2 | defaults:
 3 |   - sources:
 4 |       - arithmetic
 5 | 
 6 | 
 7 | 
 8 | # all the below stuff may not be required
 9 | # Preprocessing
10 | normalizer:
11 |   force_lowercase: False
12 |   strip_accents: False
13 |   force_english_keyboard: False
14 | tokenizer: bigcode/starcoder
15 | vocab_size: 49152 # = 3 * 2^14 (previously 32768 = 2^15)
16 | 
17 | # Dataset Formation
18 | seq_length: 512
19 | include_eot_token_in_corpus: True
20 | 
21 | max_entries_in_raw_dataset: 20e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
22 | max_seq_in_tokenized_dataset: 80e6 # Select only this many tokenized sequences.
23 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
24 | 
25 | # Data Cleaning:
26 | remove_trash: False
27 | trash_cutoff: 0.25
28 | deduplicate_entries: False
29 | deduplication_threshold: 75
30 | 
31 | # Data Order:
32 | ordering: randomized # for now
33 | 
34 | # Validation Split
35 | validation_seqs: 4096 # how many sequences to reserve for validation
36 | 


--------------------------------------------------------------------------------
/cramming/config/data/c4-subset-processed.yaml:
--------------------------------------------------------------------------------
 1 | # This would be a slice of C4
 2 | name: c4-subset
 3 | defaults:
 4 |   - sources:
 5 |       - c4
 6 | 
 7 | # Preprocessing
 8 | normalizer:
 9 |   force_lowercase: False
10 |   strip_accents: False
11 |   force_english_keyboard: False
12 | tokenizer: SentencePieceBPE
13 | vocab_size: 131072 # 2^17
14 | 
15 | # Dataset Formation
16 | seq_length: 512
17 | include_eot_token_in_corpus: True
18 | 
19 | max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
20 | max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences.
21 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
22 | 
23 | # Data Cleaning:
24 | remove_trash: False
25 | trash_cutoff: 0.25
26 | deduplicate_entries: False
27 | deduplication_threshold: 75
28 | 
29 | # Data Order:
30 | ordering: randomized # for now
31 | 
32 | # Validation Split
33 | validation_seqs: 4096 # how many sequences to reserve for validation
34 | 


--------------------------------------------------------------------------------
/cramming/config/data/openweb.yaml:
--------------------------------------------------------------------------------
 1 | # The OpenWebText replication (see sources/openwebtext.yaml)
 2 | name: openweb
 3 | defaults:
 4 |   - sources:
 5 |       - openwebtext
 6 | 
 7 | # Preprocessing
 8 | normalizer:
 9 |   force_lowercase: False
10 |   strip_accents: False
11 |   force_english_keyboard: False
12 | tokenizer: BPE
13 | vocab_size: 32768 # 2^15
14 | 
15 | # Dataset Formation
16 | seq_length: 512
17 | include_eot_token_in_corpus: True
18 | 
19 | max_entries_in_raw_dataset: 20e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
20 | max_seq_in_tokenized_dataset: 80e6 # Select only this many tokenized sequences.
21 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
22 | 
23 | # Data Cleaning:
24 | remove_trash: False
25 | trash_cutoff: 0.25
26 | deduplicate_entries: False
27 | deduplication_threshold: 75
28 | 
29 | # Data Order:
30 | ordering: randomized # for now
31 | 
32 | # Validation Split
33 | validation_seqs: 4096 # how many sequences to reserve for validation
34 | 


--------------------------------------------------------------------------------
/cramming/config/data/proofpile.yaml:
--------------------------------------------------------------------------------
 1 | name: proofpile
 2 | defaults:
 3 |   - sources:
 4 |       - proofpiledata
 5 | 
 6 | # Preprocessing
 7 | normalizer:
 8 |   force_lowercase: False
 9 |   strip_accents: False
10 |   force_english_keyboard: False
11 | tokenizer: EleutherAI/llemma_34b
12 | vocab_size: 49152 # = 3 * 2^14 (previously 32768 = 2^15)
13 | 
14 | # Dataset Formation
15 | seq_length: 512
16 | include_eot_token_in_corpus: True
17 | 
18 | max_entries_in_raw_dataset: 10e5 #10e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
19 | max_seq_in_tokenized_dataset: 5e4 #5e5 # Select only this many tokenized sequences.
20 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
21 | 
22 | # Data Cleaning:
23 | remove_trash: False
24 | trash_cutoff: 0.25
25 | deduplicate_entries: False
26 | deduplication_threshold: 75
27 | 
28 | # Data Order:
29 | ordering: randomized # for now
30 | 
31 | # Validation Split
32 | validation_seqs: 4096 # how many sequences to reserve for validation
33 | 


--------------------------------------------------------------------------------
/cramming/config/data/sanity-check-1.yaml:
--------------------------------------------------------------------------------
 1 | # Just a bunch of fake data ...
 2 | name: sanity-check-1
 3 | defaults:
 4 |   - sources:
 5 |       - fake
 6 | 
 7 | #
 8 | # Preprocessing
 9 | normalizer: # This is ignored and the default bert normalizer is used instead
10 |   force_lowercase:
11 |   strip_accents:
12 |   force_english_keyboard:
13 | tokenizer: gpt2
14 | vocab_size: 50257
15 | 
16 | # Dataset Formation
17 | seq_length: 64
18 | include_eot_token_in_corpus:
19 | 
20 | max_entries_in_raw_dataset: 1e12 # Select only this many examples from the dataset
21 | max_seq_in_tokenized_dataset: 1e12 # Select only this many tokenized sequences.
22 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
23 | 
24 | # Data Cleaning:
25 | remove_trash: False
26 | trash_cutoff: 0.3
27 | deduplicate_entries: False
28 | deduplication_threshold: 100
29 | 
30 | # Data Order:
31 | ordering: randomized # could be a curriculum
32 | 
33 | # Validation Split
34 | validation_seqs: 128 # how many sequences to reserve for validation
35 | 


--------------------------------------------------------------------------------
/cramming/config/data/sanity-check-2.yaml:
--------------------------------------------------------------------------------
 1 | # Just a tiny test dataset ...
 2 | name: sanity-check-2
 3 | # https://hydra.cc/docs/patterns/select_multiple_configs_from_config_group/
 4 | defaults:
 5 |   - sources:
 6 |       - ag_news
 7 | 
 8 | # Preprocessing
 9 | normalizer:
10 |   force_lowercase: False
11 |   strip_accents: False
12 |   force_english_keyboard: False
13 | tokenizer: BPE # faster for sanity checks
14 | vocab_size: 32768 # to make sure there are not memory surprises compared to the actual data
15 | 
16 | # Dataset Formation
17 | seq_length: 128
18 | include_eot_token_in_corpus: True
19 | 
20 | max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset
21 | max_seq_in_tokenized_dataset: 1e10 # Select only this many tokenized sequences.
22 | # max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
23 | 
24 | # Data Cleaning:
25 | remove_trash: False
26 | trash_cutoff: 0.3
27 | deduplicate_entries: False
28 | deduplication_threshold: 100
29 | 
30 | # Data Order:
31 | ordering: randomized # could be a curriculum
32 | 
33 | # Validation Split
34 | validation_seqs: 128 # how many sequences to reserve for validation
35 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/ag_news.yaml:
--------------------------------------------------------------------------------
 1 | # For sanity testing
 2 | ag_news:
 3 |   provider: huggingface
 4 |   partition: default
 5 |   split: train
 6 | 
 7 |   streaming: False
 8 | 
 9 |   remove_columns: label
10 |   concatenate_successive_entries: 0
11 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/arithmetic.yaml:
--------------------------------------------------------------------------------
 1 | # Arithmetic data source (see tokenized_dataset_path below)
 2 | arithmetic:
 3 |   provider: arithmetic
 4 |   split:
 5 | 
 6 |   randgen_seed: 0
 7 |   size: 2048
 8 | 
 9 |   tokenized_dataset_path: "arithmetic_data/+_n_3_m_3_examples_100_seed_42/hf_tokenized_dataset"
10 |   tokenizer_type: # for specifying which arithmetic tokenizer we want to use
11 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/bookcorpus.yaml:
--------------------------------------------------------------------------------
 1 | # The bookcorpus dataset, drawn from its huggingface mirror
 2 | bookcorpus:
 3 |   provider: huggingface
 4 |   partition: plain_text
 5 |   split: train
 6 | 
 7 |   streaming: False
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 16
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/c4.yaml:
--------------------------------------------------------------------------------
 1 | # The C4 dataset (en partition), drawn from its huggingface mirror
 2 | c4:
 3 |   provider: huggingface
 4 |   partition: en
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/dash_books.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_book_dash_books:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/fake.yaml:
--------------------------------------------------------------------------------
1 | # Just a bunch of fake data ...
2 | fake:
3 |   provider: fake
4 |   split:
5 | 
6 |   randgen_seed: 0
7 |   size: 2048
8 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/iwslt.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_ted_talks_iwslt:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/local.yaml:
--------------------------------------------------------------------------------
1 | # Just a bunch of fake data ...
2 | local:
3 |   provider: local
4 |   split:
5 | 
6 |   randgen_seed: 0
7 |   size: 2048
8 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/no_code_stackexchange.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_no_code_stackexchange:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/openwebtext.yaml:
--------------------------------------------------------------------------------
 1 | # The open webtext replication, as mirrored on HF
 2 | openwebtext:
 3 |   provider: huggingface
 4 |   partition: plain_text
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/oscar.yaml:
--------------------------------------------------------------------------------
 1 | # The oscar dataset, drawn from its huggingface mirror
 2 | # should be 1.2T in this deduplicated version
 3 | oscar:
 4 |   provider: huggingface
 5 |   partition: unshuffled_deduplicated_en
 6 |   split: train
 7 | 
 8 |   streaming: True
 9 | 
10 |   # source-specific cleaning rules?
11 |   remove_columns:
12 |   concatenate_successive_entries: 0 # cannot concat when streaming
13 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/proofpiledata.yaml:
--------------------------------------------------------------------------------
 1 | # The EleutherAI proof-pile-2 dataset, as mirrored on HF
 2 | EleutherAI/proof-pile-2:
 3 |   provider: huggingface
 4 |   partition: open-web-math #['default', 'arxiv', 'open-web-math', 'algebraic-stack']
 5 |   split: train
 6 | 
 7 |   streaming: False #True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/the_pile.yaml:
--------------------------------------------------------------------------------
 1 | #
 2 | the_pile:
 3 |   provider: local
 4 |   file_type: json
 5 |   files:
 6 |     - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
 7 |     - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
 8 |     - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
 9 |     - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
10 |     - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
11 |     - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
12 |     - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
13 |     - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
14 |     - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
15 |     - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
16 |     - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
17 |     - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
18 |     - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
19 |     - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
20 |     - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
21 |     - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
22 |     - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
23 |     - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
24 |     - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
25 |     - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
26 |     - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
27 |     - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
28 |     - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
29 |     - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
30 |     - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
31 |     - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
32 |     - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
33 |     - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
34 |     - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
35 |     - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
36 |   filter:
37 |     #  pile_set_name:
38 |     # possible pile_set_name values are
39 |     # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
40 |     # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
41 |     # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
42 |     # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
43 |     # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
44 |     # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
45 |     # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
46 |     # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
47 |     # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
48 |     # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
49 |     # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
50 |     # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
51 |     # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
52 |     # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
53 |     # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
54 |     # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
55 |     # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
56 |     # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
57 |     # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
58 |     # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
59 |     # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
60 |     # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
61 |   split: train
62 |   streaming: True
63 | 
64 |   # source-specific cleaning rules?
65 |   remove_columns:
66 |   concatenate_successive_entries: 0
67 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/the_pileCC.yaml:
--------------------------------------------------------------------------------
 1 | #
 2 | the_pileCC:
 3 |   provider: local
 4 |   file_type: json
 5 |   files:
 6 |     - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
 7 |     - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
 8 |     - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
 9 |     - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
10 |     - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
11 |     - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
12 |     - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
13 |     - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
14 |     - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
15 |     - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
16 |     - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
17 |     - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
18 |     - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
19 |     - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
20 |     - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
21 |     - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
22 |     - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
23 |     - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
24 |     - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
25 |     - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
26 |     - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
27 |     - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
28 |     - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
29 |     - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
30 |     - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
31 |     - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
32 |     - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
33 |     - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
34 |     - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
35 |     - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
36 |   filter:
37 |     pile_set_name:
38 |       - Pile-CC
39 |   # possible pile_set_name values are
40 |   # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
41 |   # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
42 |   # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
43 |   # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
44 |   # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
45 |   # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
46 |   # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
47 |   # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
48 |   # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
49 |   # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
50 |   # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
51 |   # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
52 |   # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
53 |   # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
54 |   # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
55 |   # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
56 |   # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
57 |   # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
58 |   # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
59 |   # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
60 |   # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
61 |   # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
62 |   split: train
63 |   streaming: True
64 | 
65 |   # source-specific cleaning rules?
66 |   remove_columns:
67 |   concatenate_successive_entries: 0
68 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/the_pile_dedup.yaml:
--------------------------------------------------------------------------------
 1 | # The EleutherAI/the_pile_deduplicated
 2 | EleutherAI/the_pile_deduplicated:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/the_pile_natural.yaml:
--------------------------------------------------------------------------------
 1 | #
 2 | the_pile_natural:
 3 |   provider: local
 4 |   file_type: json
 5 |   files:
 6 |     - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
 7 |     - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
 8 |     - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
 9 |     - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
10 |     - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
11 |     - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
12 |     - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
13 |     - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
14 |     - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
15 |     - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
16 |     - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
17 |     - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
18 |     - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
19 |     - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
20 |     - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
21 |     - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
22 |     - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
23 |     - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
24 |     - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
25 |     - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
26 |     - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
27 |     - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
28 |     - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
29 |     - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
30 |     - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
31 |     - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
32 |     - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
33 |     - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
34 |     - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
35 |     - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
36 |   filter:
37 |     pile_set_name:
38 |       - Gutenberg
39 |       - Books3
40 |       - Wikipedia (en)
41 |   # possible pile_set_name values are
42 |   # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
43 |   # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
44 |   # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
45 |   # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
46 |   # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
47 |   # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
48 |   # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
49 |   # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
50 |   # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
51 |   # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
52 |   # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
53 |   # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
54 |   # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
55 |   # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
56 |   # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
57 |   # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
58 |   # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
59 |   # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
60 |   # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
61 |   # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
62 |   # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
63 |   # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
64 |   split: train
65 |   streaming: True
66 | 
67 |   # source-specific cleaning rules?
68 |   remove_columns:
69 |   concatenate_successive_entries: 0
70 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/the_pile_stream.yaml:
--------------------------------------------------------------------------------
 1 | # Pile streaming from huggingface with new streaming tech :>
 2 | # should be 1.2T in this deduplicated version
 3 | EleutherAI/the_pile:
 4 |   provider: huggingface
 5 |   partition: unshuffled_deduplicated_en
 6 |   split: train
 7 | 
 8 |   streaming: True
 9 | 
10 |   # source-specific cleaning rules?
11 |   remove_columns:
12 |   concatenate_successive_entries: 0 # cannot concat when streaming
13 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/uncorpus.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_uncorpus:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/uspto.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_the_pile_uspto:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikibooks.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_wikibooks:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: False
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikinews.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_wikinews:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: False
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikipedia.yaml:
--------------------------------------------------------------------------------
 1 | # The wikipedia en dataset, drawn from its huggingface mirror
 2 | wikipedia:
 3 |   provider: huggingface
 4 |   partition: 20220301.en
 5 |   split: train
 6 | 
 7 |   streaming: False
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns: title
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikiquote.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_wikiquote:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikiversity.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_wikiversity:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/data/sources/wikivoyage.yaml:
--------------------------------------------------------------------------------
 1 | # A part of ROOTS
 2 | bigscience-data/roots_en_wikivoyage:
 3 |   provider: huggingface
 4 |   partition:
 5 |   split: train
 6 | 
 7 |   streaming: True
 8 | 
 9 |   # source-specific cleaning rules?
10 |   remove_columns:
11 |   concatenate_successive_entries: 0
12 | 


--------------------------------------------------------------------------------
/cramming/config/eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/eval/__init__.py


--------------------------------------------------------------------------------
/cramming/config/eval/pythia.yaml:
--------------------------------------------------------------------------------
 1 | # defaults:
 2 | #   - optim: adam
 3 | #   - tasks:
 4 |       # - winogrande
 5 |       # - lambada_openai
 6 |       # - piqa
 7 |       # - winograd_wsc
 8 |       # - arc
 9 |       # - sciq
10 |       # - logiqa
11 | 
12 | name: pythia-tests
13 | 
14 | arch_modifications: null
15 | # checkpoint name:
16 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder
17 | checkpoint: latest
18 | path: ${impl.path} # Path for caches of datasets and tokenizers
19 | 


--------------------------------------------------------------------------------
/cramming/config/eval/tasks/lambada_openai.yaml:
--------------------------------------------------------------------------------
1 | # dataset-specific settings
2 | lambada_openai:
3 | 


--------------------------------------------------------------------------------
/cramming/config/eval/tasks/winogrande.yaml:
--------------------------------------------------------------------------------
1 | # dataset-specific settings
2 | winogrande:
3 | 


--------------------------------------------------------------------------------
/cramming/config/hydra/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/hydra/__init__.py


--------------------------------------------------------------------------------
/cramming/config/hydra/job_logging/custom.yaml:
--------------------------------------------------------------------------------
 1 | # python logging configuration for tasks
 2 | version: 1
 3 | formatters:
 4 |   simple:
 5 |     format: "[%(asctime)s] %(message)s"
 6 | handlers:
 7 |   console:
 8 |     class: logging.StreamHandler
 9 |     formatter: simple
10 |     stream: ext://sys.stdout
11 |   file:
12 |     class: logging.FileHandler
13 |     formatter: simple
14 |     # relative to the job log directory
15 |     filename: ${name}_${hydra.job.name}.log
16 | root:
17 |   level: INFO
18 |   handlers: [console, file]
19 | 
20 | disable_existing_loggers: false
21 | 


--------------------------------------------------------------------------------
/cramming/config/impl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/impl/__init__.py


--------------------------------------------------------------------------------
/cramming/config/impl/_default.yaml:
--------------------------------------------------------------------------------
  1 | # Settings for implementation details
  2 | # These settings "should" not influence the outcome of the computation in major ways, only its speed.
  3 | # These settings are generic implementation details
  4 | # -----------------------------------------------------------------------------------------------------
  5 | 
  6 | # This is the main folder where data will be stored (such as caches of datasets and tokenizers):
  7 | # This can be an absolute path (which will be honored) or a relative path
  8 | # The relative path will be executed relative to the cfg.base_dir
  9 | # This behavior is controlled in the main_launcher
 10 | path: data
 11 | 
 12 | # data implementation:
 13 | local_staging_dir: # Optionally copy a preprocessed dataset into this folder before loading it for training
 14 | forbid_dataset_preprocessing: True
 15 | temporary_corpus: False # Save data directly into local staging dir, forget after use
 16 | max_raw_chunk_size: 1e14
 17 | 
 18 | # checkpointing and logging:
 19 | print_loss_every_nth_step: 1000
 20 | save_intermediate_checkpoints: False
 21 | save_every_nth_step: -1
 22 | save_every_n_minutes: -1
 23 | save_intermediate_model_name:
 24 | 
 25 | # early termination, cancel runs that do not meet this loss threshold early.
 26 | early_termination:
 27 |   enabled: False
 28 |   budget: 3 # budget in hours
 29 |   loss_threshold: 6.0 # modify this for non-xent losses
 30 |   overall_budget: -1
 31 | 
 32 | # Batch size settings:
 33 | # batch_size: This is handled in train after commit 982a4d33cd7f79a48b691114ae78f6ad1cdbee69
 34 | microbatch_size: 128 # dont make it larger than batch_size...
 35 | 
 36 | # Basic compute settings
 37 | threads: 32 # maximal number of cpu dataloader workers used per GPU, this value will never exceed num_gpus * num_physical threads
 38 | # Dataloader multiprocessing
 39 | pad_to_multiple_of: 8 # padding in dataloader during downstream
 40 | shuffle_in_dataloader: False # There is still shuffling in the preprocessing pipeline.
 41 | pin_memory: True
 42 | prefetch_factor: 2
 43 | persistent_workers: True # this clashes with pin_memory in pytorch<1.7.1
 44 | 
 45 | # Default floating point precision:
 46 | default_precision: float # needs to be a pytorch datatype
 47 | 
 48 | # Distributed training
 49 | dist_backend: nccl
 50 | sharing_strategy: # file_descriptor # if no argument is given, then the OS default is picked by pytorch
 51 | 
 52 | # Misc:
 53 | enable_huggingface_offline_mode: False
 54 | local_rank: # This is set automatically by the system_startup
 55 | 
 56 | save_final_model: False
 57 | push_to_huggingface_hub: False
 58 | hf_directoy_name: "test-crammedBERT-c5" # set a clever name here!
 59 | 
 60 | add_env_variables:
 61 | # should be NAME: stringval
 62 | 
 63 | # TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE
 64 | # TORCHINDUCTOR_MAX_AUTOTUNE_GEMM
 65 | 
 66 | # Other constants:
 67 | # OMP_NUM_THREADS:[number_of_physical_cores]
 68 | # OMP_SCHEDULE:  # STATIC
 69 | # OMP_PROC_BIND: # CLOSE
 70 | # GOMP_CPU_AFFINITY:  # "N-M"
 71 | # KMP_AFFINITY: # "granularity=fine,compact,1,0"
 72 | # KMP_BLOCKTIME: # 1
 73 | # optional_ld_preloads:
 74 | #  - libiomp5.so
 75 | # - jemalloc.so
 76 | 
 77 | #
 78 | # ### jemalloc
 79 | # export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
 80 | # export LD_PRELOAD=/home/mingfeim/packages/jemalloc-5.2.1/lib/libjemalloc.so
 81 | #
 82 | # ### tcmalloc
 83 | # export LD_PRELOAD=/home/mingfeim/packages/gperftools-2.8/install/lib/libtcmalloc.so
 84 | 
 85 | example_token_limit: 30 # never generate more example tokens than this
 86 | # example_prompts:
 87 | #   - "Oh, distinctly I remember, it was in the bleak"
 88 | #   - "The capital of Germany is"
 89 | #   - "The Westphalian peace ended the"
 90 | #   - "Hi! My name is"
 91 | #   - "In the place where we were born,"
 92 | #   - "Time is a"
 93 | 
 94 | # example_prompts:
 95 | #   - "System.out.println("
 96 | #   - "public class "
 97 | #   - "public static void main"
 98 | #   - "/* print hello world */"
 99 | #   - "System.out.println(2);"
100 | #   - "for (let i = 0; i < myarray.length; i++) {"
101 | example_prompts:
102 |     - "3 + 3 = "
103 |     - "44 + 56 = "
104 |     - "003 + 003 = "
105 |     - "070 + 094 = "
106 |     - "345 + 324 = "
107 |     - "598 + 527 = "
108 |     - "1234 + 4321 = "
109 |     - "94633 + 91826 = "


--------------------------------------------------------------------------------
/cramming/config/impl/torch-default.yaml:
--------------------------------------------------------------------------------
 1 | # Settings for implementation details
 2 | # These settings "should" not influence the outcome of the computation in major ways, only its speed.
 3 | # These settings are pytorch implementation details, tuned for single(ish) GPU, sane pytorch stuff
 4 | # -----------------------------------------------------------------------------------------------------
 5 | 
 6 | name: torch-default
 7 | defaults:
 8 |   - _default
 9 |   - _self_
10 | 
11 | 
12 | # Basic pytorch settings
13 | benchmark: True # CUDNN benchmarking
14 | deterministic: False # This option will disable non-deterministic ops
15 | non_blocking: True # unblocked .to(device) handles
16 | tf32_allowed: True
17 | matmul_precision: medium # highest/high/medium
18 | 
19 | mixed_precision: True # turns on AMP on GPUs/Intel devices. The default precision needs to be float
20 | grad_scaling: True # Only activates when mixed_precision=True
21 | mixed_precision_target_dtype: float16 # you might try your luck with bfloat16 too
22 | 
23 | # Distributed training:
24 | zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2
25 | broadcast_buffers: False
26 | bucket_cap_mb: 25
27 | gradient_as_bucket_view: True
28 | static_graph: True
29 | 
30 | # scaled dot products:
31 | enable_mem_efficient_sdp: False
32 | enable_math_sdp: True
33 | enable_flash_sdp: True
34 | 
35 | # Misc:
36 | foreach_optimizer: False
37 | 
38 | # Compilation
39 | compile_torch: True
40 | mode: default # overwritten by manual selection of inductor variables below
41 | dynamic: False # this is a world of pain (when I last tested it, around torch2.0 release)
42 | fullgraph: True # why even compile when not compile everywhere :>
43 | backend: inductor
44 | _inductor_vars:
45 |   # max_autotune_gemm: True
46 |   # max_autotune_pointwise: False # was better in some tests not to enable this?
47 |   # triton:
48 |   #   cudagraphs: False # cannot fit with overhead
49 |   #   # cudagraph_trees: False # fixes memory problems but has scary warning messages
50 |   # # epilogue_fusion: True # true by default is latest nightly
51 |   # # aggressive_fusion: False # oom on latest nightly
52 |   # permute_fusion: True # nice
53 |   # shape_padding: True # flaky on the new nightly?
54 |   # optional to mess with the internal inductor config. Maybe not advisable
55 |   # - `epilogue_fusion` which fuses pointwise ops into templates. Requires `max_autotune` to also be set
56 |   # - `max_autotune` which will profile to pick the best matmul configuration
57 |   # - `fallback_random` which is useful when debugging accuracy issues
58 |   # - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores
59 |   # - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs
60 |   # - `trace.enabled` which is the most useful debugging flag to turn on
61 |   # - `trace.graph_diagram` which will show you a picture of your graph after fusion
62 |   # - For inductor you can see the full list of configs that it supports by calling `torch._inductor.list_options()`
63 |   # or directly at https://github.com/pytorch/pytorch/blob/master/torch/_inductor/config.py
64 | 


--------------------------------------------------------------------------------
/cramming/config/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mcleish7/arithmetic/86022a57d38c0fde46444d62e8dcbebcc0af614c/cramming/config/train/__init__.py


--------------------------------------------------------------------------------
/cramming/config/train/common.yaml:
--------------------------------------------------------------------------------
 1 | # Basic hyperparameter for normal BERT pretraining
 2 | # working hard here to separate "impl" implementation details and "train" abstract hyperparameters
 3 | 
 4 | name: common
 5 | 
 6 | defaults:
 7 |   - optim: adam_classic
 8 |   - optim_mod: disabled
 9 | 
10 | optim:
11 |   lr: 1e-4
12 | 
13 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers
14 | 
15 | # steps:
16 | warmup_steps: 80_000 # These are microbatch steps
17 | cooldown_steps: 0
18 | steps: 8_000_000 # These are microbatch steps at bs=64. The original 1M steps for BERT are recovered with 512/64=8
19 | scheduler: polynomial-decay
20 | 
21 | # Training setting:
22 | stream_depth: ${data.seq_length} # full sequence as input to model
23 | batch_size: 512
24 | batch_size_ramp: 0
25 | 
26 | gradient_clipping:
27 | pretrain_in_train_mode: True # default BERT trains with dropout layers
28 | reverse_dataset_order: False
29 | 
30 | budget: ${budget}
31 | overall_budget: ${overall_budget}
32 | 


--------------------------------------------------------------------------------
/cramming/config/train/cramming.yaml:
--------------------------------------------------------------------------------
 1 | # Version 4 of changes to bert training hyperparameters
 2 | # Optimizes MLM rate for torch.compile, includes improved weight decay limitation, finally updated to a relative bs ramp
 3 | 
 4 | name: cramming-o4
 5 | 
 6 | defaults:
 7 |   - optim: adam
 8 |   - optim_mod: disabled
 9 | 
10 | optim:
11 |   lr: 1e-3
12 |   weight_decay: 0.01
13 | 
14 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers
15 | 
16 | # steps:
17 | warmup_steps: 0.1
18 | cooldown_steps: 0.1
19 | steps: 12_000_000 # these are microbatch steps. This is an upper limit that is usually never reached
20 | scheduler: budget-constant
21 | 
22 | # Training setting:
23 | stream_depth: ${data.seq_length} # full sequence as input to model
24 | batch_size: 8192
25 | batch_size_ramp: 0.60
26 | 
27 | gradient_clipping: 0.5
28 | pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain
29 | reverse_dataset_order: False
30 | 
31 | budget: ${budget}
32 | overall_budget: ${overall_budget}
33 | 
34 | # for loading previously saved
35 | arch_modifications: null
36 | # checkpoint name:
37 | # This can be either "latest", or a reference to a specific checkpoint in a subfolder
38 | checkpoint: latest
39 | path: ${impl.path} # Path for caches of datasets and tokenizers
40 | 


--------------------------------------------------------------------------------
/cramming/config/train/janus-regime.yaml:
--------------------------------------------------------------------------------
 1 | # Version 4 of changes to bert training hyperparameters
 2 | # Optimizes MLM rate for torch.compile, includes improved weight decay limitation, finally updated to a relative bs ramp
 3 | 
 4 | name: cramming-o4
 5 | 
 6 | defaults:
 7 |   - optim: adam
 8 |   - optim_mod: disabled
 9 | 
10 | optim:
11 |   lr: 1e-3
12 |   weight_decay: 0.01
13 | 
14 | limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers
15 | 
16 | # steps:
17 | warmup_steps: 0.1
18 | cooldown_steps: 0.1
19 | steps: 4_000_000 # these are microbatch steps. This is an upper limit that is usually never reached
20 | scheduler: budget-constant
21 | 
22 | # Training setting:
23 | stream_depth: 2 # Train one token at a time
24 | batch_size: 16384
25 | batch_size_ramp: 0.60
26 | 
27 | gradient_clipping: 0.5
28 | pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain
29 | reverse_dataset_order: False
30 | 
31 | budget: ${budget}
32 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adafactor.yaml:
--------------------------------------------------------------------------------
 1 | type: Adafactor
 2 | 
 3 | lr: 0.001
 4 | eps:
 5 |   - 1e-30
 6 |   - 0.001
 7 | clip_threshold: 1.0
 8 | decay_rate: -0.8
 9 | beta1:
10 | weight_decay: 0.0
11 | scale_parameter: False
12 | relative_step: False
13 | warmup_init: False
14 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adahessian.yaml:
--------------------------------------------------------------------------------
 1 | type: AdaHessian
 2 | 
 3 | lr: 0.15
 4 | betas:
 5 |   - 0.9
 6 |   - 0.98
 7 | eps: 1e-12
 8 | weight_decay: 0.01
 9 | hessian_power: 1.0
10 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adam.yaml:
--------------------------------------------------------------------------------
 1 | type: AdamW
 2 | 
 3 | lr: 0.0005
 4 | betas:
 5 |   - 0.9
 6 |   - 0.98
 7 | eps: 1e-12
 8 | weight_decay: 0.01
 9 | amsgrad: False
10 | fused:
11 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adam8bit.yaml:
--------------------------------------------------------------------------------
 1 | type: Adam8bit
 2 | 
 3 | lr: 0.0005
 4 | betas:
 5 |   - 0.9
 6 |   - 0.98
 7 | eps: 1e-12
 8 | weight_decay: 0.01
 9 | amsgrad: False
10 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adam_classic.yaml:
--------------------------------------------------------------------------------
 1 | type: Adam
 2 | 
 3 | lr: 0.0005
 4 | betas:
 5 |   - 0.9
 6 |   - 0.999
 7 | eps: 1e-8
 8 | weight_decay: 0.01
 9 | amsgrad: False
10 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/adamscale.yaml:
--------------------------------------------------------------------------------
 1 | type: AdamWScale
 2 | 
 3 | lr: 0.0005
 4 | betas:
 5 |   - 0.9
 6 |   - 0.98
 7 | eps: 1e-12
 8 | weight_decay: 0.01
 9 | correct_bias: True # adamw fix
10 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/agd.yaml:
--------------------------------------------------------------------------------
1 | type: AGD
2 | 
3 | gain: 1.0
4 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/lion.yaml:
--------------------------------------------------------------------------------
1 | type: Lion
2 | 
3 | lr: 1e-4
4 | betas:
5 |   - 0.9
6 |   - 0.99
7 | # use 0.95, 0.98 if unstable
8 | weight_decay: 0.1
9 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/radam.yaml:
--------------------------------------------------------------------------------
1 | type: RAdam
2 | 
3 | lr: 0.0005
4 | betas:
5 |   - 0.9
6 |   - 0.98
7 | eps: 1e-12
8 | weight_decay: 0.01
9 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/sgd.yaml:
--------------------------------------------------------------------------------
1 | type: SGD
2 | 
3 | lr: 0.0005
4 | momentum: 0.9
5 | dampening: 0.0
6 | weight_decay: 0.01
7 | nesterov: True
8 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim/shampoo.yaml:
--------------------------------------------------------------------------------
 1 | type: Shampoo
 2 | 
 3 | lr: 0.0005
 4 | betas:
 5 |   - 0.9
 6 |   - 0.98
 7 | epsilon: 1e-12
 8 | use_bias_correction: True
 9 | adam_w_mode: True
10 | weight_decay: 0.01
11 | grafting_type: 4
12 | grafting_epsilon: 1e-08
13 | grafting_beta2: 0.999
14 | 
15 | root_inv_dist: False
16 | # update_freq (int): frequency for updating inverse preconditioner (Default: 100)
17 | # init_delay (int): initial delay before starting to compute root inverse (Default: 1000)
18 | # threshold (int): threshold for switching to diagonal preconditioner (Default: 1024)
19 | # preconditioner_dtype (torch.dtype): data type for preconditioner (Default: torch.float)
20 | # large_dim_method (LargeDimMethod): method for handling large scale tensors. (Default: LargeDimMethod.BLOCKING)
21 | # root_inv_dist (bool): distributes root inverse computation across multiple GPU workers (Default: True)
22 | # use_merge_dims (bool): merge dimensions if possible while respecting threshold. (Default: True)
23 | # grafting_type (GraftingType): Selects grafting method. (Default: GraftingType.ADAGRAD)
24 | # grafting_epsilon (float): Epsilon for grafting method. (Default: 1e-3)
25 | # grafting_beta2 (float): Exponential moving average factor for grafting method. (Default: 1.0)
26 | 
27 | # class PreconditionerType(enum.Enum):
28 | #     FULL = 0
29 | #     DIAGONAL = 1
30 | #
31 | #
32 | # class GraftingType(enum.Enum):
33 | #     NONE = 0
34 | #     SGD = 1
35 | #     ADAGRAD = 2
36 | #     RMSPROP = 3
37 | #     ADAM = 4
38 | #
39 | #
40 | # class LargeDimMethod(enum.Enum):
41 | #     DIAGONAL = 0
42 | #     ADAGRAD = 1
43 | #     BLOCKING = 2
44 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim_mod/disabled.yaml:
--------------------------------------------------------------------------------
1 | name: none
2 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim_mod/larc.yaml:
--------------------------------------------------------------------------------
1 | name: LARC
2 | 
3 | trust_coefficient: 0.02
4 | clip: True
5 | eps: 1e-8
6 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim_mod/lars.yaml:
--------------------------------------------------------------------------------
1 | name: LARS
2 | 
3 | trust_coefficient: 0.02
4 | clip: False
5 | eps: 1e-8
6 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim_mod/progressive.yaml:
--------------------------------------------------------------------------------
 1 | name: progressive-batching
 2 | 
 3 | progress_rule: norm-based
 4 | 
 5 | monotone: False
 6 | theta: 0.9
 7 | 
 8 | min_sample_guard: 2
 9 | max_sample_guard: 128
10 | 


--------------------------------------------------------------------------------
/cramming/config/train/optim_mod/sam.yaml:
--------------------------------------------------------------------------------
1 | name: SAM
2 | rho: 0.05
3 | 


--------------------------------------------------------------------------------
/cramming/config/wandb/default.yaml:
--------------------------------------------------------------------------------
1 | enabled: True
2 | entity: placeholder # change this obviously ;>
3 | project: arithmetic
4 | tags: []
5 | 


--------------------------------------------------------------------------------
/cramming/config/wandb/none.yaml:
--------------------------------------------------------------------------------
1 | enabled: False
2 | entity:
3 | project:
4 | tags: []
5 | 


--------------------------------------------------------------------------------
/cramming/data/__init__.py:
--------------------------------------------------------------------------------
1 | """This module handles and hides the data away ;)"""
2 | 
3 | from .pretraining_preparation import load_pretraining_corpus, prepare_dataloaders
4 | 


--------------------------------------------------------------------------------
/cramming/data/curriculum_sorting.py:
--------------------------------------------------------------------------------
  1 | """Baseline curricula."""
  2 | import torch
  3 | import numpy as np
  4 | 
  5 | import logging
  6 | 
  7 | log = logging.getLogger(__name__)
  8 | 
  9 | 
 10 | def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num_threads=1, ngram=1, reverse=False):
 11 |     # Force unigram counts per token:
 12 |     map_setup = dict(
 13 |         batched=True,
 14 |         batch_size=1024,
 15 |         # num_proc=None,  # have to reimplement counting as in-out instead of side effects for this to work. Lets see how slow num_proc=0 is
 16 |         load_from_cache_file=False,
 17 |         # keep_in_memory=True,
 18 |     )
 19 | 
 20 |     unigrams_counts_per_token = np.zeros(tokenizer.vocab_size, dtype=np.int64)
 21 | 
 22 |     def count_unigrams(examples):
 23 |         nonlocal unigrams_counts_per_token
 24 |         unigrams_counts_per_token += np.bincount(np.asarray(examples["input_ids"]).reshape(-1), minlength=tokenizer.vocab_size)
 25 | 
 26 |     tokenized_dataset.map(count_unigrams, desc="Counting token unigrams", **map_setup, num_proc=None)
 27 | 
 28 |     token_count = sum(unigrams_counts_per_token)
 29 |     k = 1
 30 |     k_smoothed_probs = (unigrams_counts_per_token + k) / (token_count + k * tokenizer.vocab_size)
 31 |     log2_probs = np.log2(k_smoothed_probs)
 32 | 
 33 |     def return_seq_prob(examples):
 34 |         logprob_scores = log2_probs[np.asarray(examples["input_ids"])].sum(axis=1) / tokenizer.model_max_length
 35 |         return dict(scores=logprob_scores)
 36 | 
 37 |     dataset_probs = tokenized_dataset.map(
 38 |         return_seq_prob,
 39 |         desc="Computing log probs per sequence",
 40 |         remove_columns=tokenized_dataset.column_names,
 41 |         **map_setup,
 42 |         num_proc=num_threads if num_threads > 0 else None,
 43 |     )
 44 | 
 45 |     new_order = np.argsort(np.asarray(dataset_probs["scores"]))
 46 | 
 47 |     if reverse:
 48 |         new_order = new_order[::-1]
 49 | 
 50 |     return tokenized_dataset.select(indices=new_order, writer_batch_size=1024)
 51 | 
 52 | 
 53 | def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, target_token_id, num_threads=1):
 54 |     map_setup = dict(
 55 |         batched=True,
 56 |         batch_size=1024,
 57 |         num_proc=num_threads if num_threads > 0 else None,
 58 |         load_from_cache_file=False,
 59 |         # keep_in_memory=True,
 60 |     )
 61 | 
 62 |     def count_token(examples):
 63 |         return dict(counts=(np.asarray(examples["input_ids"]) == target_token_id).sum(axis=1))
 64 | 
 65 |     dataset_counts = tokenized_dataset.map(
 66 |         count_token,
 67 |         desc=f"Counting occurrences of token {tokenizer.decode(target_token_id)}",
 68 |         remove_columns=tokenized_dataset.column_names,
 69 |         **map_setup,
 70 |     )
 71 | 
 72 |     new_order = np.argsort(np.asarray(dataset_counts["counts"]))[::-1]
 73 | 
 74 |     # Print sentence with most occurrences:
 75 |     sentence_idx = int(new_order[0])
 76 |     input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
 77 |     dataset_size = len(tokenized_dataset)
 78 | 
 79 |     log.info("Sentence with most occurrences of token ...")
 80 |     log.info(tokenizer.batch_decode(input_data[None])[0])
 81 | 
 82 |     sentence_idx = int(new_order[-1])
 83 |     input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
 84 |     dataset_size = len(tokenized_dataset)
 85 | 
 86 |     log.info("Sentence with least occurrences of token ...")
 87 |     log.info(tokenizer.batch_decode(input_data[None])[0])
 88 | 
 89 |     return tokenized_dataset.select(indices=new_order, writer_batch_size=1024)
 90 | 
 91 | 
 92 | def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer, num_threads=1):
 93 |     map_setup = dict(
 94 |         batched=True,
 95 |         batch_size=1024,
 96 |         num_proc=num_threads if num_threads > 0 else None,
 97 |         load_from_cache_file=False,
 98 |     )
 99 | 
100 |     def count_word_lengths(examples):
101 |         return dict(lengths=[len(s) for s in tokenizer.batch_decode(torch.as_tensor(examples["input_ids"]))])
102 | 
103 |     dataset_counts = tokenized_dataset.map(
104 |         count_word_lengths,
105 |         desc="Counting word lengths per sequence",
106 |         remove_columns=tokenized_dataset.column_names,
107 |         **map_setup,
108 |     )
109 | 
110 |     new_order = np.argsort(np.asarray(dataset_counts["lengths"]))  # shortest sentences first
111 | 
112 |     # Print sentence with shortest length
113 |     sentence_idx = int(new_order[0])
114 |     input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
115 |     dataset_size = len(tokenized_dataset)
116 | 
117 |     log.info("Sentence with shortest length ...")
118 |     log.info(tokenizer.batch_decode(input_data[None])[0])
119 | 
120 |     sentence_idx = int(new_order[-1])
121 |     input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
122 |     dataset_size = len(tokenized_dataset)
123 | 
124 |     log.info("and longest ...")
125 |     log.info(tokenizer.batch_decode(input_data[None])[0])
126 | 
127 |     return tokenized_dataset.select(indices=new_order, writer_batch_size=1024)
128 | 


--------------------------------------------------------------------------------
/cramming/data/deduplicate.py:
--------------------------------------------------------------------------------
  1 | """This is glue code to connect to the rust-based deduplication of https://github.com/google-research/deduplicate-text-datasets
  2 | there is probably a smart way to implement deduplication for huggingface datasets directly,
  3 | but this is just a dumb dump-everything-into-tmp-files solution.
  4 | 
  5 | Code based on branch https://github.com/google-research/deduplicate-text-datasets/tree/dev-v1
  6 | See original license below.
  7 | """
  8 | 
  9 | """Installation how-to:
 10 | cargo install --target-dir ../cramming/dedup
 11 | Make sure that path_to_rust_code is set to the correct value if installing differently
 12 | """
 13 | 
 14 | # ORIGINAL LICENSE:
 15 | 
 16 | # Copyright 2021 Google LLC
 17 | # Licensed under the Apache License, Version 2.0 (the "License");
 18 | # you may not use this file except in compliance with the License.
 19 | # You may obtain a copy of the License at
 20 | #
 21 | #     https://www.apache.org/licenses/LICENSE-2.0
 22 | #
 23 | # Unless required by applicable law or agreed to in writing, software
 24 | # distributed under the License is distributed on an "AS IS" BASIS,
 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 26 | # See the License for the specific language governing permissions and
 27 | # limitations under the License.
 28 | import datasets
 29 | 
 30 | import os
 31 | import numpy as np
 32 | from tqdm import tqdm
 33 | 
 34 | import time
 35 | import tempfile
 36 | 
 37 | import torch
 38 | 
 39 | 
 40 | def deduplicate_huggingface_dataset(dataset, threshold=100, original_cwd="."):
 41 |     """ "Seamlessly" run exact deduplication as in Lee et al."""
 42 |     path_to_rust_code = os.path.join(original_cwd, "dedup", "release")
 43 |     with tempfile.TemporaryDirectory() as tmpdir:
 44 |         text_file = _write_tmp_file(dataset, dirname=tmpdir)
 45 |         _make_suffix_array(text_file, tmpdir, path_to_rust_code)
 46 | 
 47 |         # Run other rust code directly
 48 |         options = f"--length-threshold {threshold} --cache-dir {tmpdir}/cache/"
 49 | 
 50 |         print("Finding self-similar parts...")
 51 |         os.popen(
 52 |             f"{path_to_rust_code}/dedup_dataset self-similar --data-file {text_file} " f"{options} --num-threads {torch.get_num_threads()}"
 53 |         ).read()
 54 |         print("Collect self-similar from all parts...")
 55 |         os.popen(f"{path_to_rust_code}/dedup_dataset collect --data-file {text_file} " f"{options}> {tmpdir}/drop_tokens_file").read()
 56 |         dataset = _finish_and_return_to_hf_dataset(text_file, f"{tmpdir}/drop_tokens_file")
 57 |     return dataset
 58 | 
 59 | 
 60 | def _write_tmp_file(dataset, dirname):
 61 |     text_file = os.path.join(dirname, "tmp_full_dataset_as_text")
 62 | 
 63 |     with open(text_file, "wb") as fout:
 64 |         for example in tqdm(dataset, desc="Writing dataset to tmp files."):  # not batched...
 65 |             fout.write((example["text"] + "<EOT>").encode("utf-8"))
 66 |     return text_file
 67 | 
 68 | 
 69 | def _make_suffix_array(text_file, tmpdir, path_to_rust_code):
 70 |     data_size = os.path.getsize(text_file)
 71 |     HACK = 100000
 72 | 
 73 |     started = []
 74 | 
 75 |     if data_size > 10e9:
 76 |         total_jobs = 100
 77 |         jobs_at_once = 20
 78 |     elif data_size > 1e9:
 79 |         total_jobs = 96
 80 |         jobs_at_once = 96
 81 |     elif data_size > 10e6:
 82 |         total_jobs = 4
 83 |         jobs_at_once = 4
 84 |     else:
 85 |         total_jobs = 4
 86 |         jobs_at_once = 1
 87 | 
 88 |     S = data_size // total_jobs
 89 |     print("Partition into parts and create suffix arrays...")
 90 |     for jobstart in range(0, total_jobs, jobs_at_once):
 91 |         wait = []
 92 |         for i in range(jobstart, jobstart + jobs_at_once):
 93 |             s, e = i * S, min((i + 1) * S + HACK, data_size)
 94 |             cmd = f"{path_to_rust_code}/dedup_dataset make-part --data-file {text_file} --start-byte {s} --end-byte {e}"
 95 |             started.append((s, e))
 96 |             # print(cmd)
 97 |             wait.append(os.popen(cmd))
 98 | 
 99 |             if e == data_size:
100 |                 break
101 | 
102 |         print("Waiting for jobs to finish")
103 |         [x.read() for x in wait]
104 | 
105 |     print("Checking all wrote correctly")
106 | 
107 |     while True:
108 |         files = [f"{text_file}.part.{s}-{e}" for s, e in started]
109 | 
110 |         wait = []
111 |         for x, (s, e) in zip(files, started):
112 |             size_data = os.path.getsize(x)
113 |             FACT = np.ceil(np.log(size_data) / np.log(2) / 8)
114 |             # print("FACT", FACT)
115 |             size_table = os.path.getsize(x + ".table.bin")
116 |             if not os.path.exists(x) or not os.path.exists(x + ".table.bin") or size_table == 0 or size_data * FACT != size_table:
117 |                 cmd = f"{path_to_rust_code}/dedup_dataset make-part --data-file {text_file} --start-byte {s} --end-byte {e}"
118 |                 # print(cmd)
119 |                 wait.append(os.popen(cmd))
120 |         print("Rerunning", len(wait), "jobs because they failed.")
121 |         [x.read() for x in wait]
122 |         time.sleep(1)
123 |         if len(wait) == 0:
124 |             break
125 | 
126 |     print("Merging suffix trees")
127 | 
128 |     torun = " --suffix-path ".join(files)
129 |     options = f"--output-file {tmpdir}/out.table.bin --suffix-path {torun} --num-threads {torch.get_num_threads()}"
130 |     print(f"{path_to_rust_code}/dedup_dataset merge {options}")
131 |     os.popen(f"{path_to_rust_code}/dedup_dataset merge {options}").read()
132 |     # exit(0)
133 |     print("Now merging individual tables")
134 |     os.popen(f"cat {tmpdir}/out.table.bin.* > {tmpdir}/out.table.bin").read()
135 |     print("Cleaning up")
136 |     os.popen(f"mv {tmpdir}/out.table.bin {text_file}.table.bin").read()
137 | 
138 | 
def _finish_and_return_to_hf_dataset(original_text_file, remove_file_cache):
    """Rebuild a dataset from the text file with the listed byte ranges removed.

    For simplicity the entire new dataset has to fit into memory...

    Args:
        original_text_file: path of the file in which documents are terminated by "<EOT>".
        remove_file_cache: path of a text file. Every non-blank line after the first line
            containing "out" is parsed as whitespace-separated integers ``start end``
            describing a byte range to drop.

    Returns:
        A ``datasets.Dataset`` with a single "text" column.
    """
    remove = []
    with open(remove_file_cache) as fin:
        # Skip everything up to and including the first line that contains "out".
        for line in fin:
            if "out" in line:
                break
        for line in fin:
            if line.strip():  # tolerate blank lines instead of failing on the unpack below
                remove.append(list(map(int, line.split())))

    print(f"Number of removal tuples is {len(remove)}")

    with open(original_text_file, "rb") as original_dataset:
        deduped_dataset = dict(text=[])
        start = 0
        buffer = ""
        for a, b in tqdm(remove, desc="Writing deduplicated data back to hf dataset"):
            # Byte ranges can cut through a multi-byte character, so undecodable bytes are dropped.
            buffer += original_dataset.read(a - start).decode("utf-8", errors="ignore")
            original_dataset.seek(b)
            start = b

            buf_split = buffer.split("<EOT>")
            if len(buf_split) > 1:
                deduped_dataset["text"] += buf_split[:-1]
                buffer = buf_split[-1]
        # The tail starts right after a removed range, so it needs the same lenient decoding
        # as the chunks above; a strict decode could raise UnicodeDecodeError here.
        tail = original_dataset.read().decode("utf-8", errors="ignore")
        deduped_dataset["text"] += (buffer + tail).split("<EOT>")[:-1]

    dataset = datasets.Dataset.from_dict(deduped_dataset)
    return dataset
170 | 


--------------------------------------------------------------------------------
/cramming/data/utils.py:
--------------------------------------------------------------------------------
 1 | """Various utilities."""
 2 | import os
 3 | from omegaconf import OmegaConf
 4 | import hashlib
 5 | import json
 6 | import shutil
 7 | import subprocess
 8 | 
 9 | import logging
10 | import time
11 | 
12 | import datasets
13 | 
14 | log = logging.getLogger(__name__)
15 | 
16 | 
def checksum_config(cfg):
    """Return a checksum of the config: the md5 hex digest of its resolved, key-sorted JSON dump.

    When the config contains both "tokenizer" and "vocab_size", the digest is prefixed with
    "<tokenizer>x<vocab_size>_".
    """
    resolved = OmegaConf.to_container(cfg, resolve=True)
    serialized = json.dumps(resolved, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
    if "tokenizer" not in cfg or "vocab_size" not in cfg:
        return digest
    return f"{cfg.tokenizer}x{cfg.vocab_size}_{digest}"
24 | 
25 | 
def stage_dataset(data_directory_path, local_staging_dir):
    """Copy a dataset directory into a local staging directory and return the path to load from.

    The staged copy is used once ``datasets.load_from_disk`` succeeds on it (up to 15 attempts,
    60 seconds apart). If it never loads, or if anything else goes wrong, the original
    ``data_directory_path`` is returned instead.

    Raises:
        FileNotFoundError: if ``data_directory_path`` is not an existing directory.
    """
    new_path = os.path.join(local_staging_dir, os.path.basename(data_directory_path))
    if not os.path.isdir(data_directory_path):
        raise FileNotFoundError(f"Dataset not yet generated or not found at {data_directory_path}.")

    try:
        if os.path.isdir(new_path):
            log.info(f"Using staged dataset found at {new_path}...")
        else:
            try:
                shutil.copytree(data_directory_path, new_path)
                log.info(f"Staging dataset to {new_path}...")
            except FileExistsError:
                # Another process created the target in the meantime; give it time to finish.
                log.info(f"Concurrent writing to {new_path} detected. Stopping staging in this run and waiting for 300 seconds.")
                time.sleep(300)

        attempts_left = 15
        while attempts_left > 0:
            attempts_left -= 1
            _, _, free = shutil.disk_usage(new_path)
            used = _get_size(new_path)
            try:
                # Loading is only a completeness probe; the loaded object is discarded.
                datasets.load_from_disk(new_path)
            except FileNotFoundError:
                log.info(
                    f"Staged dataset is incomplete. Size is {used / 1024**3:,.3f}GB. "
                    f" Waiting for 60 more secs for staging race condition."
                )
                time.sleep(60)
            else:
                log.info(f"Staged dataset size is {used / 1024**3:,.3f}GB. {free/ 1024**3:,.3f}GB free in staging dir.")
                return new_path
        log.info(f"Staging dataset corrupted. Falling back to network drive location {data_directory_path}")
        return data_directory_path

    except Exception as e:  # noqa
        log.info(f"Staging failed with error {e}. Falling back to network drive location {data_directory_path}")
        return data_directory_path
63 | 
64 | 
def _get_size(start_path="."):
    """Return the summed size in bytes of all files below ``start_path``.

    Symbolic links to files are not counted.
    Adapted from https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python
    """
    total = 0
    for root, _subdirs, names in os.walk(start_path):
        candidates = (os.path.join(root, name) for name in names)
        total += sum(os.path.getsize(path) for path in candidates if not os.path.islink(path))
    return total
76 | 
77 | 
def detailed_OSError(e):
    """Re-raise ``e``, adding disk-usage details when it is a "no space left on device" error.

    Args:
        e: the OSError to inspect.

    Raises:
        OSError: always. An error whose errno is not 28 is re-raised unchanged. For errno 28 a
            new OSError with a more descriptive message is raised, chained to ``e``. The message
            includes the output of ``df -h`` for the file when that can be obtained and parsed.
    """
    if e.errno != 28:  # 28: "no space left on device"
        raise e

    if not e.filename:
        # The file name is unknown
        error_message = f"Error: {e.strerror}"
        raise OSError(f"{error_message}\nUnknown file name. Device may be full.") from e

    error_path = os.path.abspath(e.filename)
    error_message = f"Error writing to {error_path}: {e.strerror}"
    # Fallback message, used when `df` cannot be run or its output cannot be parsed.
    full_error_message = f"{error_message}\nDevice may be full."
    try:
        df_output = subprocess.check_output(["df", "-h", e.filename]).decode("utf-8")
    except (OSError, subprocess.CalledProcessError):
        # `df` missing or failing must not hide the original error.
        df_output = ""
    df_lines = df_output.strip().split("\n")[1:]
    # maxsplit=5 keeps a mount point that contains spaces in a single field.
    fields = df_lines[0].split(None, 5) if df_lines else []
    if len(fields) == 6:
        # The file system containing the file is full
        device_name, _size, _used, available, _percent, mount_point = fields
        space_message = f"{available} space left on {mount_point}"
        full_error_message = f"{error_message}\nDevice {device_name} is full. {space_message}"
    raise OSError(full_error_message) from e
97 | 


--------------------------------------------------------------------------------
/create_pos_or_variants.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import argparse
 3 | import random
 4 | import os
 5 | 
 6 | def one_hot_vector(length, index=None):
 7 |     """return a one hot vector"""
 8 |     if index is None:
 9 |         index = np.random.randint(length)
10 |     one_hot = np.zeros(length)
11 |     one_hot[index] = 1
12 |     return one_hot
13 | 
def zero_vector(length):
    """Return a float vector made of ``length`` zeros."""
    return np.zeros(length)
18 | 
def main(argv=None):
    """Generate one-hot/zero-vector examples and write them to a text file.

    For every pair of operand lengths (i, j) and every index below min(i, j) one example of the
    form "<vec1><op><vec2>=<ans>" is created: one operand is a one-hot vector at that index, the
    other is all zeros, and the answer is a one-hot vector of length max(i, j) at the same index.
    Optionally random spaces are inserted and the example list is subsampled.

    Args:
        argv: command-line arguments to parse; ``None`` makes argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Create a one-hot / zero vector arithmetic dataset")
    parser.add_argument("--dir_name", type=str, required=True, help="dir to save to")
    parser.add_argument("--op", type=str, default='+', help="operation")
    parser.add_argument("--n", default=2, type=int, help="num digits in first number")
    parser.add_argument("--m", default=2, type=int, help="num digits in second number")
    parser.add_argument('--p', default=0.0, type=float, help="prob for adding padding")
    parser.add_argument("--max", default=-1, type=int, help="maximum number of examples to keep (-1 keeps all)")
    parser.add_argument('--exact', action='store_true', help='only this size')
    parser.add_argument('--eval', action='store_true', help='save as part of eval dataset')
    FLAGS = parser.parse_args(argv)

    p = FLAGS.p
    dir_name = FLAGS.dir_name
    if FLAGS.exact:
        lengths_n = [FLAGS.n]
        lengths_m = [FLAGS.m]
    else:
        lengths_n = list(range(1, FLAGS.n + 1))
        lengths_m = list(range(1, FLAGS.m + 1))

    ds = []
    # 2d loop to sample exhaustively
    for i_len in lengths_n:
        for j_len in lengths_m:
            combined_len = max(i_len, j_len)
            for index in range(min(i_len, j_len)):
                # index < min(i_len, j_len), so when the lengths differ the one-hot vector is
                # the shorter operand and the longer operand is all zeros.
                if i_len > j_len:
                    vec1 = zero_vector(i_len)
                    vec2 = one_hot_vector(j_len, index)
                elif i_len < j_len:
                    vec1 = one_hot_vector(i_len, index)
                    vec2 = zero_vector(j_len)
                else:  # i.e. same length so either can be the zeros
                    if random.random() > 0.5:
                        vec1 = one_hot_vector(i_len, index)
                        vec2 = zero_vector(j_len)
                    else:
                        vec1 = zero_vector(i_len)
                        vec2 = one_hot_vector(j_len, index)
                ans = one_hot_vector(combined_len, index)

                vec1_str = "".join(str(int(x)) for x in vec1)
                vec2_str = "".join(str(int(x)) for x in vec2)
                ans_str = "".join(str(int(x)) for x in ans)

                dataset_entry = f"{vec1_str}{FLAGS.op}{vec2_str}={ans_str}"

                if p > 0:  # add random padding, exponentially decaying
                    spaced_string = ""
                    for char in dataset_entry:
                        # Each further space before the same character is 10x less likely.
                        space_p = p
                        while random.random() < space_p:
                            space_p *= 0.1
                            spaced_string += " "
                        spaced_string += char
                    dataset_entry = spaced_string

                ds.append(dataset_entry)

    if FLAGS.max != -1:
        ds = random.sample(ds, min(len(ds), FLAGS.max))  # cut to maximum size
    if FLAGS.eval:
        data_dir = f"./cramming-data/data/arithmetic_data/pos_or_one_vec_zeros/{dir_name}"
        file_name = f"positional_arithmetic_n_{FLAGS.n}_m_{FLAGS.m}.txt"
    else:
        data_dir = f"./cramming-data/data/arithmetic_data/{dir_name}"
        file_name = f"positional_or_one_vec_zeros_n_{FLAGS.n}_m_{FLAGS.m}_examples_{len(ds)}.txt"
    os.makedirs(data_dir, exist_ok=True)
    file_path = os.path.join(data_dir, file_name)

    with open(file_path, 'w') as file:
        for entry in ds:
            file.write(entry + '\n')
    print(f"created: {file_path}")
95 | 
# Generate the dataset only when this file is executed as a script.
if __name__ == "__main__":
    main()
98 | 


--------------------------------------------------------------------------------
/dataset_analysis.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import seaborn as sns
  6 | import pandas as pd
  7 | import argparse
  8 | 
  9 | def read_dataset(dir_name, condense_white_space=False):
 10 |     # open all data files and append to big list
 11 |     dataset = []
 12 |     for filename in os.listdir(dir_name):
 13 |         if filename.endswith(".txt"):
 14 |             file_path = os.path.join(dir_name, filename)
 15 |             with open(file_path, "r") as file:
 16 |                 lines = file.readlines()
 17 |                 stripped_lines = [line.replace("\n", "") for line in lines]
 18 |                 if condense_white_space:
 19 |                     stripped_lines = [re.sub('\s+',' ', line) for line in lines]
 20 |                 dataset.extend(stripped_lines)
 21 | 
 22 |     for i in range(0,min(len(dataset),5)):
 23 |         print(dataset[i])
 24 |     return dataset
 25 | 
 26 | def remove_leading_zeros(match):
 27 |     """Removes all leading zeros"""
 28 |     return str(int(match.group(0)))
 29 | 
 30 | def count_digits(dataset, remove_formatting=False):
 31 |     """Count the digits in each operand"""
 32 |     pairs = {}
 33 |     input_1 = {}
 34 |     input_2 = {}
 35 |     ans = {}
 36 |     for input_string in dataset:
 37 |         cleaned_string = input_string.replace(' ', '')
 38 |         if remove_formatting:
 39 |             cleaned_string = re.sub(r'\b0+\d+', remove_leading_zeros, cleaned_string)
 40 | 
 41 |         numbers = re.findall(r'\d+', cleaned_string)
 42 |         digit_counts = [len(number) for number in numbers]
 43 | 
 44 |         input_1[digit_counts[0]] = input_1.get(digit_counts[0], 0) + 1
 45 |         input_2[digit_counts[1]] = input_2.get(digit_counts[1], 0) + 1
 46 |         ans[digit_counts[2]] = ans.get(digit_counts[2], 0) + 1
 47 | 
 48 |         input_tuple = (digit_counts[0], digit_counts[1])
 49 |         pairs[input_tuple] = pairs.get(input_tuple, 0) + 1
 50 | 
 51 |     return pairs, input_1, input_2, ans
 52 | 
 53 | def plot_pairs_heatmap(pairs, dir_name=".", remove_formatting=False):
 54 |     """plot a heatmap of the lengths of the operands"""
 55 |     max_length = int(max(max(pair) for pair in pairs.keys()))
 56 |     heatmap_matrix = np.zeros((max_length + 1, max_length + 1))
 57 | 
 58 |     # Populate the matrix with counts
 59 |     for pair, count in pairs.items():
 60 |         heatmap_matrix[pair[0],pair[1]] = count
 61 | 
 62 |     df = pd.DataFrame.from_dict(heatmap_matrix)
 63 | 
 64 |     # Create a heatmap using seaborn
 65 |     plt.figure(figsize=(10, 8))
 66 |     sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".4g", cbar_kws={'label': 'Count'}, annot_kws={'size': 8,'rotation':45})
 67 |     plt.xlabel('Length of First Number')
 68 |     plt.ylabel('Length of Second Number')
 69 |     plt.title('Input Pairs Length Heatmap')
 70 |     plt.savefig(f"{dir_name}/pairs_heatmap{'_removed_prepended_zeros' if remove_formatting else ''}.png", bbox_inches='tight')
 71 |     plt.clf()
 72 | 
 73 | def line_plotter(data, name, dir_name=".", remove_formatting=False):
 74 |     """plot a line graph for the length of the operand """
 75 |     data = dict(sorted(data.items()))
 76 |     x_values = list(data.keys())
 77 |     y_values = list(data.values())
 78 | 
 79 |     # Plotting the line plot
 80 |     plt.plot(x_values, y_values, marker='o')
 81 | 
 82 |     # Adding labels and title
 83 |     plt.xlabel('Length of number')
 84 |     plt.ylabel('Count')
 85 |     plt.title(f"Line Plot for {name}")
 86 |     plt.savefig(f"{dir_name}/{name}_line_plot{'_removed_prepended_zeros' if remove_formatting else ''}.png", bbox_inches='tight')
 87 |     plt.clf()
 88 | 
 89 | def consecutive_digit_counts(input_strings):
 90 |     """Count the number of times a digit is repeated"""
 91 |     counts_by_digit = {}
 92 | 
 93 |     for input_str in input_strings:
 94 |         current_digit = None
 95 |         consecutive_count = 0
 96 | 
 97 |         for char in input_str:
 98 |             if char.isdigit():
 99 |                 if char == current_digit:
100 |                     consecutive_count += 1
101 |                 else:
102 |                     if current_digit is not None:
103 |                         # Update the dictionary with consecutive count
104 |                         if consecutive_count != 1:
105 |                             counts_by_digit.setdefault(current_digit, {}).setdefault(consecutive_count, 0)
106 |                             counts_by_digit[current_digit][consecutive_count] += 1
107 | 
108 |                     current_digit = char
109 |                     consecutive_count = 1
110 | 
111 |         # Update the dictionary for the last digit in the string
112 |         if current_digit is not None:
113 |             if consecutive_count != 1:
114 |                 counts_by_digit.setdefault(current_digit, {}).setdefault(consecutive_count, 0)
115 |                 counts_by_digit[current_digit][consecutive_count] += 1
116 | 
117 |     return counts_by_digit
118 | 
def create_repetition_heatmap(data, dir_name=".", remove_formatting=False):
    """Plot nested repetition counts (as returned by consecutive_digit_counts) as a heatmap PNG.

    Args:
        data: dict ``{digit: {run_length: count}}``.
        dir_name: directory the image is written to.
        remove_formatting: only affects the output file name (adds "_removed_prepended_zeros").
    """
    # One row per digit in sorted order; combinations that never occurred are filled with 0.
    ordered = {digit: data[digit] for digit in sorted(data)}
    df = pd.DataFrame.from_dict(ordered, orient='index').fillna(0)

    plt.figure(figsize=(10, 8))
    sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".4g", cbar_kws={'label': 'Count'}, annot_kws={'size': 8,'rotation':45})
    plt.title('Consecutive Digit Counts Heatmap')
    plt.xlabel('Consecutive Count')
    plt.ylabel('Digit')

    suffix = '_removed_prepended_zeros' if remove_formatting else ''
    plt.savefig(f"{dir_name}/repetition_count_heatmap{suffix}.png", bbox_inches='tight')
    plt.clf()
133 | 
def main(dir_name):
    """Analyse the arithmetic dataset in ``dir_name`` and save the plots next to the data.

    The statistics are printed and plotted twice: once with leading zeros removed from the
    numbers and once with the lines as they are.

    Args:
        dir_name: dataset directory, relative to "./cramming-data/data/arithmetic_data".
    """
    base_directory = "./cramming-data/data/arithmetic_data"
    dir_name = os.path.join(base_directory, dir_name)
    dataset = read_dataset(dir_name)

    # The repetition counts are computed from the raw lines and do not depend on
    # remove_formatting, so they are computed once instead of once per option.
    result_list = consecutive_digit_counts(dataset)

    for remove_formatting in (True, False):
        pairs, input_1, input_2, ans = count_digits(dataset, remove_formatting=remove_formatting)
        print(f"{'removed prepended zeros' if remove_formatting else 'keeping prepended zeros'}")
        print("pairs: ",pairs)
        print("input 1: ",input_1)
        print("input 2: ",input_2)
        print("answers: ",ans)

        plot_pairs_heatmap(pairs, dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(input_1, "input_1", dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(input_2, "input_2", dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(ans, "answer", dir_name=dir_name, remove_formatting=remove_formatting)

        print("repetitions: ",result_list)
        create_repetition_heatmap(result_list, dir_name=dir_name, remove_formatting=remove_formatting)
156 | 
# Script entry point: parse the required --dir_name argument and run the analysis on it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Data analysis")
    parser.add_argument("--dir_name", type=str, required=True)
    FLAGS = parser.parse_args()

    main(FLAGS.dir_name)


--------------------------------------------------------------------------------
/gen_eval_script.py:
--------------------------------------------------------------------------------
 1 | # input your model name and base_dir
 2 | name = "sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_reycle_with_fire_8x1_1_24_run_1"
 3 | base_dir = "cramming-data"
 4 | 
 5 | # pick which eval you are doing
 6 | add_100 = False
 7 | add_110+ = False
 8 | add_small = False
 9 | mul = False
10 | sort = True
11 | bitwise_or = False
12 | 
13 | # set the model parameters for eval
14 | print("remember to edit max_rec and tokenizer!!")
15 | max_rec = 1
16 | tokenizer = ' data.sources.arithmetic.tokenizer_type="pad"'
17 | if sort:
18 |     tokenizer = ' data.sources.arithmetic.tokenizer_type="sort"'
19 | 
20 | ## print statements for all tasks below
21 | if add_100:
22 |     for checkerboard_str in [" checkerboard=odd"," checkerboard=even"]:
23 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=55 big_eval_step_1=True reverse_inputs=True{tokenizer}{checkerboard_str}")
24 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=60 big_eval_step_2=True reverse_inputs=True{tokenizer}{checkerboard_str}")
25 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=70 big_eval_step_3=True reverse_inputs=True{tokenizer}{checkerboard_str}")
26 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=85 big_eval_step_4=True reverse_inputs=True{tokenizer}{checkerboard_str}")
27 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=90 big_eval_step_5=True reverse_inputs=True{tokenizer}{checkerboard_str}")
28 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=100 big_eval_step_6=True reverse_inputs=True{tokenizer}{checkerboard_str}")
29 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=100 big_eval_step_7=True reverse_inputs=True{tokenizer}{checkerboard_str}")
30 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=110 big_eval_step_8=True reverse_inputs=True{tokenizer}{checkerboard_str}")
31 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=110 big_eval_step_9=True reverse_inputs=True{tokenizer}{checkerboard_str}")
32 |         print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=110 big_eval_step_10=True reverse_inputs=True{tokenizer}{checkerboard_str}")
33 | 
34 | if add_100:
35 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=105 big_eval_step_1=True reverse_inputs=True checkerboard=even extended_eval=True{tokenizer}")
36 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=105 big_eval_step_2=True reverse_inputs=True checkerboard=even extended_eval=True{tokenizer}")
37 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=105 big_eval_step_3=True reverse_inputs=True checkerboard=even extended_eval=True{tokenizer}")
38 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=105 big_eval_step_4=True reverse_inputs=True checkerboard=even extended_eval=True{tokenizer}")
39 | 
40 | if add_small:
41 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=30 reverse_inputs=True{tokenizer}")
42 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=35 ood_only=True reverse_inputs=True{tokenizer}")
43 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=45 up_to_40=True reverse_inputs=True{tokenizer}")
44 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=55 up_to_50=True reverse_inputs=True{tokenizer}")
45 | 
46 | if mul:
47 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=30 pos_arth=True{tokenizer}")
48 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=50 pos_arth_ood=True{tokenizer}")
49 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=30 mul=True{tokenizer}")
50 | 
51 | if sort:
52 |     for i in range(0,30):
53 |         print(f"python sort_eval.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} sort_reverse=True data.sources.arithmetic.tokenizer_type='sort' max_size_given={i+2} start_ind_1_given={i+1} start_ind_2_given={i+1}")
54 | 
55 | if bitwise_or: # we give data to evaluate up to 100x100 as we show in the paper, but the evaluation loop in only arithmetic_eval_quicker.py evaluates up to 40x40. This can be easily edited if required
56 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=30 pos_arth=True{tokenizer}")
57 |     print(f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} token_limit=50 pos_arth_ood=True{tokenizer}")
58 |                     
59 | 
60 | 


--------------------------------------------------------------------------------
/load_local_model.py:
--------------------------------------------------------------------------------
 1 | """Example for a script to load a local saved model.
 2 | 
 3 | Use as e.g.
 4 | 
 5 | python load_local_model.py name=A6000amp_b4096_c5_o3_final base_dir=
 6 | > wandb=none impl.push_to_huggingface_hub=True arch=bert-c5 train=bert-o3 train.batch_size=4096
 7 | > data=c4-subset-processed dryrun=True +eval=GLUE_sane
 8 | 
 9 | """
10 | import os
11 | 
12 | import hydra
13 | import time
14 | 
15 | import logging
16 | 
17 | 
18 | import cramming
19 | 
20 | log = logging.getLogger(__name__)
21 | 
22 | 
def main_load_process(cfg, setup):
    """Load a locally saved checkpoint into a model and optionally push it to the hub.

    The checkpoint is looked up in ``<cfg.base_dir>/<cfg.name>/checkpoints``, the model is
    constructed from the architecture config found there and the weights are loaded through
    the backend engine. If ``cfg.impl.push_to_huggingface_hub`` is set, the main process then
    calls ``push_to_hub`` on the engine.

    Args:
        cfg: the run configuration (reads base_dir, name, eval, train, impl and dryrun).
        setup: passed through to ``cramming.load_backend``.
    """
    local_checkpoint_folder = os.path.join(cfg.base_dir, cfg.name, "checkpoints")
    tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(
        cfg.eval.checkpoint, local_checkpoint_folder, cfg.eval.arch_modifications
    )

    model = cramming.construct_model(cfg_arch, tokenizer.vocab_size, downstream_classes=None)
    # Only the first element of the returned 4-tuple (the engine) is used here.
    model_engine, _, _, _ = cramming.load_backend(model, tokenizer, cfg.train, cfg.impl, setup=setup)
    model_engine.load_checkpoint(cfg_arch, model_file)

    # Push from the main process only, and only when requested in the config.
    if cramming.utils.is_main_process() and cfg.impl.push_to_huggingface_hub:
        model_engine.push_to_hub(tokenizer, cfg, dryrun=cfg.dryrun)
39 | 
40 | 
@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.3")
def launch(cfg):
    """Hydra entry point: pass the config to the launcher with main_load_process as the job."""
    cramming.utils.main_launcher(cfg, main_load_process, job_name="load and push model")
44 | 
45 | 
# Start the hydra-decorated entry point only when this file is executed as a script.
if __name__ == "__main__":
    launch()
48 | 


--------------------------------------------------------------------------------
/pretty_plotter.py:
--------------------------------------------------------------------------------
  1 | ## combine multiple testing plots and make a pretty one 
  2 | 
  3 | import os
  4 | import numpy as np
  5 | import json
  6 | import matplotlib.patches as patches
  7 | import matplotlib.pyplot as plt
  8 | import pandas as pd
  9 | import seaborn as sns
 10 | from omegaconf import OmegaConf
 11 | 
 12 | def find_file(starting_directory, target_file):
 13 |     """Find target_file in the tree from starting_directory"""
 14 |     for root, dirs, files in os.walk(starting_directory):
 15 |         if target_file in files:
 16 |             return os.path.join(root, target_file)
 17 | 
 18 | def grid_plotter(data, type="accs", path="", title=None, rect_size=20, up_to_50=False):
 19 |     """plot the 2d grid (up to 50x50)"""
 20 |     if title is None:
 21 |         title = "All numbers are percetanges rounded to 1dp"
 22 |     data = np.array(data)*100
 23 |     df = pd.DataFrame(data)
 24 | 
 25 |     plt.figure(figsize=(10, 8))
 26 |     sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".0f", annot_kws={'size': 8,'rotation':0})
 27 |     if up_to_50:
 28 |         rect = patches.Rectangle((0, 0), rect_size, rect_size, linewidth=1.5, edgecolor='red', facecolor='none')
 29 |     else:
 30 |         rect = patches.Rectangle((0, 0), rect_size, rect_size, linewidth=1, edgecolor='red', facecolor='none')
 31 |     plt.gca().add_patch(rect)
 32 |     rect_size = data.shape[0]
 33 |     plt.xticks(np.arange(1, rect_size+1) - 0.5, labels=np.arange(1, rect_size+1), rotation=90, fontsize=10)
 34 |     plt.yticks(np.arange(1, rect_size+1) - 0.5, labels=np.arange(1, rect_size+1), rotation=0, fontsize=10)
 35 |     
 36 |     # Customize the plot
 37 |     plt.title(title)
 38 |     plt.ylabel("1st Number Length")
 39 |     plt.xlabel("2nd Number Length")
 40 |     
 41 |     plt.savefig(f"{path}combined_{type}_grid_plot{'_50' if up_to_50 else ''}", bbox_inches='tight', dpi=300)
 42 |     plt.clf()
 43 | 
 44 | def main():
 45 |     # replace with model name
 46 |     model_name = "cramming-data/add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1"
 47 | 
 48 |     file_path = f"{model_name}/downstream"
 49 |     # get latest checkpoint for the model data
 50 |     config_path = f"{model_name}/checkpoints"
 51 |     all_checkpoints = [f for f in os.listdir(config_path)]
 52 |     checkpoint_paths = [os.path.join(config_path, c) for c in all_checkpoints]
 53 |     checkpoint_name = max(checkpoint_paths, key=os.path.getmtime)
 54 |     with open(os.path.join(checkpoint_name, "model_config.json"), "r") as file:
 55 |         cfg_arch = OmegaConf.create(json.load(file))
 56 |     max_rec = cfg_arch['maximal_recurrence']
 57 |     layers_in_block = cfg_arch['layers_in_recurrent_block']
 58 |     mask_bf_eq = cfg_arch['mask_before_equals']
 59 |     attn_type = cfg_arch['attention']['type']
 60 |     loss_reduc = cfg_arch['loss_reduction']
 61 |     throttle = cfg_arch['throttle']
 62 |     title = f"Model name:\n{model_name[14:]}\nNum layers in block: {layers_in_block}, Num blocks in training: {max_rec}\n Mask all before equals: {mask_bf_eq}, Train time: 24 hr\n attn: {attn_type}, temp: Greedy{', loss: 'if loss_reduc == 'none' else ''}{', throttle' if throttle else ''}"
 63 | 
 64 |     # works up in tiers starting from the smallest grid (large) up to the largest for this size (up_to_50)
 65 |     large_path = find_file(file_path, f"accs_grid_quick_large.json")
 66 |     with open(large_path, 'r') as file:
 67 |         data = json.load(file)
 68 |     large_data = np.array(data)
 69 | 
 70 |     ood_path = find_file(file_path, f"accs_grid_quick_ood_only.json")
 71 |     with open(ood_path, 'r') as file:
 72 |         data = json.load(file)
 73 |     ood_data = np.array(data)
 74 | 
 75 |     num_rows_to_add = ood_data.shape[0] - large_data.shape[0]
 76 |     num_cols_to_add = ood_data.shape[1] - large_data.shape[1]
 77 | 
 78 |     padded_array = np.pad(large_data, ((0, num_rows_to_add), (0, num_cols_to_add)), mode='constant', constant_values=0)
 79 |     combined = padded_array+ood_data
 80 | 
 81 |     rect_size=20
 82 |     path_40 = find_file(file_path, f"accs_grid_quick_up_to_40.json")
 83 |     if path_40 is not None:
 84 |         with open(path_40, 'r') as file:
 85 |             data = json.load(file)
 86 |         data_40 = np.array(data)
 87 |         num_rows_to_add = data_40.shape[0] - combined.shape[0]
 88 |         num_cols_to_add = data_40.shape[1] - combined.shape[1]
 89 |         padded_array = np.pad(combined, ((0, num_rows_to_add), (0, num_cols_to_add)), mode='constant', constant_values=0)
 90 |         combined = padded_array+data_40
 91 | 
 92 |     path_50 = find_file(file_path, f"accs_grid_quick_up_to_50.json")
 93 |     up_to_50 = False
 94 |     if path_50 is not None:
 95 |         with open(path_50, 'r') as file:
 96 |             data = json.load(file)
 97 |         data_50 = np.array(data)
 98 |         num_rows_to_add = data_50.shape[0] - combined.shape[0]
 99 |         num_cols_to_add = data_50.shape[1] - combined.shape[1]
100 |         padded_array = np.pad(combined, ((0, num_rows_to_add), (0, num_cols_to_add)), mode='constant', constant_values=0)
101 |         combined = padded_array+data_50
102 |         up_to_50 = True
103 |         
104 |     grid_plotter(combined, type="accs", path=f"{file_path}/", title=title, rect_size=rect_size, up_to_50=up_to_50)
105 | 
106 | if __name__ == "__main__":
107 |     main()


--------------------------------------------------------------------------------
/pretty_plotter_big.py:
--------------------------------------------------------------------------------
  1 | ## combine multiple testing plots and make a pretty one 
  2 | 
  3 | import os
  4 | import numpy as np
  5 | import json
  6 | import matplotlib.patches as patches
  7 | import matplotlib.pyplot as plt
  8 | import pandas as pd
  9 | import seaborn as sns
 10 | from omegaconf import OmegaConf
 11 | import glob
 12 | import re
 13 | 
 14 | def grid_plotter(data, type="accs", path="", title=None, rect_size=20):
 15 |     """Plot the large 100x100 grid"""
 16 |     if title is None:
 17 |         title = "All numbers are percetanges rounded to 1dp"
 18 |     data = np.array(data)*100
 19 |     df = pd.DataFrame(data)
 20 | 
 21 |     plt.figure(figsize=(10, 8))
 22 |     annotate = False
 23 |     # use interpolant
 24 |     sns.heatmap(df, annot=annotate, cmap="YlGnBu", fmt=".0f", annot_kws={'size': 8,'rotation':0})
 25 | 
 26 |     rect = patches.Rectangle((0, 0), rect_size, rect_size, linewidth=1.8, edgecolor='red', facecolor='none')
 27 |     plt.gca().add_patch(rect)
 28 |     rect_size = data.shape[0]
 29 |     plt.xticks(np.arange(1, rect_size+1, 2) - 0.5, labels=np.arange(1, rect_size+1, 2), rotation=90, fontsize=10)
 30 |     plt.yticks(np.arange(1, rect_size+1, 2) - 0.5, labels=np.arange(1, rect_size+1, 2), rotation=0, fontsize=10)
 31 |     
 32 |     # Customize the plot
 33 |     plt.title(title)
 34 |     plt.ylabel("1st Number Length")
 35 |     plt.xlabel("2nd Number Length")
 36 |     
 37 |     plt.savefig(f"{path}combined_accs_grid_plot_big_run", bbox_inches='tight', dpi=300)
 38 |     plt.clf()
 39 | 
 40 | def main():
 41 |     # replace with your model name
 42 |     model_name = "cramming-data/add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1"
 43 |     rect_size = 20
 44 | 
 45 |     directory_path = f"{model_name}/downstream"
 46 |     # get latest checkpoint for the model data
 47 |     config_path = f"{model_name}/checkpoints"
 48 |     all_checkpoints = [f for f in os.listdir(config_path)]
 49 |     checkpoint_paths = [os.path.join(config_path, c) for c in all_checkpoints]
 50 |     checkpoint_name = max(checkpoint_paths, key=os.path.getmtime)
 51 |     with open(os.path.join(checkpoint_name, "model_config.json"), "r") as file:
 52 |         cfg_arch = OmegaConf.create(json.load(file))
 53 |     max_rec = cfg_arch['maximal_recurrence']
 54 |     layers_in_block = cfg_arch['layers_in_recurrent_block']
 55 |     mask_bf_eq = cfg_arch['mask_before_equals']
 56 |     attn_type = cfg_arch['attention']['type']
 57 |     loss_reduc = cfg_arch['loss_reduction']
 58 |     throttle = cfg_arch['throttle']
 59 |     title = f"Model name:\n{model_name[14:]}\nNum layers in block: {layers_in_block}, Num blocks in training: {max_rec}\n Mask all before equals: {mask_bf_eq}, Train time: 24 hr\n attn: {attn_type}, temp: Greedy{', loss: 'if loss_reduc == 'none' else ''}{', throttle' if throttle else ''}"
 60 | 
 61 | 
 62 |     # Define the pattern to search for
 63 |     file_pattern = directory_path + "/accs_grid_quick_big_eval_?_even.json"
 64 |     matching_files_even = glob.glob(file_pattern, recursive=True)
 65 |     file_pattern = directory_path + "/accs_grid_quick_big_eval_??_even.json"
 66 |     matching_files_even += glob.glob(file_pattern, recursive=True)
 67 | 
 68 |     file_pattern = directory_path + "/accs_grid_quick_big_eval_?_odd.json"
 69 |     matching_files_odd = glob.glob(file_pattern, recursive=True)
 70 |     file_pattern = directory_path + "/accs_grid_quick_big_eval_??_odd.json"
 71 |     matching_files_odd += glob.glob(file_pattern, recursive=True)
 72 | 
 73 |     # Print the matching files
 74 |     number_pattern_even = re.compile(r'accs_grid_quick_big_eval_(\d+)_even.json')
 75 |     number_pattern_odd = re.compile(r'accs_grid_quick_big_eval_(\d+)_odd.json')
 76 | 
 77 |     # Print the matching files and the numbers extracted from them
 78 |     file_paths = []
 79 |     even_nums = []
 80 |     odd_nums = []
 81 | 
 82 |     for file_path in matching_files_even:
 83 |         match = number_pattern_even.search(file_path)
 84 |         if match:
 85 |             number = match.group(1)
 86 |             if number not in even_nums:
 87 |                 even_nums.append(number)
 88 |                 print("Number:", number)
 89 |             else:
 90 |                 continue
 91 |         print("File:", file_path)
 92 |         file_paths.append(file_path)
 93 | 
 94 |     for file_path in matching_files_odd:
 95 |         match = number_pattern_odd.search(file_path)
 96 |         if match:
 97 |             number = match.group(1)
 98 |             if number not in odd_nums:
 99 |                 odd_nums.append(number)
100 |                 print("Number:", number)
101 |             else:
102 |                 continue
103 |         print("File:", file_path)
104 |         file_paths.append(file_path)
105 | 
106 |     arr = np.zeros((100, 100))
107 |     for file_path in file_paths:
108 |         with open(file_path, 'r') as file:
109 |             data = json.load(file)
110 |             if len(data) == 3:
111 |                 data = data[0]
112 |         arr = arr + np.array(data)
113 |         
114 |     title = title + "\n Even: "+', '.join(sorted(even_nums, key=lambda x: int(x))) + "\n Odd: "+', '.join(sorted(odd_nums, key=lambda x: int(x)))
115 |     grid_plotter(arr, type=type, path=f"{directory_path}/", title=title, rect_size=rect_size)
116 |     print(f"{model_name}")
117 | 
118 | if __name__ == "__main__":
119 |     main()


--------------------------------------------------------------------------------
/pretty_plotter_sort.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sns
  6 | import cv2
  7 | 
  8 | def grid_plotter(data, title="", path=None):
  9 |     data = np.array(data)
 10 |     df = pd.DataFrame(data)
 11 | 
 12 |     # find the average accuracy
 13 |     avg = np.mean(data)
 14 | 
 15 |     # Create the heatmap
 16 |     plt.figure(figsize=(10, 8))
 17 |     sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".1f", annot_kws={'size': 8, 'rotation': 0}, vmin=0, vmax=100)
 18 | 
 19 |     # Customize the plot
 20 |     plt.title(f"Accuracy - percetange, rounded to 1dp : {title}, Avg acc: {avg}")
 21 |     plt.ylabel("Maximum n-digit number (1-n)")
 22 |     plt.xlabel("Length of array to sort")
 23 |     size = data.shape[0]
 24 |     plt.xticks(np.arange(0.5, size + 0.5, 1), labels=np.arange(1, size + 1, 1))
 25 |     plt.yticks(np.arange(0.5, size + 0.5, 1), labels=np.arange(1, size + 1, 1))
 26 | 
 27 |     plt.savefig(f"{path}", bbox_inches='tight')
 28 |     plt.clf()
 29 | 
 30 | 
 31 | def run(names, short_hand, base_dir, sort_plots_path):
 32 |     os.makedirs(sort_plots_path, exist_ok=True)
 33 |     all_data_acc_dict = {}
 34 |     all_data_top_1_acc_dict = {}
 35 | 
 36 |     for i in range(len(names)):
 37 |         name = names[i]
 38 |         extra_name = short_hand[i]
 39 |         dict_key = extra_name[0]
 40 |         extra_name = extra_name[0] + "_" + extra_name[1]
 41 |         all_data_path = base_dir + name + "/downstream/"
 42 | 
 43 |         # get all the directories in the path that start with all_outputs
 44 |         all_dirs = os.listdir(all_data_path)
 45 |         # remove the ones that are not directories
 46 |         all_dirs = [dir for dir in all_dirs if os.path.isdir(all_data_path + dir)]
 47 |         all_images = []
 48 |         for dir in all_dirs:
 49 |             if "all_outputs" in dir:
 50 |                 # get the recurrence
 51 |                 recurrence = dir.split("_")[-1]
 52 |                 if "recurrence" not in recurrence:
 53 |                     continue
 54 | 
 55 |                 # get all the files in the directory
 56 |                 files = os.listdir(all_data_path + dir + "/")
 57 |                 all_images_local = []
 58 | 
 59 |                 all_data_acc = {}
 60 |                 all_data_top_1_acc = {}
 61 |                 max_size = 0
 62 | 
 63 |                 print(extra_name)
 64 |                 print("dir", dir)
 65 | 
 66 |                 for file in files:
 67 |                     if ".txt" in file:
 68 |                         all_info = file.split(".")[0]
 69 |                         all_info = all_info.split("_")
 70 |                         data_size_1 = int(all_info[-2])
 71 |                         data_size_2 = int(all_info[-1])
 72 | 
 73 |                         if data_size_1 > max_size:
 74 |                             max_size = data_size_1
 75 |                         if data_size_2 > max_size:
 76 |                             max_size = data_size_2
 77 | 
 78 |                         # get the accuracy
 79 |                         with open(all_data_path + dir + "/" + file, "r") as f:
 80 |                             acc = float(f.read())
 81 |                             if "top_1_acc" in file:
 82 |                                 all_data_top_1_acc[(data_size_1, data_size_2)] = acc
 83 |                             else:
 84 |                                 all_data_acc[(data_size_1, data_size_2)] = acc
 85 | 
 86 |                 # create the grid plot
 87 |                 data = np.zeros((max_size, max_size))
 88 |                 for key in all_data_acc.keys():
 89 |                     data[key[0] - 1][key[1] - 1] = all_data_acc[key]
 90 |                 grid_plotter(data,
 91 |                             title=f"{extra_name} {recurrence} acc",
 92 |                             path=f"./{sort_plots_path}/{extra_name}_{recurrence}_acc.png")
 93 | 
 94 |                 if dict_key not in all_data_acc_dict.keys():
 95 |                     all_data_acc_dict[dict_key] = []
 96 |                     all_data_top_1_acc_dict[dict_key] = []
 97 | 
 98 |                 all_data_acc_dict[dict_key].append(data)
 99 | 
100 |                 data = np.zeros((max_size, max_size))
101 |                 for key in all_data_top_1_acc.keys():
102 |                     data[key[0] - 1][key[1] - 1] = all_data_top_1_acc[key]
103 |                 grid_plotter(data,
104 |                             title=f"{extra_name} {recurrence} top_1_acc",
105 |                             path=f"./{sort_plots_path}/{extra_name}_{recurrence}_top_1_acc.png")
106 | 
107 |                 all_data_top_1_acc_dict[dict_key].append(data)
108 | 
109 | 
110 |                 all_images_local.append(cv2.imread(f"./{sort_plots_path}/{extra_name}_{recurrence}_acc.png"))
111 |                 all_images_local.append(cv2.imread(f"./{sort_plots_path}/{extra_name}_{recurrence}_top_1_acc.png"))
112 |                 all_images_local = cv2.hconcat(all_images_local)
113 |                 # write this image
114 |                 all_images.append((all_images_local, f"{extra_name}_{recurrence}.png"))
115 | 
116 |         os.makedirs(f"./{sort_plots_path}/final/", exist_ok=True)
117 |         if len(all_images) == 1:
118 |             all_images_local, name = all_images[0]
119 |             cv2.imwrite(f"./{sort_plots_path}/final/{name}", all_images_local)
120 |         else:
121 |             os.makedirs(f"./{sort_plots_path}/final/{extra_name}/", exist_ok=True)
122 |             for all_images_local, name in all_images:
123 |                 cv2.imwrite(f"./{sort_plots_path}/final/{extra_name}/{name}", all_images_local)
124 | 
125 | if __name__ == "__main__":
126 |     names = ["sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_1_24_run_1"]
127 |     short_hand = [("rev_abacus_fire_8x1", "v1")] # the shorthand names for the runs you want to plot, in the same order as `names`
128 | 
129 |     base_dir = "cramming-data/"
130 |     sort_plots_path = "./sort_plots/"
131 |     run(names, short_hand, base_dir, sort_plots_path)


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.black]
6 | line-length = 140
7 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | [metadata]
 4 | name = cramming
 5 | version = 0.1.0
 6 | author = Sean McLeish
 7 | author_email = smcleish@umd.edu
 8 | url = https://github.com/mcleish7/arithmetic
 9 | description = Fork of cramming for next token prediction
10 | long_description = file: README.md, LICENSE.md
11 | long_description_content_type = text/markdown
12 | license = MIT
13 | license_file = LICENSE.md
14 | platform = any
15 | keywords = Machine Learning, Language Modeling
16 | classifiers =
17 |     License :: OSI Approved :: MIT License
18 |     Operating System :: OS Independent
19 |     Programming Language :: Python
20 | homepage = "https://github.com/mcleish7/arithmetic"
21 | repository = "https://github.com/mcleish7/arithmetic"
22 | documentation = """
23 | 
24 | [options]
25 | zip_safe = False
26 | include_package_data = True
27 | python_requires = >= 3.10
28 | packages = find:
29 | 
30 | setup_requires =
31 |     setuptools
32 | 
33 | install_requires =
34 |     torch >= 2.0.0
35 |     hydra-core >= 1.1
36 |     datasets
37 |     tokenizers
38 |     transformers
39 |     evaluate
40 |     scipy
41 |     scikit-learn # for metrics
42 |     pynvml
43 |     psutil
44 |     einops
45 |     safetensors
46 |     apache-beam  # only used for wikipedia ...
47 |     zstandard    # only used for the Pile
48 |     wandb # if you want to use it
49 |     matplotlib==3.8.3 # the versions of plt and sns are fixed for annotating the heatmaps
50 |     seaborn==0.13.2
51 |     opencv-python
52 | 
53 | scripts =
54 |   pretrain.py
55 |   arithmetic_eval_quicker.py
56 | 
57 | [options.package_data]
58 | * =  "*.yaml", "*.txt"
59 | 
60 | 
61 | [check-manifest]
62 | ignore =
63 |     .ipynb
64 |     .sh
65 | 
66 | 
67 | #basically the pytorch flake8 setting from https://github.com/pytorch/pytorch/blob/master/.flake8
68 | [flake8]
69 | select = B,C,E,F,P,T4,W,B9
70 | max-line-length = 140
71 | # C408 ignored because we like the dict keyword argument syntax
72 | # E501 is not flexible enough, we're using B950 instead
73 | ignore =
74 |     E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
75 | per-file-ignores = __init__.py: F401 torch/utils/cpp_extension.py: B950
76 | optional-ascii-coding = True
77 | exclude =
78 |     .git,
79 |     __pycache__,
80 |     scripts,
81 |     tables,
82 |     outputs,
83 |     *.pyi
84 | 


--------------------------------------------------------------------------------
/shells/addition_ff.sh:
--------------------------------------------------------------------------------
 1 | ## FF
 2 | # nope
 3 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None
 4 | 
 5 | # fire
 6 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" 
 7 | 
 8 | # abacus
 9 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus
10 | 
11 | ## FF w/ II
12 | # nope
13 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
14 | # fire
15 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"  arch.forward_only_model_with_skip=True
16 | # abacus
17 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True
18 | 
19 | 
20 | ## FF w/ II
21 | # Abacus + FIRE
22 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.forward_only_model_with_skip=True arch.embedding.pos_embedding=abacus 
23 | # Abacus + RoPE
24 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_rope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True arch.attention.type="self-attention" arch.attention.rotary_embedding=true


--------------------------------------------------------------------------------
/shells/addition_lt.sh:
--------------------------------------------------------------------------------
 1 | ### Looped Transformer experiments
 2 | # vary number of layers in recurrent_block: arch.layers_in_recurrent_block
 3 | # vary number of recurrences: arch.maximal_recurrence
 4 | 
 5 | # NOPE
 6 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None
 7 | # FIRE
 8 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" 
 9 | # ABACUS
10 | python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus
11 | 


--------------------------------------------------------------------------------
/shells/bitwise_or.sh:
--------------------------------------------------------------------------------
 1 | # bitwise or is sometimes referred to as pos_arth in the code
 2 | 
 3 | ## LT
 4 | # NOPE
 5 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None
 6 | #  FIRE
 7 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"
 8 | # abacus
 9 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus
10 | 
11 | ## FF
12 | #nope
13 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
14 | # fire
15 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.forward_only_model_with_skip=True
16 | # abacus
17 | python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True
18 | 
19 | ## FF w/ II
20 | # nope
21 | python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
22 | # fire
23 | python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"  arch.forward_only_model_with_skip=True
24 | # abacus
25 | python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True
26 | 


--------------------------------------------------------------------------------
/shells/evaluation.sh:
--------------------------------------------------------------------------------
 1 | # there is an automated helper in gen_eval_script.py for generating these evaluation scripts
 2 | 
 3 | # Addition
 4 | python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=105 big_eval_step_<STEP_NUM>=True reverse_inputs=True checkerboard=<EVEN/ODD> remove_padding=True data.sources.arithmetic.tokenizer_type="pad"
 5 | 
 6 | # Extended Addition Eval, i.e. 100
 7 | python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=105 big_eval_step_5=True reverse_inputs=True checkerboard=even remove_padding=True extended_eval=True data.sources.arithmetic.tokenizer_type="pad"
 8 | 
 9 | # Multiplication
10 | python arithmetic_eval_quicker.py name=<NAME> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=30 mul=True data.sources.arithmetic.tokenizer_type="pad"
11 | 
12 | # Sorting
13 | # max_size_given = end of the evaluation grid (passed as final_size + 1), start_ind_1_given / start_ind_2_given = start of the grid, i.e. this evaluates from (1, 1) to (final_size, final_size)
14 | python sort_eval.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> sort_reverse=True data.sources.arithmetic.tokenizer_type='sort' max_size_given={final_size + 1} start_ind_1_given={1} start_ind_2_given={1}
15 | 
16 | # Bitwise OR
17 | python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=105 big_eval_step_<STEP_NUM>=True checkerboard=<EVEN/ODD> pos_arth_ood=True data.sources.arithmetic.tokenizer_type="pad" remove_padding=False


--------------------------------------------------------------------------------
/shells/generate_and_tokenize_data.sh:
--------------------------------------------------------------------------------
 1 | ## Training Data -- these commands approximately correspond to the zipped data we provide
 2 | 
 3 | # bitwise or
 4 | python create_pos_or_variants.py --n 20 --m 20 --dir_name <NAME> --max 100
 5 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01
 6 | 
 7 | # addition
 8 | python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name <NAME> --reverse_all
 9 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01
10 | 
11 | # addition with index hints
12 | python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name <NAME> --reverse_all --index_hints
13 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type index
14 | 
15 | # multiplication
16 | python create_data_split.py --bucket --op x --n 15 --m 15 --limit 20000000 --dir_name <NAME>  --reverse_all --p 0.0
17 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01
18 | 
19 | # sorting
20 | python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n 10 --m 10 --limit 20000000 --dir <NAME> --sort_generation_method bucket_uniform_distribution --reverse_all
21 | 
22 | ## Evaluation Data -- run the generation and tokenization command(s) once for each pair of operand lengths <i>, <j>
23 | # bitwise or
24 | python create_pos_or_variants.py --n <i> --m <j> --dir_name <NAME> --exact --eval --max 100
25 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0
26 | 
27 | # addition
28 | python create_data_split.py --op + --n <i> --m <j> --num_samples 100 --dir_name <NAME> --exact
29 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0
30 | 
31 | # multiplication
32 | python create_data_split.py --op x --n <i> --m <j> --num_samples 100 --dir_name <NAME> --exact
33 | python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0
34 | 
35 | # sorting
36 | python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n <i> --m <j> --limit 100 --dir <NAME> --sort_generation_method bucket_uniform_distribution --reverse_all --exact


--------------------------------------------------------------------------------
/shells/multiplication.sh:
--------------------------------------------------------------------------------
## Multiplication: Looped Transformer experiments only.
## Every run uses 8 GPUs, a recurrent block of 4 layers with 4 recurrences, and a 24 hour budget.

# FIRE attention embedding + Abacus positional embedding
torchrun --nproc_per_node=8 --standalone pretrain.py \
    name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_fire_abacus_8_gpu \
    wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
    impl.microbatch_size=512 budget=24 impl.compile_torch=False \
    arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 \
    arch.hidden_size=1024 arch.intermed_size=2048 \
    impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
    data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" \
    train.optim.lr=0.00006 data.sources.arithmetic.tokenizer_type="pad" \
    arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" \
    arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True \
    arch.embedding.pos_embedding="abacus"

# FIRE attention embedding, no positional embedding at the input
torchrun --nproc_per_node=8 --standalone pretrain.py \
    name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_fire_nope_8_gpu \
    wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
    impl.microbatch_size=512 budget=24 impl.compile_torch=False \
    arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 \
    arch.hidden_size=1024 arch.intermed_size=2048 \
    impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
    data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" \
    train.optim.lr=0.00006 data.sources.arithmetic.tokenizer_type="pad" \
    arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" \
    arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True \
    arch.embedding.pos_embedding=None

# Abacus positional embedding only (default attention settings, learning rate 0.0001)
torchrun --nproc_per_node=8 --standalone pretrain.py \
    name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_abacus_8_gpu \
    wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
    impl.microbatch_size=512 budget=24 impl.compile_torch=False \
    arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 \
    arch.hidden_size=1024 arch.intermed_size=2048 \
    impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
    data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" \
    train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" \
    arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True \
    arch.embedding.pos_embedding="abacus"


--------------------------------------------------------------------------------
/shells/sorting.sh:
--------------------------------------------------------------------------------
 1 | # REMINDER SET BASE DIR
 2 | 
 3 | 
 4 | ## fire reverse
 5 | ## fire reverse recall
 6 | ## fire reverse recurrence
 7 | 
 8 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_8x1_1_24_run_1 \
 9 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
10 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
11 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
12 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
13 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
14 | 	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'
15 | 
16 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_recall_8x1_1_24_run_1 \
17 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
18 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
19 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
20 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
21 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
22 | 	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last' arch.forward_only_model_with_skip=True
23 | 
24 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_1x8_1_24_run_1 \
25 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
26 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
27 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
28 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
29 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
30 | 	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'
31 | 
32 | ## abacus reverse
33 | ## abacus reverse recall
34 | ## abacus reverse recurrence
35 | 
36 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_8x1_1_24_run_1 \
37 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
38 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
39 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
40 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
41 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus"
42 | 
43 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_8x1_skip_1_24_run_1 \
44 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
45 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
46 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
47 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
48 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" arch.forward_only_model_with_skip=True
49 | 
50 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_1x8_1_24_run_1 \
51 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
52 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
53 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
54 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
55 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus"
56 | 
57 | 
58 | ## abacus fire reverse
59 | ## abacus fire reverse recall
60 | ## abacus fire reverse recurrence
61 | 
62 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_1_24_run_1 \
63 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
64 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
65 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
66 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
67 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
68 | 	arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"
69 | 
70 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_skip_1_24_run_1 \
71 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
72 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
73 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
74 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
75 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
76 | 	arch.forward_only_model_with_skip=True arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"
77 | 
78 | torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_1x8_1_24_run_1 \
79 | 	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
80 | 	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
81 | 	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
82 | 	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
83 | 	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
84 | 	arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"


--------------------------------------------------------------------------------
/upload_processed_dataset.py:
--------------------------------------------------------------------------------
 1 | """Script to upload a processed dataset to the huggingface hub. You probably don't need this :)"""
 2 | 
 3 | 
 4 | import hydra
 5 | import logging
 6 | from omegaconf import OmegaConf
 7 | import tempfile
 8 | import os
 9 | 
10 | from datasets import load_dataset
11 | 
12 | import cramming
13 | 
14 | 
15 | log = logging.getLogger(__name__)
16 | 
17 | 
def upload(cfg, setup):
    """Push the preprocessed pretraining corpus and its tokenizer to the Hugging Face hub.

    The corpus is loaded via ``cramming.load_pretraining_corpus``, re-chunked into parquet
    files that contain only the ``input_ids`` column, and pushed as a private dataset named
    ``{cfg.data.name}_{checksum}``. The tokenizer files found under
    ``cfg.impl.path/<dataset name>/tokenizer`` are then uploaded into a ``tokenizer/`` folder
    of the same dataset repository.

    Args:
        cfg: Hydra/OmegaConf configuration; ``cfg.data`` and ``cfg.impl`` are read here.
        setup: Unused. Accepted because ``cramming.utils.main_launcher`` is handed this
            function as its job callable (presumably called as ``fn(cfg, setup)`` -- confirm
            against the launcher).
    """
    dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl)
    checksum = cramming.data.utils.checksum_config(cfg.data)
    processed_dataset_name = f"{cfg.data.name}_{checksum}"

    use_own_chunking = True
    chunk_size = 8192 * 32
    num_rows = len(dataset)
    # Ceiling division: `num_rows // chunk_size + 1` produced an empty trailing chunk whenever
    # num_rows was an exact multiple of chunk_size. Keep at least one file so that an empty
    # dataset still yields a (schema-only) parquet file to load from.
    num_files = max(1, -(-num_rows // chunk_size))
    target_types = ["input_ids"]

    files = []
    # Split dataset in parquet files
    with tempfile.TemporaryDirectory() as tmpdirname:
        if use_own_chunking:
            # Loop through the dataset and write each chunk to a Parquet file
            # This is not really necessary, but nice to save only target_types and to match chunk sizes to target batch sizes
            for idx in range(num_files):
                start = idx * chunk_size
                stop = min(num_rows, start + chunk_size)
                chunk = dataset.select(range(start, stop))
                filename = f"{tmpdirname}/train_{idx}.parquet"
                chunk.to_pandas()[target_types].to_parquet(filename, index=False)
                files.append(filename)
                log.info(f"Chunk {idx} written to file {filename}.")

            # Re-assemble parqueted dataset (indexed with "train" below)
            dataset = load_dataset("parquet", data_files=files)

        # Define the dataset info
        description = f"""This is a preprocessed dataset for the cramming-project.

                                Use only with the tokenizer prescribed here.
                                This version is {processed_dataset_name}, which corresponds to the following setup:
                                {OmegaConf.to_yaml(cfg, resolve=True)}

                                Limitations and bias:
                                This training data was further filtered and sorted beyond the normal preprocessing.
                                These modifications were not tested for unintended consequences.

                              """
        dataset["train"].info.description = description
        # dataset_tags = ["cramming", "English", "preprocessed"]

        # Launch upload
        log.info("Preparing for dataset upload ...")
        dataset.push_to_hub(processed_dataset_name, private=True)

        # Upload tokenizer to same address - this is annoying because by default tokenizers are pushed to model directories
        # tokenizer.push_to_hub(processed_dataset_name) -> this will push to a new directory in HF models
        from huggingface_hub import HfApi

        api = HfApi()
        log.info("Preparing for tokenizer upload ...")
        tokenizer_loc = os.path.join(cfg.impl.path, processed_dataset_name, "tokenizer")
        # whoami() is a network round-trip and does not change between files: resolve the repo id once.
        repo_id = f"{api.whoami()['name']}/{processed_dataset_name}"
        for tokenizer_file in os.listdir(tokenizer_loc):
            api.upload_file(
                path_or_fileobj=os.path.join(tokenizer_loc, tokenizer_file),
                # Paths inside a hub repo always use "/", independent of the local OS separator.
                path_in_repo=f"tokenizer/{tokenizer_file}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        log.info("Upload completed succesfully.")
78 | 
79 | 
@hydra.main(
    version_base="1.3",
    config_path="cramming/config",
    config_name="cfg_pretrain",
)
def launch(cfg):
    """Hydra entry point: hand the composed config and ``upload`` to the cramming launcher."""
    cramming.utils.main_launcher(cfg, upload, job_name="upload")


if __name__ == "__main__":
    launch()
87 | 


--------------------------------------------------------------------------------