├── .gitignore ├── LICENSE ├── README.md ├── docs ├── README.md └── how-to-tokenize.md ├── pyproject.toml ├── scripts ├── compare_data_configs.py ├── compare_wandb_configs.py ├── launch_ruler.sh └── summarize_data_mix.py ├── src └── cookbook │ ├── __init__.py │ ├── aliases.py │ ├── cli │ ├── cli.py │ ├── core.py │ ├── eval.py │ ├── pmr.py │ └── utils.py │ ├── constants.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ └── mixes │ │ ├── dolmino100.txt │ │ ├── dolmino300.txt │ │ ├── dolmino50.txt │ │ ├── jallyrun100.txt │ │ ├── jallyrun50.txt │ │ └── stackexchange.txt │ ├── eval │ ├── cache.py │ ├── conversion.py │ ├── conversion_from_hf.py │ ├── datalake.py │ ├── evaluation.py │ ├── miniframe.py │ ├── named_tasks.py │ └── results.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── config.py │ └── evaluators.py │ ├── recipes │ ├── love2code │ │ ├── train-190M-1xC-love2code-weka-python-hlr-bpb-only.yaml │ │ ├── train-1b-5xC-love2code-starcoder1-weka-hlr.yaml │ │ ├── train-1b-5xC-love2code-starcoder1-weka.yaml │ │ ├── train-1b-5xC-love2code-weka-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python-no-prose-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python.yaml │ │ ├── train-1b-5xC-love2code-weka-starcoder1-noprose.yaml │ │ └── train-1b-5xC-love2code-weka.yaml │ ├── olmo2 │ │ ├── anneal │ │ │ ├── train-1b-dclm-dolma2-anneal-10b.yaml │ │ │ ├── train-7b-code-dolma2-anneal-10b-augusta.yaml │ │ │ ├── train-7b-dclm-only-anneal-10b-control.yaml │ │ │ ├── train-7b-finemath3p-anneal-10b-50split-dclm.yaml │ │ │ ├── train-7b-finemath3p-anneal-10b-50split.yaml │ │ │ ├── train-7b-wiki-concat-anneal-10b-50split-dclm.yaml │ │ │ ├── train-7b-wiki-concat-anneal-10b-50split.yaml │ │ │ └── train-7b-wiki-concat-anneal-10b.yaml │ │ ├── train-1b-1xC-dclm.yaml │ │ ├── train-1b-5xC-dclm-dolma2-180k-wsd.yaml │ │ ├── train-1b-5xC-dclm-dolma2-180k.yaml │ │ ├── train-1b-5xC-dclm-dolma2-augusta.yaml │ │ ├── train-1b-5xC-dclm-dolma2-wsd.yaml │ │ ├── train-1b-5xC-dclm-dolma2.yaml │ │ ├── train-1b-5xC-dclm-superbpe-wsd.yaml │ │ ├── train-1b-5xC-dclm-superbpe.yaml │ │ ├── train-1b-5xC-olmo2-baseline.yaml │ │ ├── train-7b-1xC-dclm-dolma2-180k.yaml │ │ ├── train-7b-1xC-dclm-dolma2.yaml │ │ └── train-7b-1xC-dclm-superbpe.yaml │ ├── olmo3-evals │ │ └── README.md │ ├── olmo3-midtraining │ │ └── example-olmo2_7b-web-code-reasoning-microanneal.yaml │ ├── olmo3 │ │ └── pstar │ │ │ ├── mixes │ │ │ ├── dclm_natural.json │ │ │ ├── dclm_pstar_001.json │ │ │ ├── dclm_pstar_002.json │ │ │ └── dist-plot.py │ │ │ ├── train-1b-5xC-pstar-001-dclm-dolma2.yaml │ │ │ ├── train-1b-5xC-pstar-002-dclm-dolma2.yaml │ │ │ └── train-1b-5xC-pstar-natural-dclm-dolma2.yaml │ └── spring2code │ │ └── scaling │ │ ├── spring2code-190m-5xC-weka-python-only-bpb-hlr.yaml │ │ ├── spring2code-190m-5xC-weka-python-only-bpb-vhlr.yaml │ │ ├── spring2code-190m-5xC-weka-top15-bpb-hlr-superbpe.yaml │ │ ├── spring2code-190m-5xC-weka-top15-bpb-hlr.yaml │ │ ├── spring2code-1b-5xC-weka-python-only-bpb-hlr.yaml │ │ ├── spring2code-1b-5xC-weka-top15-bpb-hlr-superbpe.yaml │ │ └── spring2code-1b-5xC-weka-top15-bpb-hlr.yaml │ ├── remote │ ├── __init__.py │ ├── __main__.py │ ├── aws.py │ ├── base.py │ ├── gantry_launcher.py │ └── gcp.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── clusters.py │ ├── config.py │ └── data.py └── tests ├── __init__.py └── cookbook ├── __init__.py ├── eval ├── __init__.py └── test_miniframe.py └── remote ├── __init__.py └── test_remote.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # PyPI configuration file
171 | .pypirc
172 | 
173 | 
174 | # MacOS files
175 | .DS_Store
176 | 
177 | # vscode
178 | .vscode/
179 | 
180 | # temporary directory
181 | tmp/
182 | temp/
183 | uv.lock
184 | 
185 | 
186 | # ignore vscode workspace settings
187 | *.code-workspace
188 | 
-------------------------------------------------------------------------------- /docs/README.md: --------------------------------------------------------------------------------
1 | # Documentation
2 | 
3 | This directory contains guides on how to use the OLMo Cookbook.
4 | 
-------------------------------------------------------------------------------- /docs/how-to-tokenize.md: --------------------------------------------------------------------------------
1 | # How to Tokenize
2 | 
3 | This is a brief guide on how to tokenize data on EC2.
4 | We will use Poor Man Ray to create a new instance, install Dolma, and then SSH into the machine to tokenize the data.
5 | 
6 | ## Step 0: install Poor Man Ray
7 | 
8 | Clone the OLMo Cookbook repository and install it:
9 | 
10 | ```bash
11 | git clone https://github.com/allenai/olmo-cookbook.git
12 | cd olmo-cookbook
13 | pip install -e .
14 | ```
15 | 
16 | Ensure your AWS environment variables are set:
17 | ```bash
18 | export AWS_ACCESS_KEY_ID="[your key]"
19 | export AWS_SECRET_ACCESS_KEY="[your secret]"
20 | export AWS_DEFAULT_REGION="us-east-1"
21 | ```
22 | 
23 | ## Step 1: create a cluster
24 | 
25 | Create a cluster on EC2 where we will run tokenization; we will use one `i4i.32xlarge` instance.
26 | 
27 | ```bash
28 | cluster_name="YOUR_CLUSTER_NAME"
29 | poormanray create -n $cluster_name -t i4i.32xlarge --number 1
30 | ```
31 | 
32 | Then run two setup commands to set up storage and the toolkit:
33 | 
34 | ```bash
35 | poormanray setup-d2tk -n $cluster_name -d
36 | poormanray setup-dolma-python -n $cluster_name -d
37 | ```
38 | 
39 | The `-d` flag runs these commands in the background; they take a few minutes to finish. You can check the status of the first command by running
40 | 
41 | ```bash
42 | poormanray run -n $cluster_name -c 'ls'
43 | ```
44 | 
45 | and checking whether a `datamap-rs` directory exists; for the second, run
46 | 
47 | ```bash
48 | poormanray run -n $cluster_name -c 'uv run dolma'
49 | ```
50 | 
51 | and checking whether the `dolma` command is found.
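If you would rather not re-run these checks by hand, you can poll until setup completes. This is a minimal sketch (not part of the cookbook CLI itself); it assumes `poormanray run` prints the remote command's output to stdout:

```bash
# Poll the node until the datamap-rs directory created by setup-d2tk appears.
until poormanray run -n $cluster_name -c 'ls' | grep -q datamap-rs; do
    echo "setup still running, retrying in 60s..."
    sleep 60
done
```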
52 | 
53 | ## Step 2: Download data to node
54 | 
55 | Use `list` to get the IP of the machine:
56 | 
57 | ```bash
58 | >>> poormanray list -n $cluster_name
59 | 
60 | 
61 | Id: i-xxxxxxxxxxxxxxxxx
62 | Name: -0000
63 | Type: i4i.32xlarge
64 | State: running
65 | IP: xxx.yyy.zzz.ttt
66 | Status: 2/2
67 | Tags: {"Contact": "", "Name": "-0000", "Project": ""}
68 | ```
69 | 
70 | Now SSH into the machine and download the data using `s5cmd`. I recommend doing this inside a tmux session:
71 | 
72 | ```bash
73 | ssh ec2-user@xxx.yyy.zzz.ttt
74 | 
75 | s5cmd cp -sp \
76 |     "s3://ai2-llm/pretraining-data/sources/dataset-name/documents/*" \
77 |     "/mnt/raid0/ai2-llm/pretraining-data/sources/dataset-name/documents/"
78 | ```
79 | 
80 | Make sure to use `*` at the end of the source path and a trailing `/` at the end of the destination path.
81 | 
82 | ## Step 3: Tokenize the data
83 | 
84 | Now you can tokenize as follows:
85 | 
86 | ```bash
87 | tokenizer="allenai/dolma2-tokenizer"
88 | 
89 | uv run huggingface-cli download $tokenizer --local-dir /mnt/raid0/tokenizer
90 | 
91 | uv run dolma tokens \
92 |     --documents "/mnt/raid0/ai2-llm/pretraining-data/sources/dataset-name/documents/*" \
93 |     --destination "/mnt/raid0/ai2-llm/preprocessed/dataset-name/${tokenizer}" \
94 |     --tokenizer.name_or_path /mnt/raid0/tokenizer/tokenizer.json \
95 |     --tokenizer.eos_token_id 100257 \
96 |     --tokenizer.pad_token_id 100277 \
97 |     --no-tokenizer.segment_before_tokenization \
98 |     --tokenizer.encode_special_tokens \
99 |     --processes $(python3 -c "import multiprocessing; print(multiprocessing.cpu_count())") \
100 |     --max_size 4_000_000_000 \
101 |     --sample_ring_prop \
102 |     --dtype uint32
103 | ```
104 | 
105 | ## Step 4: Upload data to S3
106 | 
107 | Finish by uploading the data to S3:
108 | 
109 | ```bash
110 | s5cmd cp -sp \
111 |     "/mnt/raid0/ai2-llm/preprocessed/dataset-name/${tokenizer}/*" \
112 |     "s3://ai2-llm/preprocessed/dataset-name/${tokenizer}/"
113 | ```
114 | 
115 | Once the upload finishes, you are done with the machine.
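If you want to confirm the tokenized files actually landed in S3, listing the destination prefix is a cheap sanity check. A minimal sketch, assuming the same `$tokenizer` variable as above:

```bash
# List a few of the uploaded .npy files to spot-check the destination prefix.
s5cmd ls "s3://ai2-llm/preprocessed/dataset-name/${tokenizer}/*" | head
```

When everything looks right, terminate the cluster: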
116 | 117 | ```bash 118 | poormanray terminate -n $cluster_name 119 | ``` 120 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "olmo-cookbook" 7 | dynamic = ["version"] 8 | readme = "README.md" 9 | description = "" 10 | authors = [ 11 | { name = "Allen Institute for Artificial Intelligence", email = "oe-data-engineering@allenai.org" } 12 | ] 13 | requires-python = ">=3.10,<3.14" 14 | license = { file = "LICENSE" } 15 | dependencies = [ 16 | "boto3", 17 | "click", 18 | "requests", 19 | "platformdirs", 20 | "pydantic", 21 | "s3fs", 22 | "gcsfs", 23 | "rich", 24 | "smart_open", 25 | "yaspin", 26 | "PyYAML>=6.0,<7.0", 27 | "paramiko>=3.5,<3.6", 28 | "tabulate", 29 | "packaging>=24.2", 30 | "tqdm>=4.67.1", 31 | "huggingface-hub[hf-transfer]>=0.34,<0.35", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "ruff>=0.12.8", 37 | "boto3-stubs[essential,ec2,s3,ssm]", 38 | "google-api-python-client-stubs", 39 | ] 40 | beaker = [ 41 | "beaker-py>=1.17.1,<2", 42 | "GitPython>=3.0,<4.0", 43 | ] 44 | wandb = [ 45 | "wandb", 46 | ] 47 | checkpoints = [ 48 | "google-cloud-storage", 49 | "boto3" 50 | ] 51 | all = [ 52 | "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@7afdc3ed67f00b090aae11b5101ef147160274cc", #c779ca546cc3194e73e7491aaefcdffbed042c65", 53 | "beaker-py>=1.17.1,<2", 54 | "GitPython>=3.0,<4.0", 55 | "wandb", 56 | ] 57 | 58 | [project.scripts] 59 | olmo-cookbook = "cookbook.cli:cli.cli" 60 | olmo-cookbook-eval = "cookbook.cli:eval.cli" 61 | olmo-cookbook-core = "cookbook.cli:core.cli" 62 | poormanray = "cookbook.cli:pmr.cli" 63 | 64 | 65 | [tool.black] 66 | line-length = 115 67 | target-version = ['py39'] 68 | include = '\.pyi?$' 69 | exclude = ''' 70 | ( 71 | __pycache__ 72 | | \.git 73 | | \.mypy_cache 74 | | \.pytest_cache 75 | | \.vscode 76 | | \.venv 77 | | \bdist\b 78 | | \bdoc\b 79 | | scratch/ 80 | | build/ 81 | ) 82 | ''' 83 | 84 | [tool.isort] 85 | profile = "black" 86 | multi_line_output = 3 87 | 88 | [tool.ruff] 89 | line-length = 115 90 | 91 | [tool.ruff.lint] 92 | ignore = ["F403", "F405", "E501"] 93 | exclude = [ 94 | ".bzr", 95 | ".direnv", 96 | ".eggs", 97 | ".git", 98 | ".venv", 99 | "venv", 100 | ".mypy_cache", 101 | "__pycache__", 102 | ".nox", 103 | ".pants.d", 104 | ".pytype", 105 | ".ruff_cache", 106 | ".svn", 107 | ".tox", 108 | "__pypackages__", 109 | "_build", 110 | "buck-out", 111 | "build", 112 | "dist", 113 | "node_modules", 114 | "doc", 115 | "pretrain_data", 116 | "inference", 117 | ] 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "**/__init__.py" = ["F401"] 121 | 122 | [tool.pyright] 123 | reportPrivateImportUsage = false 124 | 125 | [tool.mypy] 126 | ignore_missing_imports = true 127 | no_site_packages = true 128 | check_untyped_defs = true 129 | disable_error_code = "has-type" 130 | 131 | [[tool.mypy.overrides]] 132 | module = "tests.*" 133 | strict_optional = false 134 | 135 | [tool.pytest.ini_options] 136 | testpaths = "tests/" 137 | python_classes = [ 138 | "Test*", 139 | "*Test", 140 | ] 141 | log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s" 142 | log_level = "DEBUG" 143 | log_cli = false 144 | log_cli_level = "DEBUG" 145 | filterwarnings = [ 146 | 'ignore::FutureWarning:huggingface_hub\.file_download', 147 | 
'ignore::DeprecationWarning:pkg_resources', 148 | 'ignore::DeprecationWarning:google\.rpc', 149 | 'ignore::FutureWarning:torch\.distributed\.checkpoint\.default_planner', 150 | ] 151 | -------------------------------------------------------------------------------- /scripts/compare_wandb_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Examples: 4 | Comparing Peteish7 to OLMoE 5 | - python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmoe/runs/rzsn9tlc 6 | 7 | Comparing Peteish7 to Amberish7 8 | - python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2 9 | 10 | 11 | """ 12 | 13 | import logging 14 | import os 15 | import re 16 | from collections import Counter 17 | 18 | import click 19 | import wandb 20 | from olmo_core.utils import flatten_dict, prepare_cli_environment 21 | from rich.console import Console 22 | from rich.panel import Panel 23 | from rich.table import Table 24 | from rich.text import Text 25 | 26 | log = logging.getLogger(__name__) 27 | run_path_re = re.compile(r"^[^/]+/[^/]+/[^/]+$") 28 | run_path_url = re.compile(r"^https?://wandb.ai/([^/]+)/([^/]+)/runs/([^/]+)") 29 | console = Console() 30 | 31 | 32 | def parse_run_path(run_path: str) -> str: 33 | """For convenience, we allow run paths as well as URLs.""" 34 | run_path = run_path.strip("/") 35 | if run_path_re.match(run_path): 36 | return run_path 37 | 38 | m = run_path_url.match(run_path) 39 | if m is not None: 40 | entity, project, run_id = m.groups() 41 | return f"{entity}/{project}/{run_id}" 42 | 43 | raise ValueError(f"Could not parse '{run_path}'") 44 | 45 | 46 | def display_differences_table(left_config, right_config, title): 47 | # Create exclusive keys tables 48 | left_only_keys = left_config.keys() - right_config.keys() 49 | if left_only_keys: 50 | left_table = Table(title="Settings only in left", title_style="bold cyan") 51 | left_table.add_column("Key", style="dim") 52 | left_table.add_column("Value", no_wrap=False) 53 | 54 | for k in sorted(left_only_keys): 55 | left_table.add_row(str(k), str(left_config[k])) 56 | console.print(left_table) 57 | 58 | right_only_keys = right_config.keys() - left_config.keys() 59 | if right_only_keys: 60 | right_table = Table(title="Settings only in right", title_style="bold magenta") 61 | right_table.add_column("Key", style="dim") 62 | right_table.add_column("Value", no_wrap=False) 63 | 64 | for k in sorted(right_only_keys): 65 | right_table.add_row(str(k), str(right_config[k])) 66 | console.print(right_table) 67 | 68 | # Create differences table 69 | keys_with_differences = { 70 | k for k in left_config.keys() & right_config.keys() if left_config[k] != right_config[k] 71 | } 72 | 73 | if keys_with_differences: 74 | diff_table = Table(title=f"Differences in {title}", title_style="bold yellow") 75 | diff_table.add_column("Parameter", style="dim") 76 | diff_table.add_column("Left Value", style="cyan") 77 | diff_table.add_column("Right Value", style="magenta") 78 | 79 | for k in sorted(keys_with_differences): 80 | diff_table.add_row(str(k), str(left_config[k]), str(right_config[k])) 81 | console.print(diff_table) 82 | elif not (left_only_keys or right_only_keys): 83 | console.print(Panel(f"No differences found in {title}", style="green")) 84 | 85 | 86 | def display_data_differences(left_data_paths, right_data_paths): 87 | left_table = Table(title="Data Paths for Left 
Config", title_style="bold cyan", show_header=True)
88 |     left_table.add_column("Path")
89 |     left_table.add_column("Count", justify="right")
90 | 
91 |     for path, count in left_data_paths.items():
92 |         left_table.add_row(str(path), str(count))
93 | 
94 |     right_table = Table(title="Data Paths for Right Config", title_style="bold magenta", show_header=True)
95 |     right_table.add_column("Path")
96 |     right_table.add_column("Count", justify="right")
97 | 
98 |     for path, count in right_data_paths.items():
99 |         right_table.add_row(str(path), str(count))
100 | 
101 |     console.print(left_table)
102 |     console.print(right_table)
103 | 
104 | 
105 | @click.command()
106 | @click.argument(
107 |     "left_run_path",
108 |     type=str,
109 | )
110 | @click.argument(
111 |     "right_run_path",
112 |     type=str,
113 | )
114 | @click.option(
115 |     "--diff-datasets",
116 |     is_flag=True,
117 |     default=False,
118 |     help="Whether to compare dataset differences between runs",
119 | )
120 | def main(
121 |     left_run_path: str,
122 |     right_run_path: str,
123 |     diff_datasets: bool,
124 | ):
125 |     api = wandb.Api()
126 |     left_run = api.run(parse_run_path(left_run_path))
127 |     right_run = api.run(parse_run_path(right_run_path))
128 | 
129 |     left_config_raw = left_run._attrs["rawconfig"]
130 |     right_config_raw = right_run._attrs["rawconfig"]
131 | 
132 |     # flattening the dict will make diffs easier
133 |     left_config = flatten_dict(left_config_raw)
134 |     right_config = flatten_dict(right_config_raw)
135 | 
136 |     # Handle dataset paths conditionally based on diff_datasets flag
137 |     left_data_paths = Counter()
138 |     right_data_paths = Counter()
139 |     if diff_datasets and "dataset.paths" in left_config:
140 |         left_data_paths = Counter([os.path.dirname(path) for path in left_config["dataset.paths"]])
141 |         del left_config["dataset.paths"]
142 |     elif "dataset.paths" in left_config:
143 |         del left_config["dataset.paths"]
144 | 
145 |     if diff_datasets and "dataset.paths" in right_config:
146 |         right_data_paths = Counter([os.path.dirname(path) for path in right_config["dataset.paths"]])
147 |         del right_config["dataset.paths"]
148 |     elif "dataset.paths" in right_config:
149 |         del right_config["dataset.paths"]
150 | 
151 |     # Handle source_mixture_config in the same way
152 |     if "dataset.source_mixture_config.source_configs" in left_config:
153 |         source_configs = left_config["dataset.source_mixture_config.source_configs"]
154 |         if diff_datasets:
155 |             for config in source_configs:
156 |                 if isinstance(config, dict) and "paths" in config:
157 |                     paths = config["paths"]
158 |                     for path in paths:
159 |                         left_data_paths[os.path.dirname(path)] += 1
160 | 
161 |         for config in source_configs:
162 |             if isinstance(config, dict) and "paths" in config:
163 |                 del config["paths"]
164 | 
165 |         left_config["dataset.source_mixture_config.source_configs"] = source_configs
166 | 
167 |     if "dataset.source_mixture_config.source_configs" in right_config:
168 |         source_configs = right_config["dataset.source_mixture_config.source_configs"]
169 |         if diff_datasets:
170 |             for config in source_configs:
171 |                 if isinstance(config, dict) and "paths" in config:
172 |                     paths = config["paths"]
173 |                     for path in paths:
174 |                         right_data_paths[os.path.dirname(path)] += 1
175 | 
176 |         for config in source_configs:
177 |             if isinstance(config, dict) and "paths" in config:
178 |                 del config["paths"]
179 | 
180 |         right_config["dataset.source_mixture_config.source_configs"] = source_configs  # keep in sync with the left-hand handling above
181 | 
182 |     # Display header with run information
183 |     console.print()
184 |     console.rule("[bold]Config differences between runs[/bold]")
185 |     console.print(f"Left: [cyan]{left_run_path}[/cyan]")
186 |     console.print(f"Right: [magenta]{right_run_path}[/magenta]")
187 |     console.print()
188 | 
189 |     # Display parameter differences
190 |     console.rule("[bold]Parameter Differences[/bold]")
191 |     display_differences_table(left_config, right_config, "parameters")
192 |     console.print()
193 | 
194 |     # Display data differences only if diff_datasets is enabled
195 |     if diff_datasets:
196 |         console.rule("[bold]Data Differences[/bold]")
197 |         display_data_differences(left_data_paths, right_data_paths)
198 |         console.print()
199 | 
200 | 
201 | if __name__ == "__main__":
202 |     prepare_cli_environment()
203 |     main()
204 | 
-------------------------------------------------------------------------------- /scripts/launch_ruler.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <model_path>"
5 |     exit 1
6 | fi
7 | 
8 | 
9 | # Check if olmo-cookbook-eval command is available
10 | if command -v olmo-cookbook-eval &> /dev/null; then
11 |     eval_command="olmo-cookbook-eval"
12 | elif command -v uv &> /dev/null && uv run olmo-cookbook-eval --help &> /dev/null; then
13 |     eval_command="uv run olmo-cookbook-eval"
14 | else
15 |     echo "Error: olmo-cookbook-eval command not found. Please install it or ensure uv is available."
16 |     exit 1
17 | fi
18 | 
19 | 
20 | model_path="$1"
21 | base_command="${eval_command} evaluate \"${model_path}\" --priority urgent --cluster ai2/jupiter-cirrascale-2 --num-gpus 1 --model-backend vllm --dashboard peteish-LC-ruler --budget ai2/oe-base --model-args \"trust_remote_code=true, chat_model=null, max_length=65536\" --task-args \"use_chat_format=false\" --vllm-use-v1-spec --workspace ai2/long-contexts --beaker-image amandab/lc-only-adjust-rope-global-layers"
22 | 
23 | 
24 | echo "Launching task: ruler:4k"
25 | eval "${base_command} --tasks ruler:4k -j 2"
26 | 
27 | echo "Launching task: ruler:8k"
28 | eval "${base_command} --tasks ruler:8k -j 2"
29 | 
30 | echo "Launching task: ruler:16k"
31 | eval "${base_command} --tasks ruler:16k -j 2"
32 | 
33 | echo "Launching task: ruler:32k"
34 | eval "${base_command} --tasks ruler:32k -j 2"
35 | 
36 | echo "Launching task: ruler:64k"
37 | eval "${base_command} --tasks ruler:64k -j 2"
38 | 
39 | wait
40 | 
-------------------------------------------------------------------------------- /scripts/summarize_data_mix.py: --------------------------------------------------------------------------------
1 | """
2 | 
3 | Examples:
4 |     Peteish7  python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39
5 |     OLMoE     python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmoe/runs/rzsn9tlc
6 |     Amberish  python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2
7 | 
8 | """
9 | 
10 | import logging
11 | import os
12 | import re
13 | from collections import Counter
14 | from typing import Any, Dict, MutableMapping
15 | 
16 | import click
17 | 
18 | log = logging.getLogger(__name__)
19 | log.setLevel(logging.INFO)
20 | console_handler = logging.StreamHandler()
21 | console_handler.setLevel(logging.INFO)
22 | formatter = logging.Formatter("%(message)s")  # Simple formatter to just show the message
23 | console_handler.setFormatter(formatter)
24 | log.addHandler(console_handler)
25 | 
26 | 
27 | run_path_re = re.compile(r"^[^/]+/[^/]+/[^/]+$")
28 | run_path_url = re.compile(r"^https?://wandb.ai/([^/]+)/([^/]+)/runs/([^/]+)")
29 | 
30 | 
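# A quick illustration of what flatten_dict (defined below) produces; the input
# values here are hypothetical, not taken from a real run config:
#
#     flatten_dict({"optimizer": {"lr": 0.001}, "data": {"paths": ["a", "b"]}})
#     # -> {"optimizer.lr": 0.001, "data.paths": ["a", "b"]}
#     # With include_lists=True, lists are expanded as well:
#     # -> {"optimizer.lr": 0.001, "data.paths.0": "a", "data.paths.1": "b"}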
31 | def flatten_dict(dictionary, parent_key="", separator=".", include_lists=False):
32 |     """
33 |     Flatten a nested dictionary into a single-level dictionary.
34 | 
35 |     Args:
36 |         dictionary (dict): The nested dictionary to be flattened.
37 |         parent_key (str, optional): The parent key to be prepended to the keys of the flattened dictionary. Defaults to "".
38 |         separator (str, optional): The separator to be used between the parent key and the keys of the flattened dictionary. Defaults to ".".
39 |         include_lists (bool, optional): Whether to convert lists to dictionaries with integer keys. Defaults to False.
40 | 
41 |     Returns:
42 |         dict: The flattened dictionary.
43 | 
44 |     """
45 |     d: Dict[str, Any] = {}
46 |     for key, value in dictionary.items():
47 |         new_key = parent_key + separator + key if parent_key else key
48 |         # convert lists to dicts with integer keys
49 |         if isinstance(value, list) and include_lists:
50 |             value = {f"{i}": v for i, v in enumerate(value)}
51 |         if isinstance(value, MutableMapping):
52 |             d.update(**flatten_dict(value, new_key, separator=separator, include_lists=include_lists))
53 |         else:
54 |             d[new_key] = value
55 |     return d
56 | 
57 | 
58 | def parse_run_path(run_path: str) -> str:
59 |     """For convenience, we allow run paths as well as URLs."""
60 |     run_path = run_path.strip("/")
61 |     if run_path_re.match(run_path):
62 |         return run_path
63 | 
64 |     m = run_path_url.match(run_path)
65 |     if m is not None:
66 |         entity, project, run_id = m.groups()
67 |         return f"{entity}/{project}/{run_id}"
68 | 
69 |     raise ValueError(f"Could not parse '{run_path}'")
70 | 
71 | 
72 | def format_counter_paths(data_paths, log):
73 |     """
74 |     Format a Counter containing file paths into a readable log output.
75 |     Shows full paths with aligned counts and percentages.
76 | 
77 |     Args:
78 |         data_paths (Counter): Counter object containing path counts
79 |         log (logging.Logger): Logger instance to output the formatted results
80 |     """
81 |     if not data_paths:
82 |         log.info("Counter is empty")
83 |         return
84 | 
85 |     # Find the largest count for padding
86 |     max_count_width = len(str(max(data_paths.values())))
87 | 
88 |     # Sort by count in descending order
89 |     sorted_items = data_paths.most_common()
90 |     total_count = sum(data_paths.values())
91 | 
92 |     log.info(f"Total entries: {total_count}")
93 |     log.info("-" * 120)  # Made longer to accommodate full paths
94 | 
95 |     for path, count in sorted_items:
96 |         # Format the percentage
97 |         percentage = (count / total_count) * 100
98 | 
99 |         # Create the formatted string with aligned counts and percentages
100 |         formatted_line = f"{count:>{max_count_width},d} items ({percentage:5.1f}%) | {path}"
101 | 
102 |         log.info(formatted_line)
103 | 
104 |     log.info("-" * 120)  # Made longer to accommodate full paths
105 | 
106 | 
107 | @click.command()
108 | @click.argument(
109 |     "run_path",
110 |     type=str,
111 | )
112 | def main(run_path: str):
113 |     import wandb
114 | 
115 |     api = wandb.Api()
116 |     run = api.run(parse_run_path(run_path))
117 | 
118 |     config_raw = run._attrs["rawconfig"]
119 | 
120 |     # flattening the dict will make diffs easier
121 |     config = flatten_dict(config_raw)
122 | 
123 |     # first, data.paths can be grouped and counted.
124 | data_paths = Counter([os.path.dirname(path) for path in config["data.paths"]]) 125 | 126 | format_counter_paths(data_paths, log) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /src/cookbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/__init__.py -------------------------------------------------------------------------------- /src/cookbook/aliases.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from os import PathLike 3 | from pathlib import Path 4 | from typing import Any, List, Optional, Union 5 | 6 | from olmo_core.data.types import NumpyDatasetDType 7 | from olmo_core.launch.beaker import BeakerLaunchConfig 8 | from olmo_core.train.common import Duration 9 | from pydantic import BaseModel, field_validator 10 | 11 | from cookbook.model.config import ModelConfigIdentifier 12 | from cookbook.model.evaluators import DownstreamEvaluator 13 | 14 | DownstreamEvaluatorType = Union[str, DownstreamEvaluator] 15 | PathType = Union[Path, PathLike[Any], str] 16 | 17 | try: 18 | from beaker import Priority # pyright: ignore 19 | except ImportError: 20 | Priority = str 21 | 22 | 23 | class SourceConfig(BaseModel): 24 | name: str 25 | paths: list[str] 26 | target_ratio: Optional[float] = None 27 | repetition_factor: float = 1.0 28 | max_source_ratio: float = 1.0 29 | 30 | 31 | class SourceInstance(BaseModel): 32 | name: str 33 | paths: list[str] 34 | ratio: float 35 | repetition_factor: float = 1.0 36 | 37 | 38 | class DatasetConfig(BaseModel): 39 | sources: list[SourceConfig] 40 | dtype: NumpyDatasetDType = NumpyDatasetDType.uint32 41 | processes: int = 16 42 | seed: int = 42 43 | 44 | 45 | class MetricBackend(Enum): 46 | wandb = "wandb" 47 | comet = "comet" 48 | 49 | 50 | class MetricsConfig(BaseModel): 51 | project: str = "olmo-cookbook" 52 | workspace: str = "ai2" 53 | entity: str = "ai2-llm" 54 | backends: list[MetricBackend] = [MetricBackend.wandb] 55 | 56 | 57 | class SchedulerType(Enum): 58 | COSINE = "cosine" 59 | COS_LINEAR = "cos_linear" 60 | LINEAR = "linear" 61 | WSD = "wsd" 62 | 63 | @classmethod 64 | def values(cls): 65 | return [e.value for e in cls] 66 | 67 | @classmethod 68 | def keys(cls): 69 | return [e.name for e in cls] 70 | 71 | 72 | class AnnealConfig(BaseModel): 73 | enabled: bool = True 74 | initial_lr: Optional[float] = None 75 | 76 | 77 | class ExperimentConfig(BaseModel, extra="forbid"): 78 | name: str 79 | description: str 80 | budget: str 81 | workspace: str 82 | nodes: int 83 | gpus: int 84 | max_tokens: int 85 | sequence_length: int 86 | seed: int 87 | cluster: str 88 | tokenizer: str 89 | priority: Priority # pyright: ignore 90 | dataset: DatasetConfig 91 | model: ModelConfigIdentifier 92 | load_path: Optional[str] = None 93 | load_state: bool = True 94 | annealing: Optional[AnnealConfig] = None 95 | nccl_debug: bool = False 96 | activation_checkpointing: bool = False 97 | model_overrides: Optional[List[str]] = None 98 | scheduler_type: SchedulerType = SchedulerType.COS_LINEAR 99 | hard_stop: Optional[Duration] = None 100 | rank_microbatch_size: Optional[int] = None 101 | learning_rate: Optional[float] = None 102 | global_batch_size: Optional[int] = None 103 | lm_evaluator: bool = False 104 | downstream_evaluators: list[DownstreamEvaluatorType] 
= [] # type: ignore 105 | max_target_sequence_length: int = 8192 106 | metrics_config: Optional[MetricsConfig] = MetricsConfig() 107 | preemptible: bool = True 108 | shared_filesystem: bool = False 109 | weka: bool = False 110 | eval_interval: int = 200 111 | save_interval: int = 1000 112 | warmup_steps: Optional[int] = None 113 | path: Path 114 | 115 | @field_validator("model", mode="before") 116 | @classmethod 117 | def validate_model(cls, value): 118 | """Convert string to ModelConfigIdentifier if needed.""" 119 | if isinstance(value, str): 120 | return ModelConfigIdentifier(value) 121 | return value 122 | 123 | @field_validator("annealing") 124 | @classmethod 125 | def validate_annealing(cls, value, info): 126 | """Validate that if annealing is True, then load_path must not be None.""" 127 | if value is not None and info.data.get("load_path") is None: 128 | raise ValueError("If annealing is enabled, load_path must be specified.") 129 | return value 130 | 131 | 132 | class ExperimentInstance(BaseModel): 133 | name: str 134 | sources: list[SourceInstance] 135 | 136 | 137 | class ExperimentGroup(BaseModel): 138 | config: ExperimentConfig 139 | group_id: str 140 | instances: list[ExperimentInstance] 141 | 142 | 143 | class LaunchGroup(BaseModel): 144 | instances: list[BeakerLaunchConfig] 145 | 146 | 147 | def validate_sources(sources: list[SourceConfig]): 148 | """Validate a list of source configurations.""" 149 | target_ratio_present = any(source.target_ratio is not None for source in sources) 150 | 151 | for source in sources: 152 | if target_ratio_present and source.target_ratio is None: 153 | raise ValueError("If any source has target_ratio set, all sources must have target_ratio set.") 154 | -------------------------------------------------------------------------------- /src/cookbook/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/data/__init__.py -------------------------------------------------------------------------------- /src/cookbook/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import List, Union 4 | from urllib.parse import urlparse 5 | 6 | import gcsfs 7 | import s3fs 8 | from olmo_core.data.source_mixture import ( 9 | SourceMixtureConfig, 10 | SourceMixtureDatasetConfig, 11 | ) 12 | from olmo_core.data.types import NumpyDatasetDType 13 | 14 | from cookbook.aliases import SourceInstance 15 | from cookbook.utils.data import expand_globs 16 | 17 | 18 | @dataclass 19 | class MixtureBuilder: 20 | sources: List[SourceInstance] 21 | max_tokens: int 22 | sequence_length: int 23 | seed: int 24 | dtype: NumpyDatasetDType 25 | processes: int = 1 26 | cached_fs: dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = field( 27 | default_factory=lambda: dict( 28 | s3=s3fs.S3FileSystem(), 29 | weka=s3fs.S3FileSystem( 30 | client_kwargs={"endpoint_url": os.environ["WEKA_ENDPOINT_URL"]}, profile="WEKA" 31 | ), 32 | gs=gcsfs.GCSFileSystem(), 33 | ) 34 | ) 35 | 36 | def build(self) -> SourceMixtureDatasetConfig: 37 | source_configs: List[SourceMixtureConfig] = [] 38 | for source in self.sources: 39 | globs = [path for path in source.paths if "*" in path] 40 | paths = [path for path in source.paths if path not in globs] 41 | 42 | # Check if all paths have the same URL scheme 43 | schemes = 
{urlparse(path).scheme for path in paths + globs} 44 | if len(schemes) > 1: 45 | raise ValueError(f"All paths for source {source.name} must have the same scheme. Found: {schemes}") 46 | elif len(schemes) == 0: 47 | raise ValueError(f"No paths found for source {source.name}") 48 | 49 | scheme = schemes.pop() 50 | 51 | expanded = paths + expand_globs(self.cached_fs.get(scheme, self.cached_fs["s3"]), globs) 52 | 53 | if len(expanded) == 0: 54 | raise ValueError(f"No paths found for source {source.name}") 55 | 56 | source_configs.append( 57 | SourceMixtureConfig( 58 | source_name=source.name, 59 | paths=expanded, 60 | target_ratio=source.ratio, 61 | max_repetition_ratio=source.repetition_factor, 62 | ) 63 | ) 64 | 65 | return SourceMixtureDatasetConfig( 66 | source_configs=source_configs, 67 | max_tokens=self.max_tokens, 68 | sequence_length=self.sequence_length, 69 | seed=self.seed, 70 | dtype=self.dtype, 71 | processes=self.processes, 72 | ) 73 | -------------------------------------------------------------------------------- /src/cookbook/data/mixes/stackexchange.txt: -------------------------------------------------------------------------------- 1 | #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) 2 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy 3 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy 4 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy 5 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy 6 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy 7 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy 8 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy 9 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy 10 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy 11 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy 12 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy 13 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy 14 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy 15 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy 16 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy 17 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy 18 | -------------------------------------------------------------------------------- /src/cookbook/eval/cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | import shutil 5 | from dataclasses import dataclass 6 | from typing import Generic, TypeVar 7 | 8 | import smart_open 9 | from platformdirs import user_cache_dir 10 | 11 | T = TypeVar("T") 12 | V = TypeVar("V") 13 | 14 | 15 | @dataclass(frozen=True) 16 | class DatalakeCacheResult(Generic[T]): 17 | success: bool 18 | value: T | None 19 | 20 | 21 | # Singleton instance storage 22 | _DATALAKE_CACHE_INSTANCE = None 23 | 24 | 25 | @dataclass 26 | class 
DatalakeCache(Generic[T]): 27 | cache_dir: str 28 | invalidate: bool 29 | do_not_cache: bool 30 | 31 | def __init__(self, invalidate: bool = False, do_not_cache: bool = False): 32 | self.invalidate = ( 33 | invalidate 34 | if invalidate is not False 35 | else (os.environ.get("DATALAKE_CACHE_INVALIDATE", "false").lower() == "true") 36 | ) 37 | 38 | self.do_not_cache = ( 39 | do_not_cache 40 | if do_not_cache is not False 41 | else (os.environ.get("DATALAKE_DO_NOT_CACHE", "false").lower() == "true") 42 | ) 43 | 44 | # Set cache_dir 45 | self.cache_dir = user_cache_dir("datalake", "olmo-cookbook") 46 | 47 | if self.invalidate and os.path.exists(self.cache_dir): 48 | shutil.rmtree(self.cache_dir, ignore_errors=True) 49 | 50 | # Check if path exists but is a file instead of a directory 51 | if os.path.exists(self.cache_dir) and not os.path.isdir(self.cache_dir): 52 | try: 53 | os.remove(self.cache_dir) 54 | except FileNotFoundError: 55 | pass 56 | 57 | if not os.path.exists(self.cache_dir): 58 | os.makedirs(self.cache_dir, exist_ok=True) 59 | 60 | def _make_cache_path(self, **kwargs) -> str: 61 | cache_key = hashlib.sha256(json.dumps(kwargs).encode()).hexdigest() 62 | return os.path.join(self.cache_dir, f"{cache_key}.json.gz") 63 | 64 | def get(self, **kwargs) -> DatalakeCacheResult[T]: 65 | if self.do_not_cache: 66 | return DatalakeCacheResult(success=False, value=None) 67 | 68 | if os.path.exists(cache_file := self._make_cache_path(**kwargs)) and not self.invalidate: 69 | with smart_open.open(cache_file, "rt", encoding="utf-8") as f: 70 | return DatalakeCacheResult(success=True, value=json.load(f)) 71 | 72 | return DatalakeCacheResult(success=False, value=None) 73 | 74 | def set(self, value: T, **kwargs) -> DatalakeCacheResult[T]: 75 | if self.do_not_cache: 76 | return DatalakeCacheResult(success=False, value=None) 77 | 78 | if not os.path.exists(cache_file := self._make_cache_path(**kwargs)) or self.invalidate: 79 | with smart_open.open(cache_file, "wt", encoding="utf-8") as f: 80 | json.dump(value, f) 81 | 82 | return DatalakeCacheResult(success=True, value=value) 83 | 84 | def delete(self, **kwargs) -> None: 85 | if os.path.exists(cache_file := self._make_cache_path(**kwargs)): 86 | os.remove(cache_file) 87 | 88 | 89 | def get_datalake_cache(invalidate: bool = False, do_not_cache: bool = False) -> DatalakeCache: 90 | """Get or create a singleton instance of DatalakeCache.""" 91 | global _DATALAKE_CACHE_INSTANCE 92 | 93 | if _DATALAKE_CACHE_INSTANCE is None: 94 | kwargs = {} 95 | if invalidate is not None: 96 | kwargs["invalidate"] = invalidate 97 | if do_not_cache is not None: 98 | kwargs["do_not_cache"] = do_not_cache 99 | _DATALAKE_CACHE_INSTANCE = DatalakeCache(**kwargs) 100 | 101 | return _DATALAKE_CACHE_INSTANCE 102 | -------------------------------------------------------------------------------- /src/cookbook/eval/conversion_from_hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import shutil 4 | import subprocess 5 | from typing import Optional 6 | 7 | from cookbook.cli.utils import ( 8 | PythonEnv, 9 | add_secret_to_beaker_workspace, 10 | discover_weka_mount, 11 | install_beaker_py, 12 | install_olmo_core, 13 | install_transformers, 14 | make_destination_dir, 15 | remove_conflicting_packages, 16 | ) 17 | from cookbook.constants import ( 18 | OLMO_CORE_CONVERT_FROM_HF_SCRIPT, 19 | OLMO_CORE_V2_COMMIT_HASH, 20 | TRANSFORMERS_COMMIT_HASH, 21 | ) 22 | from cookbook.utils.clusters import get_matching_clusters 23 | 
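# Hypothetical local invocation (the path below is illustrative; every other
# argument falls back to the defaults declared in the signature):
#
#     convert_hf_to_olmo_core_v2(input_dir="/weka/checkpoints/my-hf-model")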
24 | 25 | def convert_hf_to_olmo_core_v2( 26 | input_dir: str, 27 | output_dir: Optional[str] = None, 28 | output_suffix: str = "olmo_core", 29 | olmo_core_v2_commit_hash: str = OLMO_CORE_V2_COMMIT_HASH, 30 | olmo_core_v2_experiment_json_path: Optional[str] = None, 31 | olmo_core_v2_model_arch: Optional[str] = None, 32 | olmo_core_v2_tokenizer: Optional[str] = None, 33 | transformers_git_url: Optional[str] = None, 34 | transformers_commit_hash: str = TRANSFORMERS_COMMIT_HASH, 35 | transformers_model_id: Optional[str] = None, 36 | transformers_revision: str = "main", 37 | skip_validation: bool = False, 38 | debug_validation: bool = False, 39 | device: Optional[str] = None, 40 | env: Optional[PythonEnv] = None, 41 | ): 42 | env = env or PythonEnv.null() 43 | 44 | directories_to_clean_up = [] 45 | 46 | output_dir = make_destination_dir(input_dir, output_suffix, output_dir) 47 | 48 | try: 49 | print("Starting conversion of HF model...") 50 | 51 | olmo_code_dir = install_olmo_core(env=env, commit_hash=olmo_core_v2_commit_hash) 52 | directories_to_clean_up.append(olmo_code_dir) 53 | 54 | huggingface_code_dir = install_transformers(transformers_commit_hash, env, git_url=transformers_git_url) 55 | directories_to_clean_up.append(huggingface_code_dir) 56 | 57 | print("Converting Huggingface weights to OLMo core V2 format...") 58 | os.makedirs(output_dir, exist_ok=True) 59 | cmd = [ 60 | env.python, 61 | OLMO_CORE_CONVERT_FROM_HF_SCRIPT, 62 | f"--checkpoint-input-path {input_dir}", 63 | f"--output-dir {output_dir}", 64 | f"--revision {transformers_revision}", 65 | (f"--config-path {olmo_core_v2_experiment_json_path}" if olmo_core_v2_experiment_json_path else ""), 66 | (f"--model-arch {olmo_core_v2_model_arch}" if olmo_core_v2_model_arch else ""), 67 | (f"--tokenizer {olmo_core_v2_tokenizer}" if olmo_core_v2_tokenizer else ""), 68 | (f"--model-id {transformers_model_id}" if transformers_model_id else ""), 69 | (f"--device {device}" if device else ""), 70 | ("--skip-validation" if skip_validation else ""), 71 | ("--debug" if debug_validation else ""), 72 | ] 73 | print(f"Running command: {' '.join(cmd)} from commit hash: {olmo_core_v2_commit_hash}") 74 | 75 | try: 76 | subprocess.run( 77 | shlex.split(" ".join(cmd)), 78 | check=True, 79 | cwd=olmo_code_dir, 80 | env=env.path(), 81 | capture_output=True, 82 | text=True, 83 | ) 84 | except subprocess.CalledProcessError as e: 85 | raise RuntimeError(f"Conversion failed with output: \n{e.output}\nStderr: \n{e.stderr}") from e 86 | 87 | print(f"Completed conversion of HF model. 
OLMo core v2 model at {output_dir}.")
88 | 
89 |     finally:
90 |         for directory in directories_to_clean_up:
91 |             print(f"Cleaning up {directory}...")
92 |             shutil.rmtree(directory, ignore_errors=True)
93 | 
94 | 
95 | def run_checkpoint_conversion_from_hf(
96 |     beaker_allow_dirty: bool,
97 |     beaker_budget: str,
98 |     beaker_cluster: str,
99 |     beaker_dry_run: bool,
100 |     beaker_gpus: int,
101 |     beaker_priority: str,
102 |     beaker_workspace: str,
103 |     beaker_preemptible: bool,
104 |     huggingface_token: Optional[str],
105 |     input_dir: str,
106 |     output_dir: Optional[str],
107 |     output_suffix: str,
108 |     olmo_core_v2_commit_hash: str,
109 |     olmo_core_v2_experiment_json_path: Optional[str],
110 |     olmo_core_v2_model_arch: Optional[str],
111 |     olmo_core_v2_tokenizer: Optional[str],
112 |     huggingface_transformers_git_url: Optional[str],
113 |     huggingface_transformers_commit_hash: str,
114 |     huggingface_transformers_model_id: Optional[str],
115 |     huggingface_transformers_revision: str,
116 |     use_beaker: bool,
117 |     use_system_python: bool,
118 |     python_venv_name: str,
119 |     python_venv_force: bool,
120 |     skip_validation: bool,
121 |     debug_validation: bool,
122 |     torch_device: Optional[str],
123 | ):
124 |     env = (
125 |         PythonEnv.create(name=python_venv_name, force=python_venv_force)
126 |         if not use_system_python
127 |         else PythonEnv.null()
128 |     )
129 | 
130 |     if use_beaker:
131 |         print("Installing beaker and gantry clients...")
132 |         install_beaker_py(env=env)
133 | 
134 |         assert input_dir.startswith("/"), "Input directory must be fully specified"
135 |         if output_dir:
136 |             assert output_dir.startswith("/"), "Output directory must be fully specified"
137 |         if olmo_core_v2_experiment_json_path:
138 |             assert olmo_core_v2_experiment_json_path.startswith("/"), "Experiment JSON path must be fully specified"
139 | 
140 |         weka_mounts = [
141 |             mount
142 |             for mount in (
143 |                 discover_weka_mount(input_dir),
144 |                 discover_weka_mount(output_dir),
145 |             )
146 |             if mount is not None
147 |         ]
148 | 
149 |         gantry_flags = []
150 | 
151 |         for weka_path in set(weka_mounts):
152 |             gantry_flags.append(f"--weka {weka_path}:/{weka_path}")
153 | 
154 |         if huggingface_token is not None:
155 |             secret_name = add_secret_to_beaker_workspace(
156 |                 secret_name="HF_TOKEN",
157 |                 secret_value=huggingface_token,
158 |                 workspace=beaker_workspace,
159 |                 env=env,  # type: ignore
160 |             )
161 |             if secret_name:
162 |                 gantry_flags.append(f"--env-secret HF_TOKEN={secret_name}")
163 | 
164 |         for cluster in set(get_matching_clusters(beaker_cluster)):
165 |             gantry_flags.append(f"--cluster {cluster}")
166 | 
167 |         remote_command = [
168 |             "pip install uv && uv pip install . --system &&",
169 |             "olmo-cookbook-eval convert-from-hf",
170 |             f"{input_dir}",
171 |             (f"--output-dir {output_dir}" if output_dir else ""),
172 |             f"--output-suffix {output_suffix}",
173 |             f"--olmo-core-v2-commit-hash {olmo_core_v2_commit_hash}",
174 |             (
175 |                 f"--olmo-core-v2-experiment-json-path {olmo_core_v2_experiment_json_path}"
176 |                 if olmo_core_v2_experiment_json_path
177 |                 else ""
178 |             ),
179 |             (f"--olmo-core-v2-model-arch {olmo_core_v2_model_arch}" if olmo_core_v2_model_arch else ""),
180 |             (f"--olmo-core-v2-tokenizer {olmo_core_v2_tokenizer}" if olmo_core_v2_tokenizer else ""),
181 |             (f"--huggingface-transformers-git-url {huggingface_transformers_git_url}" if huggingface_transformers_git_url else ""),
182 |             f"--huggingface-transformers-commit-hash {huggingface_transformers_commit_hash}",
183 |             (
184 |                 f"--huggingface-transformers-model-id {huggingface_transformers_model_id}"
185 |                 if huggingface_transformers_model_id
186 |                 else ""
187 |             ),
188 |             f"--huggingface-transformers-revision {huggingface_transformers_revision}",
189 |             "--use-system-python",
190 |             ("--skip-validation" if skip_validation else ""),
191 |             ("--debug-validation" if debug_validation else ""),
192 |             (f"--torch-device {torch_device}" if torch_device else ""),
193 |         ]
194 |         remote_command_str = " ".join(remote_command)
195 | 
196 |         gantry_command = [
197 |             "gantry run",
198 |             f"--description 'Converting HF checkpoint at {input_dir}'",
199 |             ("--allow-dirty" if beaker_allow_dirty else ""),
200 |             "--no-python",
201 |             f"--workspace {beaker_workspace}",
202 |             f"--priority {beaker_priority}",
203 |             f"--gpus {beaker_gpus}",
204 |             ("--preemptible" if beaker_preemptible else ""),
205 |             f"--budget {beaker_budget}",
206 |             "--yes",
207 |             ("--dry-run" if beaker_dry_run else ""),
208 |             " ".join(gantry_flags),
209 |             f"-- /bin/bash -c '{remote_command_str}'",
210 |         ]
211 |         gantry_command_str = " ".join(gantry_command)
212 | 
213 |         print(f"Submitting to beaker with command: {gantry_command_str}")
214 |         return subprocess.run(shlex.split(gantry_command_str), check=True, env=env.path())
215 | 
216 |     remove_conflicting_packages(env=env)
217 | 
218 |     return convert_hf_to_olmo_core_v2(
219 |         input_dir=input_dir,
220 |         output_dir=output_dir,
221 |         output_suffix=output_suffix,
222 |         olmo_core_v2_commit_hash=olmo_core_v2_commit_hash,
223 |         olmo_core_v2_experiment_json_path=olmo_core_v2_experiment_json_path,
224 |         olmo_core_v2_model_arch=olmo_core_v2_model_arch,
225 |         olmo_core_v2_tokenizer=olmo_core_v2_tokenizer,
226 |         transformers_git_url=huggingface_transformers_git_url,
227 |         transformers_commit_hash=huggingface_transformers_commit_hash,
228 |         transformers_model_id=huggingface_transformers_model_id,
229 |         transformers_revision=huggingface_transformers_revision,
230 |         skip_validation=skip_validation,
231 |         debug_validation=debug_validation,
232 |         device=torch_device,
233 |         env=env,
234 |     )
235 | 
-------------------------------------------------------------------------------- /src/cookbook/model/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/model/__init__.py
-------------------------------------------------------------------------------- /src/cookbook/model/config.py: --------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | from typing import Any, Optional
4 | 
5 | import olmo_core.train.train_module as train_module
6 | from olmo_core.config import Config
7
| from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig 8 | from olmo_core.nn.transformer import ( 9 | TransformerBlockType, 10 | TransformerConfig, 11 | ) 12 | from olmo_core.optim import OptimConfig 13 | from olmo_core.train import TrainerConfig 14 | 15 | 16 | class Tokenizers(Enum): 17 | dolma2 = TokenizerConfig.dolma2() 18 | gpt_neox = TokenizerConfig.gpt_neox_olmo_dolma_v1_5() 19 | superbpe_experimental = TokenizerConfig( 20 | vocab_size=180021, 21 | identifier="allenai/superbpe-experimental_v0.1.0", 22 | eos_token_id=180000, 23 | pad_token_id=180001, 24 | ) 25 | dolma2_180k = TokenizerConfig( 26 | vocab_size=180021, 27 | identifier="allenai/dolma2-180k-experimental-0.0.1", 28 | eos_token_id=180000, 29 | pad_token_id=180001, 30 | ) 31 | 32 | 33 | @dataclass 34 | class ModelTrainConfig(Config): 35 | model: TransformerConfig 36 | optim: OptimConfig 37 | dataset: NumpyDatasetConfig 38 | data_loader: NumpyDataLoaderConfig 39 | trainer: TrainerConfig 40 | train_module: train_module.TransformerTrainModuleConfig 41 | init_seed: int = 12536 42 | 43 | 44 | @dataclass 45 | class DefaultOptimizerProperties: 46 | betas: tuple = (0.9, 0.95) 47 | eps: float = 1e-8 48 | weight_decay: float = 0.1 49 | 50 | 51 | @dataclass 52 | class DefaultTransformerProperties: 53 | block_type: TransformerBlockType = TransformerBlockType.reordered_norm 54 | decay_embeddings: bool = False 55 | layer_norm_eps: float = 1e-6 56 | qk_norm: bool = True 57 | rope_theta: int = 500_000 58 | 59 | 60 | class ModelConfigIdentifier: 61 | """ 62 | A dynamic registry for model identifiers that auto-initializes when used. 63 | """ 64 | 65 | _registry: dict[str, str] = {} 66 | _initialized = False 67 | 68 | def __init__(self, identifier): 69 | # Auto-initialize the first time this class is used 70 | if not ModelConfigIdentifier._initialized: 71 | ModelConfigIdentifier._initialize_identifiers() 72 | 73 | if identifier not in ModelConfigIdentifier._registry: 74 | raise ValueError( 75 | f"'{identifier}' is not a valid model identifier. 
" 76 | f"Available models: {', '.join(ModelConfigIdentifier._registry.keys())}" 77 | ) 78 | 79 | self.value = identifier 80 | self.name = identifier 81 | 82 | def __str__(self) -> str: 83 | return self.value 84 | 85 | def __repr__(self) -> str: 86 | return f"ModelConfigIdentifier({self.value!r})" 87 | 88 | def __eq__(self, other) -> bool: 89 | if isinstance(other, str): 90 | return self.value == other 91 | elif isinstance(other, ModelConfigIdentifier): 92 | return self.value == other.value 93 | return False 94 | 95 | @classmethod 96 | def _get_model_methods(cls, target_class) -> list[str]: 97 | """Get all classmethods of a class that might represent model configurations.""" 98 | return [ 99 | attr 100 | for attr in dir(target_class) 101 | if callable(getattr(target_class, attr)) 102 | and not attr.startswith("_") 103 | and attr not in ["from_dict", "from_json", "from_model_identifier", "values", "keys"] 104 | ] 105 | 106 | @classmethod 107 | def _initialize_identifiers(cls) -> None: 108 | """Initialize the model identifier registry with methods from TransformerConfig and WrappedTransformerConfig.""" 109 | # Add default models 110 | cls._registry["default"] = "default" 111 | 112 | # Add methods from WrappedTransformerConfig 113 | for method_name in cls._get_model_methods(WrappedTransformerConfig): 114 | cls._registry[method_name] = method_name 115 | 116 | # Add methods from TransformerConfig 117 | for method_name in cls._get_model_methods(TransformerConfig): 118 | if method_name not in cls._registry: 119 | cls._registry[method_name] = method_name 120 | 121 | cls._initialized = True 122 | 123 | @classmethod 124 | def keys(cls) -> list[str]: 125 | """Return all valid model identifier keys.""" 126 | if not cls._initialized: 127 | cls._initialize_identifiers() 128 | return list(cls._registry.keys()) 129 | 130 | @classmethod 131 | def values(cls) -> list[str]: 132 | """Return all valid model identifier values.""" 133 | if not cls._initialized: 134 | cls._initialize_identifiers() 135 | return list(cls._registry.values()) 136 | 137 | @classmethod 138 | def __get_pydantic_core_schema__(cls, _source_type, _handler): 139 | from pydantic_core import core_schema 140 | 141 | def validate_identifier(value, info): 142 | # Ensure registry is initialized 143 | if not cls._initialized: 144 | cls._initialize_identifiers() 145 | 146 | # Handle existing instances 147 | if isinstance(value, cls): 148 | return value 149 | 150 | # Handle string values 151 | if not isinstance(value, str): 152 | raise ValueError(f"Expected string or {cls.__name__}, got {type(value)}") 153 | 154 | # Validate against registry 155 | if value not in cls._registry: 156 | valid_values = ", ".join(cls._registry.keys()) 157 | raise ValueError( 158 | f"'{value}' is not a valid model identifier. 
" f"Available models: {valid_values}" 159 | ) 160 | 161 | return cls(value) 162 | 163 | return core_schema.with_info_plain_validator_function( 164 | validate_identifier, 165 | serialization=core_schema.plain_serializer_function_ser_schema(lambda instance: instance.value), 166 | metadata={ 167 | "type": "enum-like", 168 | "values": list(cls.keys()), 169 | }, 170 | ) 171 | 172 | 173 | class WrappedTransformerConfig: 174 | @classmethod 175 | def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig: 176 | """ 177 | OLMo 30m 178 | """ 179 | return getattr(TransformerConfig, "llama_like")( 180 | d_model=256, 181 | n_heads=8, 182 | n_layers=4, 183 | vocab_size=tokenizer.padded_vocab_size(), 184 | rope_theta=DefaultTransformerProperties.rope_theta, 185 | layer_norm_eps=DefaultTransformerProperties.layer_norm_eps, 186 | qk_norm=DefaultTransformerProperties.qk_norm, 187 | block_name=DefaultTransformerProperties.block_type, 188 | ) 189 | 190 | @classmethod 191 | def from_model_identifier( 192 | cls, 193 | model_identifier: ModelConfigIdentifier, 194 | tokenizer: TokenizerConfig = Tokenizers.dolma2.value, 195 | ) -> TransformerConfig: 196 | """ 197 | Create a TransformerConfig from a ModelConfigIdentifier. 198 | 199 | This method supports all models defined in the ModelConfigIdentifier enum by 200 | mapping them to appropriate TransformerConfig class methods. 201 | 202 | Args: 203 | model_identifier: The model identifier to create a config for 204 | tokenizer: The tokenizer config to use 205 | model_overrides: Optional overrides for the model config 206 | 207 | Returns: 208 | A TransformerConfig instance for the specified model 209 | 210 | Raises: 211 | ValueError: If the model identifier isn't supported in either cookbook or olmo-core 212 | """ 213 | model_name = model_identifier.value 214 | 215 | # First, check if we have a custom config override for this model 216 | if hasattr(cls, model_name): 217 | return getattr(cls, model_name)(tokenizer) 218 | 219 | # Then, check if the TransformerConfig class has a method for this model 220 | if hasattr(TransformerConfig, model_name): 221 | return getattr(TransformerConfig, model_name)( 222 | vocab_size=tokenizer.padded_vocab_size(), 223 | ) 224 | 225 | raise ValueError( 226 | f"Model identifier '{model_identifier}' is not supported in either cookbook or olmo-core." 
227 | f" Available models: {', '.join(ModelConfigIdentifier.keys())}" 228 | ) 229 | 230 | 231 | DEFAULT_LR_MAP = { 232 | "olmo2_1B": 1.8e-3, 233 | "olmo2_1B_v2": 1.8e-3, 234 | } 235 | -------------------------------------------------------------------------------- /src/cookbook/model/evaluators.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict, List 3 | 4 | from olmo_eval import list_tasks 5 | 6 | OLMO2_DEV_1B_TASKS = [ 7 | # OLMES Core 9(-ish) RC 8 | "arc_challenge_test_rc_5shot", 9 | "arc_easy_test_rc_5shot", 10 | "hellaswag_rc_5shot", # 1K subset of HellaSwag 11 | "winogrande_val_rc_5shot", # Helpful after 750M-5xC scale 12 | "csqa_val_rc_5shot", 13 | "piqa_val_rc_5shot", 14 | "socialiqa_val_rc_5shot", 15 | # MMLU RC 16 | "mmlu_stem_val_rc_5shot", 17 | "mmlu_humanities_val_rc_5shot", 18 | "mmlu_social_sciences_val_rc_5shot", 19 | "mmlu_other_val_rc_5shot", 20 | "mmlu_stem_test_rc_5shot", 21 | "mmlu_humanities_test_rc_5shot", 22 | "mmlu_social_sciences_test_rc_5shot", 23 | "mmlu_other_test_rc_5shot", 24 | # Gen tasks BPB 25 | "gsm8k_gold_bpb_5shot", 26 | "minerva_math_algebra_gold_bpb_0shot", 27 | "minerva_math_counting_and_probability_gold_bpb_0shot", 28 | "minerva_math_geometry_gold_bpb_0shot", 29 | "minerva_math_intermediate_algebra_gold_bpb_0shot", 30 | "minerva_math_number_theory_gold_bpb_0shot", 31 | "minerva_math_prealgebra_gold_bpb_0shot", 32 | "minerva_math_precalculus_gold_bpb_0shot", 33 | "codex_humaneval_gold_bpb_0shot", 34 | "codex_mbpp_gold_bpb_0shot", 35 | # Sanity check for MCQA ability 36 | "copycolors_10way", 37 | # Basic Skills rc 5shot 38 | "basic_skills_arithmetic_rc_5shot", 39 | "basic_skills_coding_rc_5shot", 40 | "basic_skills_common_knowledge_rc_5shot", 41 | "basic_skills_logical_reasoning_rc_5shot", 42 | "basic_skills_pattern_rc_5shot", 43 | "basic_skills_string_operations_rc_5shot", 44 | ] 45 | 46 | TASK_GROUPS: Dict[str, List[str]] = { 47 | "all": list(list_tasks()), 48 | "olmo2_dev_1b": OLMO2_DEV_1B_TASKS 49 | } 50 | 51 | 52 | ALL_TASKS_MAP = {task.upper(): task for task in list_tasks()} 53 | 54 | DownstreamEvaluator = Enum( 55 | "DownstreamEvaluator", 56 | { 57 | item[0].upper(): item[1] if isinstance(item[1], list) else [item[1]] 58 | for item in {**TASK_GROUPS, **ALL_TASKS_MAP}.items() 59 | }, 60 | ) 61 | 62 | 63 | def get_tasks_for_groups(groups: List[str]) -> List[str]: 64 | """Return all tasks in a group""" 65 | tasks = [] 66 | for group in groups: 67 | if group in TASK_GROUPS: 68 | tasks.extend(TASK_GROUPS[group]) 69 | elif group.upper() in ALL_TASKS_MAP: 70 | tasks.append(ALL_TASKS_MAP[group.upper()]) 71 | else: 72 | raise ValueError(f"Group or task '{group}' not found") 73 | 74 | tasks = list(set(tasks)) 75 | tasks.sort() 76 | 77 | return tasks 78 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-hlr.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code-hlr" 2 | description: "Love2Code model, first stab at a config" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | learning_rate: 1.8e-3 12 | model: "olmo2_1B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | cluster: ai2/jupiter-cirrascale-2 16 | weka: true 17 | dataset: 18 | sources: 19 | - 
name: the-stack-v2-ai2v0 20 | target_ratio: 0.85 21 | paths: 22 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-001-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-004-00000.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-006-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-007-00000.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00000.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00001.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00002.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-009-00000.npy 30 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-010-00000.npy 31 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-011-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-012-00000.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-014-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-015-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-016-00000.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-017-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-018-00000.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00000.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00001.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-022-00003.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00001.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00002.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-025-00001.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-027-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00000.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00001.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00002.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00003.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-029-00001.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-030-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-031-00000.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-032-00001.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-034-00000.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-035-00000.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-037-00001.npy 56 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-038-00000.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-040-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00000.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00001.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00002.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-043-00000.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-045-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-046-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-047-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00000.npy 66 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00002.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00000.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00002.npy 69 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00003.npy 70 | - name: dclm-codeprose-v0 71 | target_ratio: 0.15 72 | paths: 73 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-00-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-01-00000.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-02-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-03-00000.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-04-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-05-00000.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-06-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-07-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-08-00000.npy 82 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-09-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-10-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-11-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-12-00000.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-13-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-14-00000.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-15-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-16-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-17-00000.npy 91 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-18-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-19-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-20-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-21-00000.npy 95 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-22-00000.npy 96 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-23-00000.npy 97 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-24-00000.npy 98 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-25-00000.npy 99 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-26-00000.npy 100 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-27-00000.npy 101 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-28-00000.npy 102 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-29-00000.npy 103 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-30-00000.npy 104 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-starcoder1-noprose.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code-starcoder-no-prose" 2 | description: "Love2Code model, but with starcoder1 data and no prose" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: high 14 | cluster: ai2/jupiter-cirrascale-2 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: starcoder1 19 | target_ratio: 1.0 20 | paths: 21 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-00-00000.npy 22 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-01-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-01-00001.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-02-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-02-00001.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-03-00000.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-03-00001.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-04-00000.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-05-00000.npy 30 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-05-00001.npy 31 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-06-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-06-00001.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-07-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-08-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-08-00001.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-09-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-09-00001.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-10-00000.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-11-00000.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-11-00001.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-12-00000.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-12-00001.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-13-00000.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-14-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-15-00000.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-15-00001.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-16-00000.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-17-00000.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-18-00000.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-19-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-19-00001.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-20-00000.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-20-00001.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-21-00000.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-22-00000.npy 56 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-22-00001.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-23-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-23-00001.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-24-00000.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-25-00000.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-25-00001.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-26-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-27-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-28-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-28-00001.npy 66 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-29-00000.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-29-00001.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-30-00000.npy 69 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-31-00000.npy 70 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-31-00001.npy 71 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-32-00000.npy 72 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-32-00001.npy 73 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-33-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-33-00001.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-34-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-34-00001.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-35-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-35-00001.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-36-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-37-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-37-00001.npy 82 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-38-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-39-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-40-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-40-00001.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-41-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-41-00001.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-42-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-43-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-44-00000.npy 91 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-45-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-46-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-47-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-48-00000.npy 95 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code" 2 | description: "Love2Code model, first stab at a config" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: urgent 14 | cluster: ai2/jupiter-cirrascale-2 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: the-stack-v2-ai2v0 19 | target_ratio: 0.85 20 | paths: 21 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-001-00000.npy 22 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-004-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-006-00000.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-007-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00000.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00001.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00002.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-009-00000.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-010-00000.npy 30 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-011-00000.npy 31 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-012-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-014-00000.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-015-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-016-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-017-00000.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-018-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00000.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00001.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-022-00003.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00001.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00002.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-025-00001.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-027-00000.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00001.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00002.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00003.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-029-00001.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-030-00000.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-031-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-032-00001.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-034-00000.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-035-00000.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-037-00001.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-038-00000.npy 56 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-040-00000.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00001.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00002.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-043-00000.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-045-00000.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-046-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-047-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00002.npy 66 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00000.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00002.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00003.npy 69 | - name: dclm-codeprose-v0 70 | target_ratio: 0.15 71 | paths: 72 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-00-00000.npy 73 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-01-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-02-00000.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-03-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-04-00000.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-05-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-06-00000.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-07-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-08-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-09-00000.npy 82 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-10-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-11-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-12-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-13-00000.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-14-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-15-00000.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-16-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-17-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-18-00000.npy 91 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-19-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-20-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-21-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-22-00000.npy 95 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-23-00000.npy 96 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-24-00000.npy 97 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-25-00000.npy 98 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-26-00000.npy 99 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-27-00000.npy 100 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-28-00000.npy 101 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-29-00000.npy 102 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-30-00000.npy 103 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-1b-dclm-dolma2-anneal-10b.yaml: -------------------------------------------------------------------------------- 1 | name: "learn2code-linear-nowup-anneal-10B" 2 | description: "OLMo2 1b anneal to 10B Tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_1B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | learning_rate: 1.8e-3 17 | cluster: ai2/jupiter-cirrascale-2 18 | rank_microbatch_size: 32768 19 | scheduler_type: linear 20 | warmup_steps: 0 21 | annealing: true 22 | model_overrides: 23 | - block.feed_forward.hidden_size=5632 24 | load_path: weka://oe-training-default/ai2-llm/checkpoints/ai2-tylerm/olmo-cookbook-1b-5xC-love2code-no-prose-hlr-00203459/step53971/ 25 | load_state: false 26 | weka: true 27 | dataset: 28 | sources: 29 | - name: dclm-baseline-20pct-dolma2 30 | target_ratio: 1.0 31 | paths: 32 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 33 | 34 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-code-dolma2-anneal-10b-augusta.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-learn2code-linear-nowup-anneal-10B" 2 | description: "OLMo2 7b anneal to 10B Tokens on code data" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: python 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/the-stack-v2/love2code/v0/heuristic_filtered_minhash_plpartition/python/dolma2-tokenizer/*.npy 31 | - name: rust 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/the-stack-v2/love2code/v0/heuristic_filtered_minhash_plpartition/rust/dolma2-tokenizer/*.npy 35 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-dclm-only-anneal-10b-control.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_dclm-only_control" 2 | description: "OLMo2 7b anneal to 10B Tokens 
for dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: dclm-baseline-olmo2 28 | target_ratio: 1.0 29 | paths: 30 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-finemath3p-anneal-10b-50split-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_finemath-3plus-dclm" 2 | description: "OLMo2 7b anneal to 10B Tokens for finemath-3plus + dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: finemath-3plus 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy 31 | - name: dclm-baseline-olmo2 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-finemath3p-anneal-10b-50split.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_finemath-3plus" 2 | description: "OLMo2 7b anneal to 10B Tokens for finemath-3plus" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: finemath-3plus 28 | target_ratio: 0.5 29 | paths: 30 
| - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy 31 | - name: base # Survivors | 5.59B tokens 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v2_subsamples/madlad_ablations_v1/ingredients/survivors/tokens/*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b-50split-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat-dclm" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links + dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | - name: dclm-baseline-olmo2 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b-50split.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | - name: base # Survivors | 5.59B tokens 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v2_subsamples/madlad_ablations_v1/ingredients/survivors/tokens/*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b.yaml: 
-------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 1.0 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-1xC-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-1xC-dclm-001" 2 | description: "Example olmo-cookbook recipe" 3 | budget: "ai2/oe-data" 4 | workspace: "ai2/dolma2" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 100_000_000 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: high 14 | cluster: ai2/saturn-cirrascale 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: dclm-baseline-ft7pct-fw2 19 | target_ratio: 1.0 20 | paths: 21 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy 22 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy 23 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy 24 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy 25 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy 26 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy 27 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy 28 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy 29 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy 30 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy 31 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy 32 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy 33 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy 34 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy 35 | - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy 36 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy 37 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy 38 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy 39 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy 40 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy 41 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy 42 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy 43 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy 44 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy 45 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy 46 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy 47 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy 48 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy 49 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy 50 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy 51 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy 52 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy 53 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy 54 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy 55 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy 56 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy 57 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy 58 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy 59 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy 60 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy 61 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy 62 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy 63 | - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy 64 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy 65 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy 66 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy 67 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy 68 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy 69 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy 70 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy 71 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy 72 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy 73 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy 74 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy 75 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy 76 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy 77 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy 78 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy 79 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy 80 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy 81 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy 82 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy 83 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy 84 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy 85 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy 86 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-180k-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-180k-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | scheduler_type: wsd 19 | 
cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-dolma2-180k 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-180k.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-180k" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | weka: true 20 | # downstream_evaluators: 21 | # - olmo2_dev_1b 22 | dataset: 23 | sources: 24 | - name: dclm-baseline-20pct-dolma2-180k 25 | target_ratio: 1.0 26 | paths: 27 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 28 | 29 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-augusta.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-augusta-test" 2 | description: "" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 1_000_000_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | cluster: ai2/augusta-google-1 17 | activation_checkpointing: true 18 | eval_interval: 10 19 | dataset: 20 | sources: 21 | - name: gs-test 22 | target_ratio: 1.0 23 | paths: 24 | - gs://ai2-llm/preprocessed/dclm/love2code_codeprose/codeprose/*.npy 25 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_1B_v2" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | scheduler_type: wsd 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | rank_microbatch_size: 32768 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-dolma2 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2.yaml: 
-------------------------------------------------------------------------------- 1 | name: "olmo2-1b-dclm-dolma2" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_1B_v2" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/jupiter-cirrascale-2 17 | rank_microbatch_size: 32768 18 | weka: true 19 | dataset: 20 | sources: 21 | - name: dclm-baseline-20pct-dolma2 22 | target_ratio: 1.0 23 | paths: 24 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 25 | 26 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-superbpe-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-superbpe-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 3072000 10 | rank_microbatch_size: 24000 11 | sequence_length: 3000 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6100 18 | scheduler_type: wsd 19 | learning_rate: 1.8e-3 20 | cluster: ai2/jupiter-cirrascale-2 21 | weka: true 22 | # downstream_evaluators: 23 | # - olmo2_dev_1b 24 | dataset: 25 | sources: 26 | - name: dclm-baseline-20pct-superbpe 27 | target_ratio: 1.0 28 | paths: 29 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 30 | 31 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-superbpe" 2 | description: "OLMo2 1b@5xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6100 18 | learning_rate: 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-superbpe 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-dolma2-180k.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-hlr-dolma2-180k" 2 | description: "OLMo2 7b@1xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 144_284_139_520 # 
7_214_206_976 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6880 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-dolma2-180k 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-dolma2" 2 | description: "OLMo2 7b@1xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 137_751_511_040 # 6,887,575,552 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6568 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-dolma2 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-hlr-superbpe" 2 | description: "OLMo2 7b@1xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 144_284_139_520 # 7_214_206_976 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6880 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-superbpe 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3-evals/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation practices for OLMo 3 development 2 | 3 | 4 | ## In-loop evaluation 5 | 6 | For **OLMo 3 7B** integration tests, use this set of evals, baked into the OLMo 3 7B config in OLMo-Core: https://github.com/allenai/OLMo-core/blob/a91a82e6b8b37103f738e190cdddd6278f2c7f1f/src/scripts/train/OLMo3-7B.py#L113. In summary: 7 | * Don't slow down the training run: 8 | * Prune to a minimal set of tasks and use a fast version of MC. This is captured in OLMo Core as the `fast` set.
See PR: https://github.com/allenai/OLMo-core/pull/282 9 | * Focus on BPB + MC, ignoring RC: 10 | * BPB should spot any major issues early in training, before MC takeoff 11 | * Still track MC because we want to make sure there is sensible metric takeoff. For an example of OLMo 3 7B MC metric takeoff slightly after 150B tokens, see https://wandb.ai/ai2-llm/olmo3/reports/OLMo-3-vs-OLMo-2--VmlldzoxMjc2MTA4Mw 12 | 13 | 14 | For **OLMo 2 1B 5xC** or **OLMo 2 7B annealing** runs, which we are still using for data ablations, the broad recommendation is to rely more on offline eval, which always has the latest state of evals. 15 | 16 | Some more notes about in-loop eval: 17 | * If you want to add an in-loop eval, the repo is here: https://github.com/allenai/OLMo-in-loop-evals 18 | * When selecting metrics in Wandb, be very careful about whether you are selecting `{dev|test}`, `{rc|mc}`, `{length-normalized-accuracy|length-normalized-accuracy v2}`, `{BPB|BPB v2}`, `{5shot|5shot_fast}`. 19 | * The `v2` tasks fix the length normalization; the original versions are reported for backwards compatibility. 20 | * `5shot_fast` and `5shot` will give the same numbers; the `_fast` implementation uses one forward pass for the `A/B/C/D` tokens in MCQA tasks. 21 | 22 | 23 | ## Offline evaluation (1B or smaller) 24 | 25 | For all OLMo models (+ external baselines), this is a running list of evals we care about. 26 | 27 | For **OLMo 2 1B 5xC** runs, it's still good practice to look at both BPB & RC numbers, which usually track each other; MC numbers typically haven't broken through noise at this point. 28 | 29 | The command to run an eval looks like: 30 | 31 | ```bash 32 | CHECKPOINT="/oe-training-default/ai2-llm/checkpoints/mayeec/olmo-cookbook-core-v2-1bv2-5xC-dclm-baseline-topic-classified-sample-natural-28f8e9a9/step61000-hf" 33 | CLUSTER="l40" 34 | NUM_GPUS=1 35 | PARTITION=8 36 | PRIORITY="high" 37 | MODEL_ARGS="dtype=bfloat16" 38 | DASHBOARD="olmo-3-evals" 39 | WORKSPACE="ai2/olmo-3-evals" 40 | 41 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 42 | --tasks "olmo3:dev:1b:main" \ 43 | --priority "$PRIORITY" \ 44 | --cluster "$CLUSTER" \ 45 | --num-gpus "$NUM_GPUS" \ 46 | --model-backend vllm \ 47 | --model-args "$MODEL_ARGS" \ 48 | --partition-size "$PARTITION" \ 49 | --dashboard "$DASHBOARD" \ 50 | --workspace "$WORKSPACE" 51 | 52 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 53 | --tasks "olmo3:dev:1b:main:hf" \ 54 | --priority "$PRIORITY" \ 55 | --cluster "$CLUSTER" \ 56 | --num-gpus "$NUM_GPUS" \ 57 | --model-backend hf \ 58 | --model-args "$MODEL_ARGS" \ 59 | --partition-size "$PARTITION" \ 60 | --dashboard "$DASHBOARD" \ 61 | --workspace "$WORKSPACE" 62 | ``` 63 | 64 | Notes: 65 | * Task names are collected here: https://github.com/allenai/olmo-cookbook/blob/main/src/cookbook/eval/named_tasks.py 66 | 67 | *How long does it take?* 68 | * `olmo3:dev:1b:main` is a full suite of 20 tasks, each with multiple metrics, and some tasks are families with multiple subtasks. In total, this is around 150 metrics. It takes about 2 hours to run all of them with `--partition-size 8` and `num-gpus 1` on a single L40 (launches 5 jobs). 69 | * `olmo3:dev:1b:main:hf` consists of two masked PPL evals. It takes about 1 hour to run both on a single L40. 70 | 71 | To pull dashboard results (use `--format json` to see full results): 72 | 73 | ```bash 74 | olmo-cookbook-eval results \ 75 | --dashboard olmo-3-evals \ 76 | --tasks olmo3:dev:1b:main \ 77 | --tasks olmo3:dev:1b:main:hf \ 78 | --format json | jq '.' | less 79 | ``` 80 | 81 | 82 | *Notes* 83 | * If you want to check whether the datalake uploading job ran, substitute your Beaker experiment ID into this URL (see the example below): `https://oe-eval-datalake.allen.ai/greenlake/metadata/01JWMGNY3G3R5N91NW9TCKF6FB`.
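84 | 85 | For example, a quick check from the command line might look like the following (a minimal sketch; it assumes `curl` and `jq` are installed, and the experiment ID shown is just a placeholder for your own): 86 | 87 | ```bash 88 | # Hypothetical check: fetch the datalake metadata record for a Beaker experiment ID. 89 | curl -s "https://oe-eval-datalake.allen.ai/greenlake/metadata/01JWMGNY3G3R5N91NW9TCKF6FB" | jq '.' 90 | ``` 91 |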
92 | ## Offline evaluation (7B or larger) 93 | 94 | To launch the base evals for large training runs, we have a separate set. We assume these models are better able to follow task formats, so this set removes BPB and includes MC: 95 | 96 | ```sh 97 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 98 | --tasks "olmo3:dev:7b:main" \ 99 | --model-backend vllm \ 100 | ... 101 | ``` 102 | 103 | And pull your results with: 104 | 105 | ```bash 106 | olmo-cookbook-eval results \ 107 | --dashboard olmo-3-evals \ 108 | --tasks olmo3:dev:7b:main \ 109 | --format json | jq '.' | less 110 | ``` 111 | 112 | *How long does it take?* 113 | * `olmo3:dev:7b:main` takes roughly 30 minutes to run in full with `--partition-size 1` and `num-gpus 2` on L40s. 114 | 115 | ## Offline evaluation (midtraining) 116 | 117 | We have an additional set of adapt evals formatted for mid-trained models; these are meant to be run **in addition** to the base evals. Please make sure to add the extra arguments shown below to use a basic chat template for base models: 118 | 119 | ```sh 120 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 121 | --tasks olmo3:dev:midtrain:v0 \ 122 | --model-backend vllm \ 123 | --no-compute-gold-bpb \ 124 | --model-args chat_template=basic_answer \ 125 | --use-gantry \ 126 | --gantry-args env-secret="OPENAI_API_KEY=openai_api_key" \ 127 | --task-args chat_overrides="{\"generation_kwargs\": {\"stop_sequences\": [\"Problem:\", \"Answer:\", \"Question:\", \"\", \"<|eot_id|>\"]}}" 128 | ... 129 | ``` 130 | 131 | And pull your results with: 132 | 133 | ```bash 134 | olmo-cookbook-eval results \ 135 | --dashboard olmo-3-evals \ 136 | --tasks olmo3:dev:midtrain:v0 \ 137 | --format json | jq '.' | less 138 | ``` 139 | 140 | Please refer to #oe-midtraining for more documentation. 141 | 142 | *How long does it take?* 143 | * `olmo3:dev:midtrain:v0` takes roughly 20 minutes using `--partition-size 1` and `num-gpus 2` with L40s. 144 | 145 | ## FAQs 146 | 147 | 1. **Why leave out GSM8k for base?** The task is odd and appears to be moved mainly by hill-climbing with mid-training data. Minerva seems to cover a greater range. 148 | 149 | 2. **BPB vs RC vs MC?** This is still debated among the team, but the eventual goal should be to move toward BPB numbers that we trust for our experiments, while monitoring MC (or our final target end-task format) on 7B+ runs for metric breakthrough moments & final scores. 150 | 151 | ### RC vs. MC 152 | 153 | For 7B+ runs, our development evals replace the RC formulation of multiple-choice tasks with the MC formulation. 154 | 155 | Summarizing our conversation with the team: we considered three options for calculating both formats and aggregating the results (a toy illustration follows the list): 156 | 157 | 1. Calculate `max(rc, mc)` -- This isn't desirable. Imagine two ablations -- one consistently has a very high RC and the other a very high MC. This is not a behavior we want from our metric. 158 | 2. Calculate `avg(rc, mc)` -- This isn't desirable. At the small scale (when models get random-chance MC) we are artificially penalizing performance. At the large scale (when models get lower RC than MC, because RC does not let the model see the distractor options and is therefore a slightly more difficult task config) we are artificially penalizing performance. 159 | 3. Keep `mc` only -- We choose this. There is agreement that MC is a better task format, and that the issues caused by aggregation are not worth the benefit of accounting for two task formats.
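160 | 161 | As a toy illustration of how the first two options misbehave (hypothetical scores, not numbers from any real run): 162 | 163 | ```python 164 | rc, mc = 0.62, 0.26  # e.g. a small-scale model: informative RC, near-random-chance MC 165 | print(max(rc, mc))    # 0.62 -- rewards whichever format happens to be high 166 | print((rc + mc) / 2)  # 0.44 -- artificially penalizes at both small and large scale 167 | print(mc)             # 0.26 -- what we keep: the single format we ultimately target 168 | ``` 169 |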
170 | Additionally, we observed empirically that the MC tasks better ranked models w.r.t. training compute. For more discussion, see [Figure 1 of the OLMES paper](https://arxiv.org/pdf/2406.08446?page=7). 171 | 172 | ## TODOs 173 | 174 | 1. Want MMLU subcategories to also be pulled as part of the dashboard pull. 175 | 2. Want to add more evals. There are 3 themes: 176 | * Existing evals that others use, but we don't. Candidates include LAMBADA, NQ, SQuAD, TriviaQA, MedMCQA, MedQA, etc. 177 | * Existing evals that need to be fixed in some way. Candidates include converting Gen2MC format, data augmentation for PiQA, SIQA, CSQA, OBQA, Winogrande, simplified versions of hard tasks like SimpleQA, etc. 178 | * New evals that capture something we aren't evaluating today but think it is important to capture. Candidates include legal document tasks, science IE/Summ tasks, structured input (tables) reasoning tasks, perplexity over gold reasoning chains, etc. 179 | 3. Want to add some stats testing, or some notion of noise. 180 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-microanneal_web-code-reasoning" 2 | description: "OLMo2 7b microanneal to 10B Tokens on web + code + reasoning data" 3 | budget: "ai2/oe-base" 4 | workspace: "ai2/olmo-3-microanneals" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: web 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy 31 | - name: code 32 | target_ratio: 0.4 33 | paths: 34 | - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy 35 | - name: reasoning 36 | target_ratio: 0.1 37 | paths: 38 | - gs://ai2-llm/preprocessed/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_natural.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.014765319631599511 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.011240832719837508 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.030561549604003226 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.02716702787570161 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.01097654207606029 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.06613278917361155 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight":
0.008628080822748758 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.07745446670203346 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.019951347751195767 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.051721512912070444 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.07827944177883765 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.033290089550114574 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.02602595762618607 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.004413683853722294 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.07559574213897882 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.16428824015945423 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.042119899145531485 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.11054985278398685 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.03756325792313331 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.02068057269775392 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.03122990602438789 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.03400017727030217 89 | }, 90 | { 91 | "domain": "transportation", 92 | "weight": 0.013721828887038707 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.009641880891709906 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_pstar_001.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.00026734253853351287 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.0008791647328983347 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.0003445385964403161 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.002800955150175096 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.00035335013072883227 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.001974562965447597 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight": 0.00004713420964224449 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.0016387611257735 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.07688426723105127 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.0005008265605007255 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.15269241254530494 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.0009123174419980062 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.050373880217286074 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.020135591340543595 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.0017867234094703585 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.0010592059864753477 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.001744181597964507 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.41123062705335955 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.01865278057032931 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.00024003225357094727 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.2546200267035965 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.00023863135040023465 89 | }, 90 | { 91 | 
"domain": "transportation", 92 | "weight": 0.00020485519629886065 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.00041783109221028316 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_pstar_002.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.0000497999482706765 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.0008890156319389095 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.016186555467287898 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.000810837888896179 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.0007234640858286977 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.007313197890536659 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight": 0.0000411900615375631 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.0002518954652144411 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.003309783527164619 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.0017901306583973666 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.08071548886090384 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.0004153629844012423 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.008164976242805017 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.04250079499662069 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.12352451996832144 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.001356369544468529 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.0005276023343452912 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.3859178478770352 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.022827908810393156 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.0008462566212385966 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.2563807300394595 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.04400562004176248 89 | }, 90 | { 91 | "domain": "transportation", 92 | "weight": 0.0011793381768489277 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.0002713128763231578 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dist-plot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | from pathlib import Path 6 | 7 | # Define the paths to the JSON files 8 | json_files = ["dclm_natural.json", "dclm_pstar_001.json", "dclm_pstar_002.json"] 9 | 10 | # Load and combine data from all JSON files 11 | all_data = [] 12 | for json_file in json_files: 13 | with open(json_file, "r") as f: 14 | data = json.load(f) 15 | 16 | # Extract filename without extension for mix label 17 | mix_name = Path(json_file).stem 18 | 19 | # Add mix information to each domain entry 20 | for item in data: 21 | item["mix"] = mix_name 22 | all_data.append(item) 23 | 24 | # Convert to DataFrame 25 | df = pd.DataFrame(all_data) 26 | 27 | # Set up the plot style 28 | plt.figure(figsize=(16, 10)) 29 | sns.set_style("whitegrid") 30 | 31 | # Create histogram with color coding by mix and better spacing 32 
| ax = sns.barplot(data=df, x="domain", y="weight", hue="mix", palette="Set2") 33 | 34 | # Add spacing between domain groups 35 | ax.tick_params(axis="x", which="major", pad=10) 36 | 37 | # Customize the plot 38 | plt.title("Domain Weight Distribution Across DCLM Mixes", fontsize=16, fontweight="bold", pad=20) 39 | plt.xlabel("Domain", fontsize=12, labelpad=15) 40 | plt.ylabel("Weight", fontsize=12, labelpad=15) 41 | plt.xticks(rotation=45, ha="right", fontsize=10) 42 | plt.yticks(fontsize=10) 43 | 44 | # Add a dashed vertical line at the left edge of each domain group for better readability 45 | for i in range(len(df["domain"].unique())): 46 | plt.axvline(x=i - 0.5, color="gray", linestyle="--", alpha=0.3, linewidth=0.8) 47 | 48 | # Improve legend positioning and styling 49 | plt.legend(title="Mix", bbox_to_anchor=(1.02, 1), loc="upper left", frameon=True, fancybox=True, shadow=True) 50 | 51 | # Adjust layout to prevent label cutoff with more padding 52 | plt.tight_layout(pad=2.0) 53 | 54 | # Show the plot 55 | plt.show() 56 | 57 | # Print summary statistics 58 | print("\nSummary Statistics by Mix:") 59 | print(df.groupby("mix")["weight"].agg(["count", "mean", "std", "min", "max"])) 60 | 61 | # Print top domains by weight for each mix 62 | print("\nTop 5 domains by weight for each mix:") 63 | for mix in df["mix"].unique(): 64 | mix_data = df[df["mix"] == mix].nlargest(5, "weight") 65 | print(f"\n{mix}:") 66 | for _, row in mix_data.iterrows(): 67 | print(f" {row['domain']}: {row['weight']:.4f}") 68 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-001-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-001-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline p* mix" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.00026734253853351287 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.0008791647328983347 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.0003445385964403161 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.002800955150175096 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.00035335013072883227 45 | repetition_factor: 2.0 46 | paths: 47 | -
"s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.001974562965447597 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.00004713420964224449 55 | repetition_factor: 2.0 56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.0016387611257735 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 0.07688426723105127 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.0005008265605007255 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.15269241254530494 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.0009123174419980062 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.050373880217286074 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.020135591340543595 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.0017867234094703585 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.0010592059864753477 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.001744181597964507 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.41123062705335955 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.01865278057032931 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.00024003225357094727 120 | 
repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.2546200267035965 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 0.00023863135040023465 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.00020485519629886065 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: travel_and_tourism 139 | target_ratio: 0.00041783109221028316 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-002-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-002-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline p* mix" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.0000497999482706765 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.0008890156319389095 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.016186555467287898 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.000810837888896179 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.0007234640858286977 45 | repetition_factor: 2.0 46 | paths: 47 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.007313197890536659 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.0000411900615375631 55 | repetition_factor: 2.0 
56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.0002518954652144411 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 0.003309783527164619 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.0017901306583973666 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.08071548886090384 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.0004153629844012423 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.008164976242805017 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.04250079499662069 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.12352451996832144 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.001356369544468529 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.0005276023343452912 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.3859178478770352 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.022827908810393156 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.0008462566212385966 120 | repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.2563807300394595 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 
0.04400562004176248 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.0011793381768489277 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: travel_and_tourism 139 | target_ratio: 0.0002713128763231578 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-natural-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-natural-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline FT classifier natural distribution" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.014765319631599511 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.011240832719837508 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.030561549604003226 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.02716702787570161 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.01097654207606029 45 | repetition_factor: 2.0 46 | paths: 47 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.06613278917361155 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.008628080822748758 55 | repetition_factor: 2.0 56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.07745446670203346 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 
0.019951347751195767 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.051721512912070444 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.07827944177883765 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.033290089550114574 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.02602595762618607 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.004413683853722294 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.07559574213897882 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.16428824015945423 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.042119899145531485 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.11054985278398685 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.03756325792313331 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.02068057269775392 120 | repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.03122990602438789 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 0.03400017727030217 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.013721828887038707 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: 
travel_and_tourism 139 | target_ratio: 0.009641880891709906 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/spring2code/scaling/spring2code-190m-5xC-weka-top15-bpb-hlr-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "spring2code-190m-5xC-top15-hlr-superbpe" 2 | description: "learn2code 190M@5xC top15 languages from the-stack-v2 + dclm prose v2" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 19_000_000_000 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_190M" 12 | tokenizer: "superbpe_experimental" 13 | priority: high 14 | eval_interval: 100 15 | global_batch_size: 1_048_576 # Half of the 1B 16 | learning_rate: 1.8e-3 17 | cluster: ai2/jupiter-cirrascale-2 18 | weka: true 19 | downstream_evaluators: 20 | - codex_humaneval_gold_bpb_0shot 21 | - codex_mbpp_gold_bpb_0shot 22 | dataset: 23 | sources: 24 | - name: the-stack-v2-top15-ai2v0-minhash-10pct-superbpe 25 | target_ratio: 0.85 26 | paths: 27 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/love2code-top15-minhash-plpartition-10pct/allenai/superbpe-olmo3-experimental/**/*.npy 28 | - name: dclm-codeprose-v2-superbpe 29 | target_ratio: 0.15 30 | paths: 31 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/dclm-prose-v2/all/allenai/superbpe-olmo3-experimental/**/*.npy 32 | -------------------------------------------------------------------------------- /src/cookbook/recipes/spring2code/scaling/spring2code-1b-5xC-weka-top15-bpb-hlr-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "spring2code-1b-5xC-top15-hlr-superbpe" 2 | description: "learn2code 1b@5xC top15 languages from the-stack-v2 + dclm prose v2" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 113_184_153_600 9 | rank_microbatch_size: 16_384 # Larger tokens require more memory, so we use a smaller micro batch size.
10 | load_path: /weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/spring2code-1b-5xC-top15-hlr-superbpe-2e15a7da/step13900/ 11 | sequence_length: 2048 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | weka: true 20 | downstream_evaluators: 21 | - codex_humaneval_gold_bpb_0shot 22 | - codex_mbpp_gold_bpb_0shot 23 | dataset: 24 | sources: 25 | - name: the-stack-v2-top15-ai2v0-minhash-10pct-superbpe 26 | target_ratio: 0.85 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/love2code-top15-minhash-plpartition-10pct/allenai/superbpe-olmo3-experimental/**/*.npy 29 | - name: dclm-codeprose-v2-superbpe 30 | target_ratio: 0.15 31 | paths: 32 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/dclm-prose-v2/all/allenai/superbpe-olmo3-experimental/**/*.npy 33 | -------------------------------------------------------------------------------- /src/cookbook/remote/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/remote/__init__.py -------------------------------------------------------------------------------- /src/cookbook/remote/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | from tempfile import TemporaryDirectory 5 | from typing import Any, Generator 6 | 7 | from cookbook.cli.utils import PythonEnv 8 | 9 | from .base import BaseAuthentication, LocatedPath 10 | from .gantry_launcher import GantryLauncher 11 | 12 | 13 | def copy_prefix( 14 | src_path: str, 15 | dst_path: str, 16 | src_credentials: BaseAuthentication | None = None, 17 | dst_credentials: BaseAuthentication | None = None, 18 | *args: Any, 19 | **kwargs: Any, 20 | ): 21 | src_loc = LocatedPath.from_str(src_path) 22 | dst_loc = LocatedPath.from_str(dst_path) 23 | 24 | if src_loc.prot == "gs": 25 | if dst_loc.prot in ("weka", "file"): 26 | from .gcp import GoogleCloudToken, download_gcs_prefix 27 | 28 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 29 | download_gcs_prefix(src_loc.remote, dst_loc.local, credentials=src_credentials, *args, **kwargs) 30 | elif dst_loc.prot == "gs": 31 | from .gcp import GoogleCloudToken, download_gcs_prefix, upload_gcs_prefix 32 | 33 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 34 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 35 | with TemporaryDirectory() as tmp_dir: 36 | download_gcs_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 37 | upload_gcs_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 38 | 39 | elif dst_loc.prot == "s3": 40 | from .aws import AwsCredentials, upload_s3_prefix 41 | from .gcp import GoogleCloudToken, download_gcs_prefix 42 | 43 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 44 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 45 | with TemporaryDirectory() as tmp_dir: 46 | download_gcs_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 47 | upload_s3_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 48 | 49 | elif src_loc.prot == "s3": 50 | if dst_loc.prot in 
("weka", "file"): 51 | from .aws import AwsCredentials, download_s3_prefix 52 | 53 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 54 | download_s3_prefix(src_loc.remote, dst_loc.local, credentials=src_credentials, *args, **kwargs) 55 | elif dst_loc.prot == "s3": 56 | from .aws import AwsCredentials, download_s3_prefix, upload_s3_prefix 57 | 58 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 59 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 60 | with TemporaryDirectory() as tmp_dir: 61 | download_s3_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 62 | upload_s3_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 63 | elif dst_loc.prot == "gs": 64 | from .aws import AwsCredentials, download_s3_prefix 65 | from .gcp import GoogleCloudToken, upload_gcs_prefix 66 | 67 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 68 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 69 | with TemporaryDirectory() as tmp_dir: 70 | download_s3_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 71 | upload_gcs_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 72 | 73 | elif src_loc.prot in ("weka", "file"): 74 | if dst_loc.prot in ("weka", "file"): 75 | # local copy 76 | shutil.copytree(src_loc.local, dst_loc.local) 77 | elif dst_loc.prot == "gs": 78 | from .gcp import GoogleCloudToken, upload_gcs_prefix 79 | 80 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 81 | upload_gcs_prefix(src_loc.local, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 82 | elif dst_loc.prot == "s3": 83 | from .aws import AwsCredentials, upload_s3_prefix 84 | 85 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 86 | upload_s3_prefix(src_loc.local, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 87 | 88 | else: 89 | raise ValueError(f"{src_loc.prot.upper()} -> {dst_loc.prot.upper()}: not recognized") 90 | 91 | 92 | def push_credentials(gantry_launcher: GantryLauncher, *paths: str): 93 | for path in paths: 94 | loc = LocatedPath.from_str(path) 95 | if loc.prot == "gs": 96 | from .gcp import GoogleCloudToken 97 | 98 | gct = GoogleCloudToken.make() 99 | gantry_launcher.add_env_secret(f"COOKBOOK_AUTH_{loc.hash[:6]}", gct.to_json(), overwrite=True) 100 | elif loc.prot == "s3": 101 | from .aws import AwsCredentials 102 | 103 | aws_creds = AwsCredentials.make() 104 | gantry_launcher.add_env_secret(f"COOKBOOK_AUTH_{loc.hash[:6]}", aws_creds.to_json(), overwrite=True) 105 | 106 | 107 | def pull_credentials(*paths: str) -> Generator[BaseAuthentication | None, Any, Any]: 108 | for path in paths: 109 | loc = LocatedPath.from_str(path) 110 | if loc.prot == "gs": 111 | from .gcp import GoogleCloudToken 112 | 113 | yield GoogleCloudToken.from_json(os.environ[f"COOKBOOK_AUTH_{loc.hash[:6]}"]) 114 | elif loc.prot == "s3": 115 | from .aws import AwsCredentials 116 | 117 | yield AwsCredentials.from_json(os.environ[f"COOKBOOK_AUTH_{loc.hash[:6]}"]) 118 | else: 119 | yield None 120 | 121 | 122 | def main(): 123 | parser = argparse.ArgumentParser("Move prefixes between storage systems") 124 | parser.add_argument("src_path", type=str, help="Source path") 125 | parser.add_argument("dst_path", type=str, help="Destination path") 126 | parser.add_argument("--num-workers", type=int, default=10, help="Number 
of workers") 127 | parser.add_argument("--google-cloud-token", type=str, default=None, help="Google Cloud token") 128 | parser.add_argument("--allow-dirty", action="store_true", help="Allow dirty operations") 129 | parser.add_argument("--budget", type=str, default="ai2/oe-base", help="Budget") 130 | parser.add_argument("--cluster", type=str, default="aus", help="Clusters to run on") 131 | parser.add_argument("--dry-run", action="store_true", help="Dry run") 132 | parser.add_argument("--gpus", type=int, default=0, help="Number of GPUs") 133 | parser.add_argument("--priority", type=str, default="high", help="Priority") 134 | parser.add_argument("--preemptible", action="store_true", help="Preemptible") 135 | parser.add_argument("--workspace", type=str, default="ai2/oe-data", help="Workspace") 136 | parser.add_argument("--local-only", action="store_true", help="Local only") 137 | parser.add_argument( 138 | "--credentials_env_name", type=str, default="COOKBOOK_REMOTE_CREDENTIALS", help="Credentials env name" 139 | ) 140 | args = parser.parse_args() 141 | 142 | if os.environ.get("BEAKER_EXPERIMENT_ID") or args.local_only: 143 | # only pull credentials if running on beaker 144 | source_credentials, destination_credentials = ( 145 | pull_credentials(args.src_path, args.dst_path) if not args.local_only else (None, None) 146 | ) 147 | 148 | copy_prefix( 149 | src_path=args.src_path, 150 | dst_path=args.dst_path, 151 | src_credentials=source_credentials, 152 | dst_credentials=destination_credentials, 153 | num_workers=args.num_workers, 154 | ) 155 | 156 | else: 157 | # running locally, submit to beaker 158 | env = PythonEnv.create("copy-prefix") 159 | bw = GantryLauncher( 160 | allow_dirty=args.allow_dirty, 161 | budget=args.budget, 162 | cluster=args.cluster, 163 | dry_run=args.dry_run, 164 | gpus=args.gpus, 165 | priority=args.priority, 166 | preemptible=args.preemptible, 167 | workspace=args.workspace, 168 | env=env, 169 | ) 170 | 171 | # adds mount if necessary 172 | bw.add_mount(args.src_path) 173 | bw.add_mount(args.dst_path) 174 | 175 | push_credentials(bw, args.src_path, args.dst_path) 176 | 177 | bw.run( 178 | command=f"python -m cookbook.remote '{args.src_path}' '{args.dst_path}'", 179 | description=f"Copying {args.src_path} to {args.dst_path}", 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | main() 185 | -------------------------------------------------------------------------------- /src/cookbook/remote/aws.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from typing import TYPE_CHECKING 6 | from urllib.parse import urlparse 7 | 8 | import boto3 9 | from tqdm import tqdm 10 | 11 | from ..cli.utils import get_aws_access_key_id, get_aws_secret_access_key 12 | from .base import AuthenticationError, BaseAuthentication 13 | 14 | if TYPE_CHECKING: 15 | from mypy_boto3_s3.client import S3Client 16 | 17 | 18 | @dataclass(frozen=True) 19 | class AwsCredentials(BaseAuthentication): 20 | access_key_id: str 21 | secret_access_key: str 22 | 23 | @classmethod 24 | def make(cls) -> "AwsCredentials": 25 | access_key_id = get_aws_access_key_id() 26 | secret_access_key = get_aws_secret_access_key() 27 | if access_key_id is None or secret_access_key is None: 28 | raise AuthenticationError("No AWS credentials found") 29 | return cls(access_key_id=access_key_id, secret_access_key=secret_access_key) 30 | 31 | def 
apply(self) -> boto3.Session: 32 | """Apply the credentials so that it can be used for remote operations.""" 33 | return boto3.Session(aws_access_key_id=self.access_key_id, aws_secret_access_key=self.secret_access_key) 34 | 35 | 36 | def list_objects_with_paginator(bucket_name: str, prefix: str, client: "S3Client"): 37 | """ 38 | List all objects in an S3 bucket using boto3's paginator. 39 | This automatically handles pagination for you. 40 | """ 41 | # Create a paginator for list_objects_v2 42 | paginator = client.get_paginator("list_objects_v2") 43 | 44 | # Configure the pagination parameters 45 | page_iterator = paginator.paginate( 46 | Bucket=bucket_name, 47 | Prefix=prefix, 48 | PaginationConfig={ 49 | "MaxItems": None, # Return all items 50 | "PageSize": 1000, # Number of items per page (max 1000) 51 | }, 52 | ) 53 | 54 | # Iterate through all pages 55 | for page in page_iterator: 56 | if "Contents" in page: 57 | for obj in page["Contents"]: 58 | yield bucket_name, obj["Key"] 59 | 60 | 61 | def download_s3_prefix( 62 | remote_path: str, 63 | local_path: str | Path, 64 | session: boto3.Session | None = None, 65 | num_workers: int | None = None, 66 | credentials: AwsCredentials | None = None, 67 | ): 68 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 69 | assert protocol.startswith("s3"), "Only S3 and S3A protocols are supported" 70 | 71 | client = (credentials.apply() if credentials else (session or boto3.Session())).client("s3") 72 | 73 | # Create a local directory if it doesn't exist 74 | local_path = Path(local_path) 75 | futures = [] 76 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 77 | for bucket, key in list_objects_with_paginator(bucket_name, prefix, client): 78 | local_file_path = local_path / Path(key).relative_to(Path(prefix)) 79 | 80 | def _download_file( 81 | _bucket: str, 82 | _key: str, 83 | _local_file_path: Path, 84 | _client: "S3Client", 85 | ): 86 | _local_file_path.parent.mkdir(parents=True, exist_ok=True) 87 | _client.download_file(_bucket, _key, str(_local_file_path)) 88 | 89 | futures.append( 90 | executor.submit( 91 | _download_file, 92 | _bucket=bucket, 93 | _key=key, 94 | _local_file_path=local_file_path, 95 | _client=client, 96 | ) 97 | ) 98 | 99 | for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading prefix"): 100 | try: 101 | future.result() 102 | except Exception as e: 103 | for future_to_cancel in futures: 104 | future_to_cancel.cancel() 105 | raise e 106 | 107 | 108 | def upload_s3_prefix( 109 | local_path: str | Path, 110 | remote_path: str, 111 | session: boto3.Session | None = None, 112 | num_workers: int | None = None, 113 | credentials: AwsCredentials | None = None, 114 | ): 115 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 116 | assert protocol.startswith("s3"), "Only S3 and S3A protocols are supported" 117 | 118 | client = (credentials.apply() if credentials else (session or boto3.Session())).client("s3") 119 | local_path = Path(local_path).absolute() 120 | 121 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 122 | futures = [] 123 | for dp, _, files in os.walk(str(local_path)): 124 | for fp_str in files: 125 | fp = Path(dp) / fp_str 126 | if not fp.is_file(): 127 | continue 128 | 129 | def _upload_file( 130 | _fp: Path, 131 | _bucket: str, 132 | _key: str, 133 | _client: "S3Client", 134 | ): 135 | _client.upload_file(str(_fp), _bucket, _key) 136 | 137 | futures.append( 138 | 
executor.submit( 139 | _upload_file, 140 | _fp=fp, 141 | _bucket=bucket_name, 142 | _key=f"{prefix}/{fp.relative_to(local_path)}", 143 | _client=client, 144 | ) 145 | ) 146 | 147 | for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading prefix"): 148 | try: 149 | future.result() 150 | except Exception as e: 151 | for future_to_cancel in futures: 152 | future_to_cancel.cancel() 153 | raise e 154 | -------------------------------------------------------------------------------- /src/cookbook/remote/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import asdict, dataclass 3 | from hashlib import md5 4 | from pathlib import Path 5 | from typing import Any, Generic, Literal, Optional, TypeAlias, TypeVar, Union 6 | from urllib.parse import urlparse 7 | 8 | from typing_extensions import Self 9 | 10 | from cookbook.constants import WEKA_MOUNTS 11 | 12 | JSON_VALID_TYPES: TypeAlias = Union[str, int, float, bool, list, dict] 13 | 14 | 15 | C = TypeVar("C") 16 | 17 | 18 | @dataclass(frozen=True) 19 | class BaseAuthentication(Generic[C]): 20 | """Base class for all remote authentication classes.""" 21 | 22 | @classmethod 23 | def from_dict(cls, obj: dict[str, JSON_VALID_TYPES]) -> "Self": 24 | """Convert a dictionary to a BaseAuthentication instance.""" 25 | return cls(**obj) 26 | 27 | @classmethod 28 | def _check_dict_types(cls, obj: dict[str, JSON_VALID_TYPES]) -> None: 29 | """Check if the dictionary contains only valid types.""" 30 | for key, value in obj.items(): 31 | if not isinstance(key, str): 32 | raise ValueError(f"Invalid key type: {key!r} (expected str)") 33 | if not isinstance(value, JSON_VALID_TYPES): 34 | raise ValueError(f"Invalid value type: {value!r} (expected {JSON_VALID_TYPES})") 35 | if isinstance(value, dict): 36 | cls._check_dict_types(value) 37 | 38 | def to_dict(self) -> dict[str, JSON_VALID_TYPES]: 39 | """Convert a BaseAuthentication instance to a dictionary.""" 40 | self._check_dict_types(obj := asdict(self)) 41 | return obj 42 | 43 | @classmethod 44 | def from_json(cls, obj: str) -> "Self": 45 | """Convert a JSON string to a BaseAuthentication instance.""" 46 | obj = json.loads(obj) 47 | if not isinstance(obj, dict): 48 | raise ValueError(f"Invalid JSON object: {obj}") 49 | return cls.from_dict(obj) 50 | 51 | def to_json(self) -> str: 52 | """Convert a BaseAuthentication instance to a JSON string.""" 53 | return json.dumps(self.to_dict()) 54 | 55 | @classmethod 56 | def make(cls) -> "Self": 57 | """Create a new credentials instance to be used for remote operations.""" 58 | raise NotImplementedError("Subclasses must implement this method") 59 | 60 | def apply(self, *args: Any, **kwargs: Any) -> C: 61 | """Apply the credentials so that it can be used for remote operations.""" 62 | raise NotImplementedError("Subclasses must implement this method") 63 | 64 | 65 | class AuthenticationError(RuntimeError): 66 | """Error raised when authentication fails.""" 67 | 68 | ... 
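# A minimal sketch (hedged, not part of the public API) of how the pieces above
# fit together for a concrete subclass such as AwsCredentials in aws.py:
#
#     creds = AwsCredentials.make()             # read credentials from the local env
#     payload = creds.to_json()                 # JSON string, safe to ship as a secret
#     restored = AwsCredentials.from_json(payload)
#     client = restored.apply().client("s3")    # boto3.Session -> S3 client
#
# make() and apply() are the two methods every subclass must implement.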
69 | 70 | 71 | @dataclass(frozen=True) 72 | class LocatedPath: 73 | prot: Literal["gs", "s3", "weka", "file"] 74 | path: str 75 | 76 | @property 77 | def hash(self) -> str: 78 | h = md5() 79 | h.update(self.prot.encode()) 80 | h.update(self.path.encode()) 81 | return h.hexdigest() 82 | 83 | @classmethod 84 | def weka_path(cls, path: str | Path) -> Optional["Self"]: 85 | parsed = urlparse(str(path)) 86 | termination = "/" if str(path).endswith("/") else "" 87 | 88 | if parsed.scheme == "weka": 89 | if parsed.netloc not in WEKA_MOUNTS: 90 | raise ValueError(f"Invalid Weka bucket: {parsed.netloc}") 91 | return cls(prot="weka", path=f"/{parsed.netloc.strip('/')}/{parsed.path.lstrip('/')}{termination}") 92 | 93 | # the first part is usually '/' 94 | _, *parts = Path(path).parts 95 | 96 | if parts[0] in WEKA_MOUNTS: 97 | return cls(prot="weka", path="/" + "/".join(parts).strip("/") + termination) 98 | elif parts[0] == "weka" and parts[1] in WEKA_MOUNTS: 99 | return cls(prot="weka", path="/" + "/".join(parts[1:]).strip("/") + termination) 100 | 101 | return None 102 | 103 | @classmethod 104 | def local_path(cls, path: str | Path) -> Optional["Self"]: 105 | parsed = urlparse(str(path)) 106 | termination = "/" if str(path).endswith("/") else "" 107 | if parsed.scheme == "file": 108 | return cls(prot="file", path=f"/{parsed.netloc.strip('/')}/{parsed.path.lstrip('/')}{termination}") 109 | 110 | if parsed.scheme == "": 111 | return cls(prot="file", path=str(path)) 112 | 113 | return None 114 | 115 | @classmethod 116 | def s3_path(cls, path: str | Path) -> Optional["Self"]: 117 | parsed = urlparse(str(path)) 118 | if parsed.scheme.startswith("s3"): 119 | return cls(prot="s3", path=parsed.netloc.strip("/") + "/" + parsed.path.lstrip("/")) 120 | return None 121 | 122 | @classmethod 123 | def gcs_path(cls, path: str | Path) -> Optional["Self"]: 124 | parsed = urlparse(str(path)) 125 | if parsed.scheme in ("gs", "gcs"): 126 | return cls(prot="gs", path=parsed.netloc.strip("/") + "/" + parsed.path.lstrip("/")) 127 | return None 128 | 129 | @classmethod 130 | def from_str(cls, path: str | Path) -> "Self": 131 | if p := cls.weka_path(path): 132 | return p 133 | elif p := cls.local_path(path): 134 | return p 135 | elif p := cls.s3_path(path): 136 | return p 137 | elif p := cls.gcs_path(path): 138 | return p 139 | raise ValueError(f"Invalid path: {path}") 140 | 141 | @property 142 | def local(self) -> Path: 143 | if self.prot in ("weka", "file"): 144 | return Path(self.path) 145 | 146 | raise ValueError(f"Path is not local: {self.path}") 147 | 148 | @property 149 | def remote(self) -> str: 150 | if self.prot == "file": 151 | raise ValueError(f"Path is not remote: {self.path}") 152 | return f"{self.prot}://{self.path.lstrip('/')}" 153 | 154 | @property 155 | def bucket(self) -> str: 156 | remote = self.remote 157 | url = urlparse(remote) 158 | return url.netloc 159 | 160 | @property 161 | def prefix(self) -> str: 162 | remote = self.remote 163 | url = urlparse(remote) 164 | return url.path.lstrip("/") 165 | -------------------------------------------------------------------------------- /src/cookbook/remote/gantry_launcher.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess 3 | from dataclasses import InitVar, dataclass 4 | 5 | from cookbook.cli.utils import ( 6 | PythonEnv, 7 | add_secret_to_beaker_workspace, 8 | install_beaker_py, 9 | ) 10 | from cookbook.utils.clusters import get_matching_clusters 11 | 12 | from .base import LocatedPath 
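# Hedged usage sketch, mirroring how cookbook.remote.__main__ drives this
# launcher; the values below are illustrative, not enforced defaults:
#
#     bw = GantryLauncher(
#         allow_dirty=False, budget="ai2/oe-base", cluster="aus", dry_run=False,
#         gpus=0, priority="high", preemptible=True, workspace="ai2/oe-data",
#     )
#     bw.add_mount("weka://oe-training-default/some/prefix")  # hypothetical path
#     bw.run(command="python -m cookbook.remote ...", description="copy prefix")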
13 | 14 | 15 | @dataclass 16 | class GantryLauncher: 17 | allow_dirty: bool 18 | budget: str 19 | cluster: str 20 | dry_run: bool 21 | gpus: int 22 | priority: str 23 | preemptible: bool 24 | workspace: str 25 | env: InitVar[PythonEnv | None] = None 26 | 27 | def __post_init__(self, env: PythonEnv | None): 28 | self._env = env or PythonEnv.null() 29 | self._flags = [] 30 | 31 | # setup beaker-py 32 | install_beaker_py(env=self._env) 33 | 34 | for cluster in set(get_matching_clusters(self.cluster)): 35 | self._flags.append(f"--cluster {cluster}") 36 | 37 | def add_mount(self, path: str): 38 | located_path = LocatedPath.from_str(path) 39 | if located_path.prot == "weka": 40 | self._flags.append(f"--weka {located_path.bucket}:/{located_path.bucket}") 41 | 42 | def add_env_secret(self, key: str, value: str, overwrite: bool = False): 43 | secret_name = add_secret_to_beaker_workspace( 44 | secret_name=key, 45 | secret_value=value, 46 | workspace=self.workspace, 47 | env=self._env, # pyright: ignore 48 | overwrite=overwrite, 49 | ) 50 | self._flags.append(f"--env-secret {key}={secret_name}") 51 | 52 | def run( 53 | self, 54 | command: str, 55 | description: str, 56 | extra_flags: dict[str, str] | None = None, 57 | ) -> subprocess.CompletedProcess: 58 | 59 | extra_flags = extra_flags or {} 60 | 61 | gantry_command = [ 62 | "gantry run", 63 | f"--description '{description}'", 64 | ("--allow-dirty" if self.allow_dirty else ""), 65 | "--no-python", 66 | f"--workspace {self.workspace}", 67 | f"--priority {self.priority}", 68 | f"--gpus {self.gpus}", 69 | ("--preemptible" if self.preemptible else ""), 70 | f"--budget {self.budget}", 71 | "--yes", 72 | ("--dry-run" if self.dry_run else ""), 73 | " ".join(self._flags), 74 | " ".join(f"--{k} {v}" for k, v in extra_flags.items()), 75 | f"-- /bin/bash -c 'pip install uv && uv pip install . 
--system && {command}'", 76 | ] 77 | gantry_command_str = " ".join(gantry_command) 78 | 79 | print(f"Submitting to beaker with command: {gantry_command_str}") 80 | return subprocess.run(shlex.split(gantry_command_str), check=True, env=self._env.path()) 81 | -------------------------------------------------------------------------------- /src/cookbook/remote/gcp.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from urllib.parse import urlparse 7 | 8 | from google.auth import default 9 | from google.auth.transport.requests import Request 10 | from google.cloud import storage 11 | from google.oauth2.credentials import Credentials 12 | from tqdm import tqdm 13 | 14 | from .base import JSON_VALID_TYPES, AuthenticationError, BaseAuthentication 15 | 16 | 17 | @dataclass(frozen=True) 18 | class GoogleCloudToken(BaseAuthentication): 19 | token: str 20 | project_id: str 21 | expiry: datetime.datetime | None 22 | 23 | @classmethod 24 | def from_dict(cls, obj: dict[str, JSON_VALID_TYPES]) -> "GoogleCloudToken": 25 | parsed_obj = { 26 | "token": obj["token"], 27 | "expiry": datetime.datetime.fromisoformat(e) if isinstance(e := obj.get("expiry", None), str) else e, 28 | "project_id": obj["project_id"], 29 | } 30 | return super().from_dict(parsed_obj) 31 | 32 | def to_dict(self) -> dict[str, JSON_VALID_TYPES]: 33 | obj = { 34 | "token": self.token, 35 | "expiry": self.expiry.isoformat() if self.expiry else None, 36 | "project_id": self.project_id, 37 | } 38 | return obj 39 | 40 | @classmethod 41 | def make(cls) -> "GoogleCloudToken": 42 | """Generate a short-lived token for GCS access.""" 43 | credentials, project_id = default() 44 | if not credentials.valid: # pyright: ignore 45 | credentials.refresh(Request()) # pyright: ignore 46 | 47 | return cls(token=credentials.token, project_id=project_id, expiry=credentials.expiry) # pyright: ignore 48 | 49 | def apply(self) -> storage.Client: 50 | """Apply the credentials so that it can be used for remote operations.""" 51 | 52 | if self.expiry is not None and datetime.datetime.utcnow() > self.expiry: # google-auth expiry is naive UTC 53 | raise AuthenticationError("Token expired!") 54 | 55 | credentials = Credentials(self.token) 56 | return storage.Client(credentials=credentials, project=self.project_id) 57 | 58 | 59 | def download_gcs_prefix( 60 | remote_path: str, 61 | local_path: str | Path, 62 | client: storage.Client | None = None, 63 | num_workers: int | None = None, 64 | credentials: GoogleCloudToken | None = None, 65 | ): 66 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 67 | assert protocol in ("gs", "gcs"), "Only GCS and GS protocols are supported" 68 | 69 | client = credentials.apply() if credentials else (client or storage.Client()) 70 | 71 | local_path = Path(local_path) 72 | blobs = client.list_blobs(bucket_name, prefix=prefix) 73 | futures = [] 74 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 75 | for blob in blobs: 76 | local_file_path = local_path / Path(blob.name).relative_to(Path(prefix)) 77 | 78 | def _download_file( 79 | _blob: storage.Blob, 80 | _local_file_path: Path, 81 | _expiration_time: datetime.datetime | None, 82 | ): 83 | if _expiration_time is not None and datetime.datetime.utcnow() > _expiration_time: # naive UTC, as above 84 | raise RuntimeError("Token expired!") 85 | 86 | _local_file_path.parent.mkdir(parents=True,
exist_ok=True) 87 | _blob.download_to_filename(str(_local_file_path)) 88 | 89 | futures.append( 90 | executor.submit( 91 | _download_file, 92 | _blob=blob, 93 | _local_file_path=local_file_path, 94 | _expiration_time=credentials.expiry if credentials else None, 95 | ) 96 | ) 97 | 98 | for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading prefix"): 99 | try: 100 | future.result() 101 | except Exception as e: 102 | for future_to_cancel in futures: 103 | future_to_cancel.cancel() 104 | raise e 105 | 106 | 107 | def upload_gcs_prefix( 108 | local_path: str | Path, 109 | remote_path: str, 110 | client: storage.Client | None = None, 111 | num_workers: int | None = None, 112 | credentials: GoogleCloudToken | None = None, 113 | ): 114 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 115 | assert protocol in ("gs", "gcs"), "Only GCS and GS protocols are supported" 116 | 117 | client = credentials.apply() if credentials else (client or storage.Client()) 118 | local_path = Path(local_path).absolute() 119 | 120 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 121 | futures = [] 122 | for dp, _, files in os.walk(str(local_path)): 123 | for fp_str in files: 124 | fp = Path(dp) / fp_str 125 | if not fp.is_file(): 126 | continue 127 | 128 | bucket = client.bucket(bucket_name) 129 | blob = bucket.blob(f"{prefix}/{fp.relative_to(local_path)}") 130 | 131 | def _upload_file( 132 | _fp: Path, 133 | _blob: storage.Blob, 134 | _expiration_time: datetime.datetime | None, 135 | ): 136 | if _expiration_time is not None and datetime.datetime.utcnow() > _expiration_time: # naive UTC, as above 137 | raise RuntimeError("Token expired!") 138 | 139 | _blob.upload_from_filename(str(_fp)) 140 | 141 | futures.append( 142 | executor.submit( 143 | _upload_file, 144 | _fp=fp, 145 | _blob=blob, 146 | _expiration_time=credentials.expiry if credentials else None, 147 | ) 148 | ) 149 | 150 | for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading prefix"): 151 | try: 152 | future.result() 153 | except Exception as e: 154 | for future_to_cancel in futures: 155 | future_to_cancel.cancel() 156 | raise e 157 | -------------------------------------------------------------------------------- /src/cookbook/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import click 5 | from olmo_core.train import prepare_training_environment, teardown_training_environment 6 | from torch.distributed.elastic.multiprocessing.errors import record 7 | 8 | from cookbook.utils.config import build_train_config 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @click.group() 14 | def cli(): 15 | pass 16 | 17 | 18 | @cli.command() 19 | @click.option( 20 | "--run-name", 21 | "-n", 22 | type=str, 23 | help="Name of the run", 24 | required=True, 25 | ) 26 | @click.option( 27 | "--group-id", 28 | "-g", 29 | type=str, 30 | help="Group ID for the experiment", 31 | ) 32 | @click.option( 33 | "--beaker-user", 34 | "-u", 35 | type=str, 36 | help="Beaker user", 37 | ) 38 | @click.option( 39 | "--config-path", 40 | "-C", 41 | type=click.Path(exists=True), 42 | required=True, 43 | help="Relative path to the experiment configuration file.", 44 | ) 45 | @record 46 | def train( 47 | run_name: str, 48 | group_id: str, 49 | beaker_user: str, 50 | config_path: Path, 51 | ): 52 | trainer = build_train_config(config_path, run_name, group_id, beaker_user) 53 | 54 | if trainer is None: 55
| logger.error("Failed to build training config! Exiting...") 56 | raise click.Abort() 57 | 58 | trainer.fit() 59 | 60 | 61 | if __name__ == "__main__": 62 | try: 63 | prepare_training_environment() 64 | cli() 65 | finally: 66 | teardown_training_environment() 67 | -------------------------------------------------------------------------------- /src/cookbook/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/utils/__init__.py -------------------------------------------------------------------------------- /src/cookbook/utils/clusters.py: -------------------------------------------------------------------------------- 1 | from cookbook.constants import BEAKER_KNOWN_CLUSTERS, NEW_CLUSTER_ALIASES 2 | 3 | 4 | def get_matching_clusters(cluster: str) -> list[str]: 5 | """ 6 | Resolve a cluster alias to the actual Beaker cluster name(s); known aliases 7 | expand to their canonical names, and unknown names are returned as-is. 8 | """ 9 | if cluster in NEW_CLUSTER_ALIASES: 10 | cluster = NEW_CLUSTER_ALIASES[cluster] 11 | 12 | if cluster in BEAKER_KNOWN_CLUSTERS: 13 | return BEAKER_KNOWN_CLUSTERS[cluster] 14 | 15 | return [cluster] 16 | 17 | 18 | def is_gcs_cluster(cluster: str) -> bool: 19 | """ 20 | Check whether a cluster has native GCS support, meaning jobs running on it can 21 | reach GCS without Google credentials being pushed to them. 22 | """ 23 | 24 | canonical_names = get_matching_clusters(cluster) 25 | 26 | if all(cluster_name in BEAKER_KNOWN_CLUSTERS["goog"] for cluster_name in canonical_names): 27 | return True 28 | 29 | return False 30 | 31 | 32 | def get_known_clusters() -> list[str]: 33 | """ 34 | Return all clusters known to OLMo Cookbook.
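Canonical names are flattened from BEAKER_KNOWN_CLUSTERS, de-duplicated, and returned in sorted order.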
35 | """ 36 | all_clusters = [c for cs in BEAKER_KNOWN_CLUSTERS.values() for c in cs] 37 | return sorted(set(all_clusters)) 38 | -------------------------------------------------------------------------------- /src/cookbook/utils/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import os 4 | from pathlib import Path 5 | from typing import List, Tuple, Union, cast 6 | from urllib.parse import urlparse 7 | 8 | import gcsfs 9 | import s3fs 10 | import yaml 11 | from olmo_core.io import normalize_path 12 | from olmo_core.launch.beaker import ( 13 | BeakerEnvSecret, 14 | BeakerEnvVar, 15 | BeakerLaunchConfig, 16 | BeakerWekaBucket, 17 | ) 18 | from olmo_core.train.callbacks import ConfigSaverCallback, WandBCallback 19 | from olmo_core.utils import seed_all 20 | 21 | from cookbook.aliases import ( 22 | ExperimentConfig, 23 | ExperimentGroup, 24 | ExperimentInstance, 25 | SourceConfig, 26 | SourceInstance, 27 | ) 28 | from cookbook.model.builder import TransformerConfigBuilder 29 | from cookbook.utils.data import normalize_source_paths 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | def config_from_path(config: Path) -> ExperimentConfig: 35 | with open(config, "r") as f: 36 | data = yaml.safe_load(f) 37 | 38 | return ExperimentConfig(**data, path=config) 39 | 40 | 41 | def mk_source_instances( 42 | sources: list[SourceConfig], priors: Tuple[dict[str, float], int] | None = None 43 | ) -> list[SourceInstance]: 44 | if priors: 45 | ratios_by_source, total_tokens = priors 46 | else: 47 | ratios_by_source = {} 48 | 49 | instances = [] 50 | for source in sources: 51 | ratio = source.target_ratio or ratios_by_source[source.name] 52 | instances.append( 53 | SourceInstance( 54 | name=source.name, 55 | paths=source.paths, 56 | ratio=ratio, 57 | repetition_factor=source.repetition_factor, 58 | ) 59 | ) 60 | 61 | return instances 62 | 63 | 64 | def mk_experiments( 65 | config: ExperimentConfig, group_id: str, priors: Tuple[dict[str, float], int] 66 | ) -> list[ExperimentInstance]: 67 | """Generate source instances from a config.""" 68 | return [ 69 | ExperimentInstance( 70 | name=f"{config.name}-{group_id}", 71 | sources=mk_source_instances(config.dataset.sources, priors), 72 | ) 73 | ] 74 | 75 | 76 | def mk_experiment_group( 77 | config: ExperimentConfig, priors: Tuple[dict[str, float], int], group_id: str 78 | ) -> ExperimentGroup: 79 | """Build an experiment group from an experiment config.""" 80 | 81 | return ExperimentGroup( 82 | config=config, 83 | group_id=group_id, 84 | instances=mk_experiments(config, group_id, priors), 85 | ) 86 | 87 | 88 | def mk_instance_cmd( 89 | instance: ExperimentInstance, config: ExperimentConfig, group_id: str, beaker_user: str 90 | ) -> List[str]: 91 | """Build a command for launching an experiment instance.""" 92 | 93 | return [ 94 | "src/cookbook/train.py", 95 | "train", 96 | "-n", 97 | instance.name, 98 | "-g", 99 | group_id, 100 | "-u", 101 | beaker_user, 102 | "-C", 103 | str(config.path), 104 | ] 105 | 106 | 107 | _REMOTE_FS_CACHE: dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] | None = None 108 | 109 | 110 | def remote_fs_cache() -> dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]]: 111 | global _REMOTE_FS_CACHE 112 | if _REMOTE_FS_CACHE is not None: 113 | return _REMOTE_FS_CACHE 114 | 115 | _REMOTE_FS_CACHE = dict( 116 | s3=s3fs.S3FileSystem(), 117 | weka=s3fs.S3FileSystem(client_kwargs={"endpoint_url": os.environ["WEKA_ENDPOINT_URL"]}, profile="WEKA"), 
118 | gs=gcsfs.GCSFileSystem(), 119 | ) 120 | 121 | return _REMOTE_FS_CACHE 122 | 123 | 124 | def build_train_config(config_path: Path, run_name: str, group_id: str, beaker_user: str, dry_run: bool = False): 125 | """ 126 | Launch a training run with the given parameters. 127 | """ 128 | 129 | base_config = config_from_path(config_path) 130 | load_path_fs = None 131 | 132 | if dry_run: 133 | source_paths = base_config.dataset.sources 134 | if base_config.load_path: 135 | try: 136 | load_path_fs = remote_fs_cache()[urlparse(base_config.load_path).scheme] 137 | except KeyError: 138 | raise ValueError(f"Unsupported load path scheme: {base_config.load_path}") 139 | 140 | # When we have a weka path locally we need to treat it like a remote s3 141 | # path and strip the special weka prefix and bucket name 142 | base_config.load_path = normalize_path(base_config.load_path.replace("weka://", "s3://")) 143 | 144 | else: 145 | source_paths = normalize_source_paths(base_config.dataset.sources, expand=True) 146 | 147 | if base_config.load_path: 148 | # When we have a weka path remotely on beaker we need to treat it like a local path since the bucket is mounted 149 | base_config.load_path = normalize_path(base_config.load_path.replace("weka://", "/weka/")) 150 | 151 | source_instances = mk_source_instances(source_paths, None) 152 | dp_world_size = base_config.nodes * base_config.gpus 153 | 154 | config = TransformerConfigBuilder( 155 | beaker_user=beaker_user, 156 | cluster=base_config.cluster, 157 | downstream_evaluators=base_config.downstream_evaluators, 158 | dtype=base_config.dataset.dtype, 159 | eval_interval=base_config.eval_interval, 160 | group_id=group_id.strip(), 161 | lm_evaluator=base_config.lm_evaluator, 162 | max_dp_world_size=dp_world_size, 163 | max_target_sequence_length=base_config.max_target_sequence_length, 164 | max_tokens=base_config.max_tokens, 165 | model_identifier=base_config.model, 166 | run_name=run_name.strip(), 167 | save_interval=base_config.save_interval, 168 | seed=base_config.seed, 169 | sequence_length=base_config.sequence_length, 170 | sources=source_instances, 171 | tokenizer=base_config.tokenizer, 172 | metrics_config=base_config.metrics_config, 173 | weka=base_config.weka, 174 | rank_microbatch_size=base_config.rank_microbatch_size, 175 | global_batch_size=base_config.global_batch_size, 176 | load_path=base_config.load_path, 177 | warmup_steps=base_config.warmup_steps, 178 | learning_rate=base_config.learning_rate, 179 | scheduler_type=base_config.scheduler_type, 180 | annealing=base_config.annealing, 181 | hard_stop=base_config.hard_stop, 182 | model_overrides=base_config.model_overrides, 183 | activation_checkpointing=base_config.activation_checkpointing, 184 | load_path_fs=load_path_fs, 185 | ).build() 186 | 187 | seed_all(config.init_seed) 188 | config_dict = config.as_config_dict() 189 | trainer = None 190 | 191 | if not dry_run: 192 | dataset = config.dataset.build() 193 | model = config.model.build(init_device="meta") 194 | train_module = config.train_module.build(model) 195 | data_loader = config.data_loader.build(dataset, dp_process_group=train_module.dp_process_group) 196 | trainer = config.trainer.build(train_module, data_loader) 197 | 198 | # If we have a load path and there is no checkpoint in the save folder, load the checkpoint from the load path. 
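# In other words, a checkpoint already present in the save folder takes precedence over an explicit load_path.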
199 | if not trainer.maybe_load_checkpoint(trainer.save_folder) and base_config.load_path: 200 | logger.info( 201 | f"Loading checkpoint from {base_config.load_path} and load_trainer_state: {base_config.load_state}" 202 | ) 203 | trainer.load_checkpoint(base_config.load_path, load_trainer_state=base_config.load_state) 204 | 205 | cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict 206 | cast(ConfigSaverCallback, trainer.callbacks["config_saver"]).config = config_dict 207 | 208 | logger.info("Configuration:") 209 | # We log estimated step count here when dry_run is enabled because we're not able to build the trainer on non-CUDA devices 210 | if dry_run: 211 | logger.info( 212 | f"Estimated training steps: {math.ceil(base_config.max_tokens / config.data_loader.global_batch_size):,}" 213 | ) 214 | logger.info(config) 215 | 216 | return trainer 217 | 218 | 219 | def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLaunchConfig]: 220 | """Build a beaker launch config from an experiment group.""" 221 | 222 | weka_buckets: List[BeakerWekaBucket] = [] 223 | if group.config.weka: 224 | weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) 225 | 226 | return [ 227 | BeakerLaunchConfig( 228 | name=f"{experiment.name}", 229 | description=group.config.description, 230 | task_name=experiment.name, 231 | cmd=mk_instance_cmd(experiment, group.config, group.group_id, beaker_user), 232 | clusters=[group.config.cluster], 233 | num_nodes=group.config.nodes, 234 | num_gpus=group.config.gpus, 235 | shared_filesystem=group.config.weka, 236 | allow_dirty=True, 237 | weka_buckets=weka_buckets, 238 | budget=group.config.budget or "ai2/oe-base", 239 | workspace=group.config.workspace, 240 | preemptible=group.config.preemptible, 241 | beaker_image="petew/olmo-core-tch270cu128", 242 | priority=group.config.priority, 243 | env_vars=[BeakerEnvVar(name="NCCL_DEBUG", value="INFO" if group.config.nccl_debug else "WARN")], 244 | env_secrets=[ 245 | BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), 246 | BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), 247 | BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), 248 | BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), 249 | BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), 250 | BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), 251 | BeakerEnvSecret(name="GOOGLE_CLOUD_PROJECT", secret="GOOGLE_CLOUD_PROJECT"), 252 | ], 253 | setup_steps=[ 254 | 'git clone "$REPO_URL"', 255 | "conda shell.bash activate base", 256 | "cd olmo-cookbook", 257 | 'git checkout "$GIT_REF"', 258 | "git submodule update --init --recursive", 259 | "pip install -e '.[all]'", 260 | "pip freeze", 261 | # Move AWS credentials from env to relevant files 262 | "mkdir -p ~/.aws", 263 | "printenv AWS_CONFIG > ~/.aws/config", 264 | "printenv AWS_CREDENTIALS > ~/.aws/credentials", 265 | ], 266 | ) 267 | for experiment in group.instances 268 | ] 269 | -------------------------------------------------------------------------------- /src/cookbook/utils/data.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import hashlib 3 | import json 4 | import logging 5 | import os 6 | import pathlib 7 | from collections import defaultdict 8 | from typing import Any, List, Optional, Tuple, Union 9 | from urllib.parse import urlparse 10 | 11 | 
import gcsfs 12 | import s3fs 13 | from olmo_core.aliases import PathOrStr 14 | from olmo_core.data.types import NumpyDatasetDType 15 | from olmo_core.io import get_file_size, is_url, normalize_path 16 | from olmo_core.utils import OLMoEnvironmentError 17 | from tqdm import tqdm 18 | 19 | from cookbook.aliases import SourceConfig 20 | 21 | logger = logging.getLogger(__name__) 22 | logging.getLogger("botocore").setLevel(logging.WARNING) 23 | 24 | 25 | def _bytes_to_tokens(num_bytes: int, dtype: NumpyDatasetDType) -> int: 26 | """ 27 | Convert bytes to tokens based on the dtype. 28 | """ 29 | npdtype = dtype.as_np_dtype() 30 | return num_bytes // npdtype(int(0)).itemsize 31 | 32 | 33 | def _count_tokens_for_file(path: PathOrStr, dtype: NumpyDatasetDType) -> int: 34 | return _bytes_to_tokens(get_file_size(path), dtype) 35 | 36 | 37 | def get_token_counts_and_ratios( 38 | source_configs: list[SourceConfig], dtype: NumpyDatasetDType, use_cache: bool 39 | ) -> Tuple[dict[str, float], int]: 40 | config_hash = hashlib.md5( 41 | json.dumps( 42 | [(sc.name, sc.paths) for sc in source_configs], 43 | sort_keys=True, 44 | ).encode("utf-8") 45 | ).hexdigest() 46 | 47 | cache_path = pathlib.Path(f"/tmp/olmo-cookbook/priors_cache_{config_hash}.json") 48 | if use_cache: 49 | try: 50 | with open(cache_path, "r") as f: 51 | logger.info( 52 | "Source distribution cache found, using cached values! This can be disabled by setting use_cache=False." 53 | ) 54 | obj = json.load(f) 55 | return (obj["relative_sizes"], obj["total_tokens"]) 56 | except FileNotFoundError: 57 | logger.info("No cache file found, calculating from source files...") 58 | 59 | token_counts = defaultdict(int) 60 | 61 | filesystems = {} 62 | 63 | # Pre-check each source for mixed schemes and create appropriate filesystem clients 64 | for source in source_configs: 65 | schemes = {urlparse(path).scheme for path in source.paths} 66 | 67 | # Check for mixed schemes within a source 68 | if len(schemes) > 1 and any(scheme for scheme in schemes): 69 | raise OLMoEnvironmentError( 70 | f"Mixed URL schemes in source '{source.name}': {schemes}. Each source must use a consistent scheme." 
71 | ) 72 | 73 | # Get the scheme (or None for local paths) 74 | scheme = next(iter(schemes)) if schemes and next(iter(schemes)) else "local" 75 | 76 | if scheme not in filesystems: 77 | filesystems[scheme] = get_filesystem_for_scheme(scheme) 78 | 79 | with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor: 80 | for source in source_configs: 81 | # Get the appropriate filesystem for this source 82 | scheme = next(iter({urlparse(path).scheme for path in source.paths}), "local") 83 | fs = filesystems.get(scheme) 84 | 85 | globs = [path for path in source.paths if "*" in path] 86 | paths = [path for path in source.paths if path not in globs] 87 | source.paths = (paths + expand_globs(fs, globs)) if globs else paths 88 | 89 | futures = { 90 | executor.submit(_count_tokens_for_file, path, dtype): source 91 | for source in source_configs 92 | for path in source.paths 93 | } 94 | 95 | for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 96 | source_future = futures[future] 97 | try: 98 | result = future.result() 99 | token_counts[source_future.name] += result 100 | except Exception as e: 101 | logger.warning(f"Error processing {source_future.name}: {str(e)}") 102 | token_counts[source_future.name] = 0 103 | 104 | # Calculate relative sizes 105 | total_tokens = sum(token_counts.values()) 106 | 107 | if total_tokens == 0: 108 | raise Exception("Error processing config: no tokens found!") 109 | 110 | relative_sizes = {path: count / total_tokens for path, count in token_counts.items()} 111 | 112 | if use_cache: 113 | os.makedirs(os.path.dirname(cache_path), exist_ok=True) 114 | with open(cache_path, "w") as f: 115 | json.dump({"relative_sizes": relative_sizes, "total_tokens": total_tokens}, f) 116 | 117 | return (relative_sizes, total_tokens) 118 | 119 | 120 | def expand_globs( 121 | fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = None, sources: Optional[List[str]] = None 122 | ) -> List[str]: 123 | results = [] 124 | 125 | for source in sources or []: 126 | if is_url(source): 127 | results.extend(_expand_remote(source, fs)) 128 | else: 129 | results.extend(_expand_local(source)) 130 | 131 | # Filter the globs from the expanded list 132 | return [r for r in results if "*" not in r] 133 | 134 | 135 | def _expand_local(pattern: str) -> List[str]: 136 | """ 137 | Expand a local glob pattern. 138 | """ 139 | from glob import glob 140 | 141 | logger.info(f"Expanding '{pattern}'...") 142 | matches = sorted(glob(pattern, recursive=True)) 143 | 144 | if not matches: 145 | raise FileNotFoundError(pattern) 146 | 147 | return [normalize_path(match) for match in matches] 148 | 149 | 150 | def _expand_remote(pattern: str, fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]]) -> List[str]: 151 | """ 152 | Expand a remote glob pattern.
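For example, a hypothetical pattern like "s3://my-bucket/data/*.npy" is expanded with fs.glob and each match is returned with its original scheme re-applied.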
153 | """ 154 | if not fs: 155 | fs = s3fs.S3FileSystem() 156 | 157 | parsed = urlparse(pattern) 158 | logger.info(f"Expanding remote glob '{pattern}'...") 159 | 160 | if parsed.scheme == "s3": 161 | return [f"s3://{obj}" for obj in fs.glob(pattern)] 162 | elif parsed.scheme == "weka": 163 | return [f"weka://{obj}" for obj in fs.glob(pattern.replace("weka://", "s3://"))] 164 | elif parsed.scheme == "gs": 165 | return [f"gs://{obj}" for obj in fs.glob(pattern)] 166 | elif parsed.scheme == "r2": 167 | raise NotImplementedError("'r2' types are not currently supported") 168 | elif parsed.scheme in ("http", "https"): 169 | raise NotImplementedError("'http' types are not currently supported") 170 | elif parsed.scheme == "file": 171 | raise NotImplementedError("Remote 'file' types are not currently supported") 172 | else: 173 | raise NotImplementedError(f"Glob expansion is not currently supported for '{parsed.scheme}' files") 174 | 175 | 176 | def normalize_source_paths(sources: List[SourceConfig], expand: bool = False) -> List[SourceConfig]: 177 | """ 178 | Normalize the paths in a SourceConfig object. 179 | """ 180 | normalized = [] 181 | 182 | for source in sources: 183 | source_paths = [] 184 | schemes = set() 185 | 186 | for path in source.paths: 187 | if is_url(path): 188 | parsed = urlparse(path) 189 | schemes.add(parsed.scheme) 190 | if parsed.scheme == "s3": 191 | source_paths.append(path) 192 | elif parsed.scheme == "weka": 193 | source_paths.append(normalize_path(path.replace("weka://", "/weka/"))) 194 | elif parsed.scheme == "gs": 195 | source_paths.append(path) 196 | elif parsed.scheme == "r2": 197 | raise NotImplementedError("'r2' types are not currently supported") 198 | elif parsed.scheme in ("http", "https"): 199 | raise NotImplementedError("'http' types are not currently supported") 200 | else: 201 | raise OLMoEnvironmentError(f"Unsupported URL scheme: {parsed.scheme}") 202 | else: 203 | source_paths.append(normalize_path(path)) 204 | schemes.add("local") 205 | 206 | # Get filesystem if we're expanding globs and paths exist 207 | fs = None 208 | if expand and source_paths: 209 | scheme = next(iter(schemes)) if schemes else "local" 210 | fs = get_filesystem_for_scheme(scheme) 211 | 212 | normalized.append( 213 | SourceConfig( 214 | name=source.name, 215 | paths=expand_globs(fs=fs, sources=source_paths) if expand else source_paths, 216 | target_ratio=source.target_ratio, 217 | repetition_factor=source.repetition_factor, 218 | max_source_ratio=source.max_source_ratio, 219 | ) 220 | ) 221 | 222 | return normalized 223 | 224 | 225 | def get_filesystem_for_scheme(scheme: str): 226 | """ 227 | Get the appropriate filesystem for a given URL scheme. 
228 | 229 | Args: 230 | scheme: The URL scheme (e.g., 's3', 'gs', 'local', 'weka') 231 | 232 | Returns: 233 | The appropriate filesystem object for the scheme or None for local paths 234 | 235 | Raises: 236 | OLMoEnvironmentError: If the scheme is not supported or not configured correctly 237 | NotImplementedError: If the scheme is recognized but not currently supported 238 | """ 239 | if scheme in ("s3", "weka"): 240 | client_kwargs = {} 241 | profile_name = os.environ.get("AWS_PROFILE", None) 242 | 243 | if scheme == "weka": 244 | profile_name = "WEKA" 245 | client_kwargs["endpoint_url"] = os.environ.get("WEKA_ENDPOINT_URL") 246 | 247 | return s3fs.S3FileSystem(client_kwargs={**client_kwargs}, profile=profile_name) 248 | 249 | elif scheme == "gs": 250 | try: 251 | gs_project = os.environ.get("GOOGLE_CLOUD_PROJECT", None) 252 | 253 | if not gs_project: 254 | raise OLMoEnvironmentError("GOOGLE_CLOUD_PROJECT environment variable is not set!") 255 | 256 | try: 257 | return gcsfs.GCSFileSystem(token="google_default") 258 | except Exception as e: 259 | logger.warning( 260 | f"Failed to create GCS filesystem with default credentials: {str(e)}. Retrying with metadata server..." 261 | ) 262 | return gcsfs.GCSFileSystem() 263 | 264 | except Exception as e: 265 | raise OLMoEnvironmentError( 266 | f"Failed to create GCS filesystem: {str(e)}. Ensure GOOGLE_APPLICATION_CREDENTIALS_JSON and GOOGLE_CLOUD_PROJECT are set correctly." 267 | ) 268 | 269 | elif scheme in ("r2", "http", "https"): 270 | raise NotImplementedError(f"'{scheme}' scheme is not currently supported") 271 | 272 | elif scheme == "local": 273 | return None # No remote filesystem needed for local paths 274 | 275 | else: 276 | raise OLMoEnvironmentError(f"Unsupported URL scheme: {scheme}") 277 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/eval/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/remote/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/remote/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/remote/test_remote.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from unittest import TestCase 4 | 5 | from cookbook.remote.base import LocatedPath 6 | 7 | 8 | class TestLocatedPath(TestCase): 9 | def test_located_path_gcs(self): 10 | self.assertEqual(LocatedPath.from_str("gs://bucket/prefix"), LocatedPath(prot="gs", 
path="bucket/prefix")) 11 | self.assertEqual(LocatedPath.from_str("gcs://bucket/prefix"), LocatedPath(prot="gs", path="bucket/prefix")) 12 | self.assertEqual( 13 | LocatedPath.from_str("gs://bucket/more/prefix/"), LocatedPath(prot="gs", path="bucket/more/prefix/") 14 | ) 15 | 16 | def test_located_path_s3(self): 17 | self.assertEqual(LocatedPath.from_str("s3://bucket/prefix"), LocatedPath(prot="s3", path="bucket/prefix")) 18 | self.assertEqual( 19 | LocatedPath.from_str("s3n://bucket/prefix/"), LocatedPath(prot="s3", path="bucket/prefix/") 20 | ) 21 | 22 | def test_located_path_weka(self): 23 | self.assertEqual( 24 | LocatedPath.from_str("/weka/oe-training-default/prefix"), 25 | LocatedPath(prot="weka", path="/oe-training-default/prefix"), 26 | ) 27 | self.assertEqual( 28 | LocatedPath.from_str("/oe-training-default/prefix/"), 29 | LocatedPath(prot="weka", path="/oe-training-default/prefix/"), 30 | ) 31 | 32 | with self.assertRaises(ValueError): 33 | LocatedPath.from_str("weka://non-existent-bucket/prefix") 34 | 35 | def test_located_path_local(self): 36 | # Test absolute local paths 37 | self.assertEqual(LocatedPath.from_str("/home/user/data"), LocatedPath(prot="file", path="/home/user/data")) 38 | self.assertEqual(LocatedPath.from_str("/tmp/data/"), LocatedPath(prot="file", path="/tmp/data/")) 39 | 40 | # Test with Path objects 41 | self.assertEqual( 42 | LocatedPath.from_str(Path("/usr/local/bin")), LocatedPath(prot="file", path="/usr/local/bin") 43 | ) 44 | 45 | # Test single-level paths 46 | self.assertEqual(LocatedPath.from_str("/home"), LocatedPath(prot="file", path="/home")) 47 | 48 | def test_located_path_to_str(self): 49 | # Test conversion back to string 50 | path = LocatedPath(prot="file", path="home/user/data") 51 | self.assertEqual(path.local, Path("home/user/data")) 52 | 53 | path_with_trailing_slash = LocatedPath(prot="file", path="tmp/data/") 54 | self.assertEqual(path_with_trailing_slash.local, Path("tmp/data/")) 55 | 56 | def test_located_path_invalid(self): 57 | with self.assertRaises(ValueError): 58 | LocatedPath.from_str("azure://bucket/prefix") 59 | 60 | def test_local_command(self): 61 | with self.assertRaises(ValueError): 62 | LocatedPath.from_str("s3://bucket/prefix").local 63 | 64 | with self.assertRaises(ValueError): 65 | LocatedPath.from_str("gs://bucket/prefix").local 66 | 67 | self.assertEqual(LocatedPath.from_str("file://home/user/data").local, Path("/home/user/data")) 68 | self.assertEqual(LocatedPath.from_str("/home/user/data").local, Path("/home/user/data")) 69 | self.assertEqual( 70 | LocatedPath.from_str("weka://oe-data-default/prefix").local, Path("/oe-data-default/prefix") 71 | ) 72 | self.assertEqual(LocatedPath.from_str("/oe-data-default/prefix").local, Path("/oe-data-default/prefix")) 73 | self.assertEqual( 74 | LocatedPath.from_str("/weka/oe-training-default/prefix").local, Path("/oe-training-default/prefix") 75 | ) 76 | 77 | def test_remote_command(self): 78 | with self.assertRaises(ValueError): 79 | LocatedPath.from_str("file://home/user/data").remote 80 | 81 | with self.assertRaises(ValueError): 82 | LocatedPath.from_str("/home/user/data").remote 83 | 84 | self.assertEqual(LocatedPath.from_str("s3://bucket/prefix").remote, "s3://bucket/prefix") 85 | self.assertEqual(LocatedPath.from_str("gs://bucket/prefix").remote, "gs://bucket/prefix") 86 | self.assertEqual( 87 | LocatedPath.from_str("/oe-training-default/prefix").remote, "weka://oe-training-default/prefix" 88 | ) 89 | self.assertEqual( 90 | 
LocatedPath.from_str("/weka/oe-training-default/prefix").remote, "weka://oe-training-default/prefix" 91 | ) 92 | --------------------------------------------------------------------------------