├── .gitignore ├── LICENSE ├── README.md ├── docs ├── README.md └── how-to-tokenize.md ├── pyproject.toml ├── scripts ├── compare_data_configs.py ├── compare_wandb_configs.py ├── launch_ruler.sh └── summarize_data_mix.py ├── src └── cookbook │ ├── __init__.py │ ├── aliases.py │ ├── cli │ ├── cli.py │ ├── core.py │ ├── eval.py │ ├── pmr.py │ └── utils.py │ ├── constants.py │ ├── data │ ├── __init__.py │ ├── dataset.py │ └── mixes │ │ ├── dolmino100.txt │ │ ├── dolmino300.txt │ │ ├── dolmino50.txt │ │ ├── jallyrun100.txt │ │ ├── jallyrun50.txt │ │ └── stackexchange.txt │ ├── eval │ ├── cache.py │ ├── conversion.py │ ├── conversion_from_hf.py │ ├── datalake.py │ ├── evaluation.py │ ├── miniframe.py │ ├── named_tasks.py │ └── results.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── config.py │ └── evaluators.py │ ├── recipes │ ├── love2code │ │ ├── train-190M-1xC-love2code-weka-python-hlr-bpb-only.yaml │ │ ├── train-1b-5xC-love2code-starcoder1-weka-hlr.yaml │ │ ├── train-1b-5xC-love2code-starcoder1-weka.yaml │ │ ├── train-1b-5xC-love2code-weka-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python-no-prose-hlr.yaml │ │ ├── train-1b-5xC-love2code-weka-python.yaml │ │ ├── train-1b-5xC-love2code-weka-starcoder1-noprose.yaml │ │ └── train-1b-5xC-love2code-weka.yaml │ ├── olmo2 │ │ ├── anneal │ │ │ ├── train-1b-dclm-dolma2-anneal-10b.yaml │ │ │ ├── train-7b-code-dolma2-anneal-10b-augusta.yaml │ │ │ ├── train-7b-dclm-only-anneal-10b-control.yaml │ │ │ ├── train-7b-finemath3p-anneal-10b-50split-dclm.yaml │ │ │ ├── train-7b-finemath3p-anneal-10b-50split.yaml │ │ │ ├── train-7b-wiki-concat-anneal-10b-50split-dclm.yaml │ │ │ ├── train-7b-wiki-concat-anneal-10b-50split.yaml │ │ │ └── train-7b-wiki-concat-anneal-10b.yaml │ │ ├── train-1b-1xC-dclm.yaml │ │ ├── train-1b-5xC-dclm-dolma2-180k-wsd.yaml │ │ ├── train-1b-5xC-dclm-dolma2-180k.yaml │ │ ├── train-1b-5xC-dclm-dolma2-augusta.yaml │ │ ├── train-1b-5xC-dclm-dolma2-wsd.yaml │ │ ├── train-1b-5xC-dclm-dolma2.yaml │ │ ├── train-1b-5xC-dclm-superbpe-wsd.yaml │ │ ├── train-1b-5xC-dclm-superbpe.yaml │ │ ├── train-1b-5xC-olmo2-baseline.yaml │ │ ├── train-7b-1xC-dclm-dolma2-180k.yaml │ │ ├── train-7b-1xC-dclm-dolma2.yaml │ │ └── train-7b-1xC-dclm-superbpe.yaml │ ├── olmo3-evals │ │ └── README.md │ ├── olmo3-midtraining │ │ └── example-olmo2_7b-web-code-reasoning-microanneal.yaml │ ├── olmo3 │ │ └── pstar │ │ │ ├── mixes │ │ │ ├── dclm_natural.json │ │ │ ├── dclm_pstar_001.json │ │ │ ├── dclm_pstar_002.json │ │ │ └── dist-plot.py │ │ │ ├── train-1b-5xC-pstar-001-dclm-dolma2.yaml │ │ │ ├── train-1b-5xC-pstar-002-dclm-dolma2.yaml │ │ │ └── train-1b-5xC-pstar-natural-dclm-dolma2.yaml │ └── spring2code │ │ └── scaling │ │ ├── spring2code-190m-5xC-weka-python-only-bpb-hlr.yaml │ │ ├── spring2code-190m-5xC-weka-python-only-bpb-vhlr.yaml │ │ ├── spring2code-190m-5xC-weka-top15-bpb-hlr-superbpe.yaml │ │ ├── spring2code-190m-5xC-weka-top15-bpb-hlr.yaml │ │ ├── spring2code-1b-5xC-weka-python-only-bpb-hlr.yaml │ │ ├── spring2code-1b-5xC-weka-top15-bpb-hlr-superbpe.yaml │ │ └── spring2code-1b-5xC-weka-top15-bpb-hlr.yaml │ ├── remote │ ├── __init__.py │ ├── __main__.py │ ├── aws.py │ ├── base.py │ ├── gantry_launcher.py │ └── gcp.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── clusters.py │ ├── config.py │ └── data.py └── tests ├── __init__.py └── cookbook ├── __init__.py ├── eval ├── __init__.py └── test_miniframe.py └── remote ├── __init__.py └── test_remote.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # PyPI configuration file
171 | .pypirc
172 | 
173 | 
174 | # MacOS files
175 | .DS_Store
176 | 
177 | # vscode
178 | .vscode/
179 | 
180 | # temporary directory
181 | tmp/
182 | temp/
183 | uv.lock
184 | 
185 | 
186 | # ignore vscode workspace settings
187 | *.code-workspace
188 | 
-------------------------------------------------------------------------------- /docs/README.md: --------------------------------------------------------------------------------
1 | # Documentation
2 | 
3 | This directory contains guides on how to use the OLMo Cookbook.
4 | 
-------------------------------------------------------------------------------- /docs/how-to-tokenize.md: --------------------------------------------------------------------------------
1 | # How to Tokenize
2 | 
3 | This is a brief guide on how to tokenize data on EC2.
4 | We will use Poor Man Ray to create a new instance, install Dolma, and then SSH into the machine to tokenize the data.
5 | 
6 | ## Step 0: install Poor Man Ray
7 | 
8 | Clone the OLMo Cookbook repository and install it:
9 | 
10 | ```bash
11 | git clone https://github.com/allenai/olmo-cookbook.git
12 | cd olmo-cookbook
13 | pip install -e .
14 | ```
15 | 
16 | Ensure your AWS environment variables are set:
17 | ```bash
18 | export AWS_ACCESS_KEY_ID="[your key]"
19 | export AWS_SECRET_ACCESS_KEY="[your secret]"
20 | export AWS_DEFAULT_REGION="us-east-1"
21 | ```
22 | 
23 | ## Step 1: create a cluster
24 | 
25 | Create a cluster on EC2 where we will run tokenization; we will use one `i4i.32xlarge` instance.
26 | 
27 | ```bash
28 | cluster_name="YOUR_CLUSTER_NAME"
29 | poormanray create -n $cluster_name -t i4i.32xlarge --number 1
30 | ```
31 | 
32 | Then run two setup commands to set up storage and the toolkit:
33 | 
34 | ```bash
35 | poormanray setup-d2tk -n $cluster_name -d
36 | poormanray setup-dolma-python -n $cluster_name -d
37 | ```
38 | 
39 | The `-d` flag runs these commands in the background; they take a few minutes to finish. You can check the status of the first command by running
40 | 
41 | ```bash
42 | poormanray run -n $cluster_name -c 'ls'
43 | ```
44 | 
45 | and checking whether a `datamap-rs` directory exists; for the second, run
46 | 
47 | ```bash
48 | poormanray run -n $cluster_name -c 'uv run dolma'
49 | ```
50 | 
51 | and checking whether the `dolma` command is found.
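If you would rather not re-run these checks by hand, you can poll until setup completes. This is a minimal sketch (not part of the cookbook CLI itself); it assumes `poormanray run` prints the remote command's output to stdout:

```bash
# Poll the node until the datamap-rs directory created by setup-d2tk appears.
until poormanray run -n $cluster_name -c 'ls' | grep -q datamap-rs; do
    echo "setup still running, retrying in 60s..."
    sleep 60
done
```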
52 | 
53 | ## Step 2: Download data to node
54 | 
55 | Use `list` to get the IP of the machine:
56 | 
57 | ```bash
58 | >>> poormanray list -n $cluster_name
59 | 
60 | 
61 | Id: i-xxxxxxxxxxxxxxxxx
62 | Name: -0000
63 | Type: i4i.32xlarge
64 | State: running
65 | IP: xxx.yyy.zzz.ttt
66 | Status: 2/2
67 | Tags: {"Contact": "", "Name": "-0000", "Project": ""}
68 | ```
69 | 
70 | Now SSH into the machine and download the data using `s5cmd`. I recommend doing this inside a tmux session:
71 | 
72 | ```bash
73 | ssh ec2-user@xxx.yyy.zzz.ttt
74 | 
75 | s5cmd cp -sp \
76 |     "s3://ai2-llm/pretraining-data/sources/dataset-name/documents/*" \
77 |     "/mnt/raid0/ai2-llm/pretraining-data/sources/dataset-name/documents/"
78 | ```
79 | 
80 | Make sure to use `*` at the end of the source path and a trailing `/` at the end of the destination path.
81 | 
82 | ## Step 3: Tokenize the data
83 | 
84 | Now you can tokenize as follows:
85 | 
86 | ```bash
87 | tokenizer="allenai/dolma2-tokenizer"
88 | 
89 | uv run huggingface-cli download $tokenizer --local-dir /mnt/raid0/tokenizer
90 | 
91 | uv run dolma tokens \
92 |     --documents "/mnt/raid0/ai2-llm/pretraining-data/sources/dataset-name/documents/*" \
93 |     --destination "/mnt/raid0/ai2-llm/preprocessed/dataset-name/${tokenizer}" \
94 |     --tokenizer.name_or_path /mnt/raid0/tokenizer/tokenizer.json \
95 |     --tokenizer.eos_token_id 100257 \
96 |     --tokenizer.pad_token_id 100277 \
97 |     --no-tokenizer.segment_before_tokenization \
98 |     --tokenizer.encode_special_tokens \
99 |     --processes $(python3 -c "import multiprocessing; print(multiprocessing.cpu_count())") \
100 |     --max_size 4_000_000_000 \
101 |     --sample_ring_prop \
102 |     --dtype uint32
103 | ```
104 | 
105 | ## Step 4: Upload data to S3
106 | 
107 | Finish by uploading the data to S3:
108 | 
109 | ```bash
110 | s5cmd cp -sp \
111 |     "/mnt/raid0/ai2-llm/preprocessed/dataset-name/${tokenizer}/*" \
112 |     "s3://ai2-llm/preprocessed/dataset-name/${tokenizer}/"
113 | ```
114 | 
115 | Once the upload finishes, you are done with the machine.
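If you want to confirm the tokenized files actually landed in S3, listing the destination prefix is a cheap sanity check. A minimal sketch, assuming the same `$tokenizer` variable as above:

```bash
# List a few of the uploaded .npy files to spot-check the destination prefix.
s5cmd ls "s3://ai2-llm/preprocessed/dataset-name/${tokenizer}/*" | head
```

When everything looks right, terminate the cluster: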
116 | 117 | ```bash 118 | poormanray terminate -n $cluster_name 119 | ``` 120 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "olmo-cookbook" 7 | dynamic = ["version"] 8 | readme = "README.md" 9 | description = "" 10 | authors = [ 11 | { name = "Allen Institute for Artificial Intelligence", email = "oe-data-engineering@allenai.org" } 12 | ] 13 | requires-python = ">=3.10,<3.14" 14 | license = { file = "LICENSE" } 15 | dependencies = [ 16 | "boto3", 17 | "click", 18 | "requests", 19 | "platformdirs", 20 | "pydantic", 21 | "s3fs", 22 | "gcsfs", 23 | "rich", 24 | "smart_open", 25 | "yaspin", 26 | "PyYAML>=6.0,<7.0", 27 | "paramiko>=3.5,<3.6", 28 | "tabulate", 29 | "packaging>=24.2", 30 | "tqdm>=4.67.1", 31 | "huggingface-hub[hf-transfer]>=0.34,<0.35", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "ruff>=0.12.8", 37 | "boto3-stubs[essential,ec2,s3,ssm]", 38 | "google-api-python-client-stubs", 39 | ] 40 | beaker = [ 41 | "beaker-py>=1.17.1,<2", 42 | "GitPython>=3.0,<4.0", 43 | ] 44 | wandb = [ 45 | "wandb", 46 | ] 47 | checkpoints = [ 48 | "google-cloud-storage", 49 | "boto3" 50 | ] 51 | all = [ 52 | "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@7afdc3ed67f00b090aae11b5101ef147160274cc", #c779ca546cc3194e73e7491aaefcdffbed042c65", 53 | "beaker-py>=1.17.1,<2", 54 | "GitPython>=3.0,<4.0", 55 | "wandb", 56 | ] 57 | 58 | [project.scripts] 59 | olmo-cookbook = "cookbook.cli:cli.cli" 60 | olmo-cookbook-eval = "cookbook.cli:eval.cli" 61 | olmo-cookbook-core = "cookbook.cli:core.cli" 62 | poormanray = "cookbook.cli:pmr.cli" 63 | 64 | 65 | [tool.black] 66 | line-length = 115 67 | target-version = ['py39'] 68 | include = '\.pyi?$' 69 | exclude = ''' 70 | ( 71 | __pycache__ 72 | | \.git 73 | | \.mypy_cache 74 | | \.pytest_cache 75 | | \.vscode 76 | | \.venv 77 | | \bdist\b 78 | | \bdoc\b 79 | | scratch/ 80 | | build/ 81 | ) 82 | ''' 83 | 84 | [tool.isort] 85 | profile = "black" 86 | multi_line_output = 3 87 | 88 | [tool.ruff] 89 | line-length = 115 90 | 91 | [tool.ruff.lint] 92 | ignore = ["F403", "F405", "E501"] 93 | exclude = [ 94 | ".bzr", 95 | ".direnv", 96 | ".eggs", 97 | ".git", 98 | ".venv", 99 | "venv", 100 | ".mypy_cache", 101 | "__pycache__", 102 | ".nox", 103 | ".pants.d", 104 | ".pytype", 105 | ".ruff_cache", 106 | ".svn", 107 | ".tox", 108 | "__pypackages__", 109 | "_build", 110 | "buck-out", 111 | "build", 112 | "dist", 113 | "node_modules", 114 | "doc", 115 | "pretrain_data", 116 | "inference", 117 | ] 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "**/__init__.py" = ["F401"] 121 | 122 | [tool.pyright] 123 | reportPrivateImportUsage = false 124 | 125 | [tool.mypy] 126 | ignore_missing_imports = true 127 | no_site_packages = true 128 | check_untyped_defs = true 129 | disable_error_code = "has-type" 130 | 131 | [[tool.mypy.overrides]] 132 | module = "tests.*" 133 | strict_optional = false 134 | 135 | [tool.pytest.ini_options] 136 | testpaths = "tests/" 137 | python_classes = [ 138 | "Test*", 139 | "*Test", 140 | ] 141 | log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s" 142 | log_level = "DEBUG" 143 | log_cli = false 144 | log_cli_level = "DEBUG" 145 | filterwarnings = [ 146 | 'ignore::FutureWarning:huggingface_hub\.file_download', 147 | 
'ignore::DeprecationWarning:pkg_resources', 148 | 'ignore::DeprecationWarning:google\.rpc', 149 | 'ignore::FutureWarning:torch\.distributed\.checkpoint\.default_planner', 150 | ] 151 | -------------------------------------------------------------------------------- /scripts/compare_wandb_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Examples: 4 | Comparing Peteish7 to OLMoE 5 | - python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmoe/runs/rzsn9tlc 6 | 7 | Comparing Peteish7 to Amberish7 8 | - python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2 9 | 10 | 11 | """ 12 | 13 | import logging 14 | import os 15 | import re 16 | from collections import Counter 17 | 18 | import click 19 | import wandb 20 | from olmo_core.utils import flatten_dict, prepare_cli_environment 21 | from rich.console import Console 22 | from rich.panel import Panel 23 | from rich.table import Table 24 | from rich.text import Text 25 | 26 | log = logging.getLogger(__name__) 27 | run_path_re = re.compile(r"^[^/]+/[^/]+/[^/]+$") 28 | run_path_url = re.compile(r"^https?://wandb.ai/([^/]+)/([^/]+)/runs/([^/]+)") 29 | console = Console() 30 | 31 | 32 | def parse_run_path(run_path: str) -> str: 33 | """For convenience, we allow run paths as well as URLs.""" 34 | run_path = run_path.strip("/") 35 | if run_path_re.match(run_path): 36 | return run_path 37 | 38 | m = run_path_url.match(run_path) 39 | if m is not None: 40 | entity, project, run_id = m.groups() 41 | return f"{entity}/{project}/{run_id}" 42 | 43 | raise ValueError(f"Could not parse '{run_path}'") 44 | 45 | 46 | def display_differences_table(left_config, right_config, title): 47 | # Create exclusive keys tables 48 | left_only_keys = left_config.keys() - right_config.keys() 49 | if left_only_keys: 50 | left_table = Table(title="Settings only in left", title_style="bold cyan") 51 | left_table.add_column("Key", style="dim") 52 | left_table.add_column("Value", no_wrap=False) 53 | 54 | for k in sorted(left_only_keys): 55 | left_table.add_row(str(k), str(left_config[k])) 56 | console.print(left_table) 57 | 58 | right_only_keys = right_config.keys() - left_config.keys() 59 | if right_only_keys: 60 | right_table = Table(title="Settings only in right", title_style="bold magenta") 61 | right_table.add_column("Key", style="dim") 62 | right_table.add_column("Value", no_wrap=False) 63 | 64 | for k in sorted(right_only_keys): 65 | right_table.add_row(str(k), str(right_config[k])) 66 | console.print(right_table) 67 | 68 | # Create differences table 69 | keys_with_differences = { 70 | k for k in left_config.keys() & right_config.keys() if left_config[k] != right_config[k] 71 | } 72 | 73 | if keys_with_differences: 74 | diff_table = Table(title=f"Differences in {title}", title_style="bold yellow") 75 | diff_table.add_column("Parameter", style="dim") 76 | diff_table.add_column("Left Value", style="cyan") 77 | diff_table.add_column("Right Value", style="magenta") 78 | 79 | for k in sorted(keys_with_differences): 80 | diff_table.add_row(str(k), str(left_config[k]), str(right_config[k])) 81 | console.print(diff_table) 82 | elif not (left_only_keys or right_only_keys): 83 | console.print(Panel(f"No differences found in {title}", style="green")) 84 | 85 | 86 | def display_data_differences(left_data_paths, right_data_paths): 87 | left_table = Table(title="Data Paths for Left 
Config", title_style="bold cyan", show_header=True)
88 |     left_table.add_column("Path")
89 |     left_table.add_column("Count", justify="right")
90 | 
91 |     for path, count in left_data_paths.items():
92 |         left_table.add_row(str(path), str(count))
93 | 
94 |     right_table = Table(title="Data Paths for Right Config", title_style="bold magenta", show_header=True)
95 |     right_table.add_column("Path")
96 |     right_table.add_column("Count", justify="right")
97 | 
98 |     for path, count in right_data_paths.items():
99 |         right_table.add_row(str(path), str(count))
100 | 
101 |     console.print(left_table)
102 |     console.print(right_table)
103 | 
104 | 
105 | @click.command()
106 | @click.argument(
107 |     "left_run_path",
108 |     type=str,
109 | )
110 | @click.argument(
111 |     "right_run_path",
112 |     type=str,
113 | )
114 | @click.option(
115 |     "--diff-datasets",
116 |     is_flag=True,
117 |     default=False,
118 |     help="Whether to compare dataset differences between runs",
119 | )
120 | def main(
121 |     left_run_path: str,
122 |     right_run_path: str,
123 |     diff_datasets: bool,
124 | ):
125 |     api = wandb.Api()
126 |     left_run = api.run(parse_run_path(left_run_path))
127 |     right_run = api.run(parse_run_path(right_run_path))
128 | 
129 |     left_config_raw = left_run._attrs["rawconfig"]
130 |     right_config_raw = right_run._attrs["rawconfig"]
131 | 
132 |     # flattening the dict will make diffs easier
133 |     left_config = flatten_dict(left_config_raw)
134 |     right_config = flatten_dict(right_config_raw)
135 | 
136 |     # Handle dataset paths conditionally based on diff_datasets flag
137 |     left_data_paths = Counter()
138 |     right_data_paths = Counter()
139 |     if diff_datasets and "dataset.paths" in left_config:
140 |         left_data_paths = Counter([os.path.dirname(path) for path in left_config["dataset.paths"]])
141 |         del left_config["dataset.paths"]
142 |     elif "dataset.paths" in left_config:
143 |         del left_config["dataset.paths"]
144 | 
145 |     if diff_datasets and "dataset.paths" in right_config:
146 |         right_data_paths = Counter([os.path.dirname(path) for path in right_config["dataset.paths"]])
147 |         del right_config["dataset.paths"]
148 |     elif "dataset.paths" in right_config:
149 |         del right_config["dataset.paths"]
150 | 
151 |     # Handle source_mixture_config in the same way
152 |     if "dataset.source_mixture_config.source_configs" in left_config:
153 |         source_configs = left_config["dataset.source_mixture_config.source_configs"]
154 |         if diff_datasets:
155 |             for config in source_configs:
156 |                 if isinstance(config, dict) and "paths" in config:
157 |                     paths = config["paths"]
158 |                     for path in paths:
159 |                         left_data_paths[os.path.dirname(path)] += 1
160 | 
161 |         for config in source_configs:
162 |             if isinstance(config, dict) and "paths" in config:
163 |                 del config["paths"]
164 | 
165 |         left_config["dataset.source_mixture_config.source_configs"] = source_configs
166 | 
167 |     if "dataset.source_mixture_config.source_configs" in right_config:
168 |         source_configs = right_config["dataset.source_mixture_config.source_configs"]
169 |         if diff_datasets:
170 |             for config in source_configs:
171 |                 if isinstance(config, dict) and "paths" in config:
172 |                     paths = config["paths"]
173 |                     for path in paths:
174 |                         right_data_paths[os.path.dirname(path)] += 1
175 | 
176 |         for config in source_configs:
177 |             if isinstance(config, dict) and "paths" in config:
178 |                 del config["paths"]
179 | 
180 |         right_config["dataset.source_mixture_config.source_configs"] = source_configs  # keep in sync with the left-hand handling above
181 | 
182 |     # Display header with run information
183 |     console.print()
184 |     console.rule("[bold]Config differences between runs[/bold]")
185 |     console.print(f"Left: [cyan]{left_run_path}[/cyan]")
186 |     console.print(f"Right: [magenta]{right_run_path}[/magenta]")
187 |     console.print()
188 | 
189 |     # Display parameter differences
190 |     console.rule("[bold]Parameter Differences[/bold]")
191 |     display_differences_table(left_config, right_config, "parameters")
192 |     console.print()
193 | 
194 |     # Display data differences only if diff_datasets is enabled
195 |     if diff_datasets:
196 |         console.rule("[bold]Data Differences[/bold]")
197 |         display_data_differences(left_data_paths, right_data_paths)
198 |         console.print()
199 | 
200 | 
201 | if __name__ == "__main__":
202 |     prepare_cli_environment()
203 |     main()
204 | 
-------------------------------------------------------------------------------- /scripts/launch_ruler.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ -z "$1" ]; then
4 |     echo "Usage: $0 <model_path>"
5 |     exit 1
6 | fi
7 | 
8 | 
9 | # Check if olmo-cookbook-eval command is available
10 | if command -v olmo-cookbook-eval &> /dev/null; then
11 |     eval_command="olmo-cookbook-eval"
12 | elif command -v uv &> /dev/null && uv run olmo-cookbook-eval --help &> /dev/null; then
13 |     eval_command="uv run olmo-cookbook-eval"
14 | else
15 |     echo "Error: olmo-cookbook-eval command not found. Please install it or ensure uv is available."
16 |     exit 1
17 | fi
18 | 
19 | 
20 | model_path="$1"
21 | base_command="${eval_command} evaluate \"${model_path}\" --priority urgent --cluster ai2/jupiter-cirrascale-2 --num-gpus 1 --model-backend vllm --dashboard peteish-LC-ruler --budget ai2/oe-base --model-args \"trust_remote_code=true, chat_model=null, max_length=65536\" --task-args \"use_chat_format=false\" --vllm-use-v1-spec --workspace ai2/long-contexts --beaker-image amandab/lc-only-adjust-rope-global-layers"
22 | 
23 | 
24 | echo "Launching task: ruler:4k"
25 | eval "${base_command} --tasks ruler:4k -j 2"
26 | 
27 | echo "Launching task: ruler:8k"
28 | eval "${base_command} --tasks ruler:8k -j 2"
29 | 
30 | echo "Launching task: ruler:16k"
31 | eval "${base_command} --tasks ruler:16k -j 2"
32 | 
33 | echo "Launching task: ruler:32k"
34 | eval "${base_command} --tasks ruler:32k -j 2"
35 | 
36 | echo "Launching task: ruler:64k"
37 | eval "${base_command} --tasks ruler:64k -j 2"
38 | 
39 | wait
40 | 
-------------------------------------------------------------------------------- /scripts/summarize_data_mix.py: --------------------------------------------------------------------------------
1 | """
2 | 
3 | Examples:
4 |     Peteish7  python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39
5 |     OLMoE     python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmoe/runs/rzsn9tlc
6 |     Amberish  python scripts/summarize_data_mix.py https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2
7 | 
8 | """
9 | 
10 | import logging
11 | import os
12 | import re
13 | from collections import Counter
14 | from typing import Any, Dict, MutableMapping
15 | 
16 | import click
17 | 
18 | log = logging.getLogger(__name__)
19 | log.setLevel(logging.INFO)
20 | console_handler = logging.StreamHandler()
21 | console_handler.setLevel(logging.INFO)
22 | formatter = logging.Formatter("%(message)s")  # Simple formatter to just show the message
23 | console_handler.setFormatter(formatter)
24 | log.addHandler(console_handler)
25 | 
26 | 
27 | run_path_re = re.compile(r"^[^/]+/[^/]+/[^/]+$")
28 | run_path_url = re.compile(r"^https?://wandb.ai/([^/]+)/([^/]+)/runs/([^/]+)")
29 | 
30 | 
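# A quick illustration of what flatten_dict (defined below) produces; the input
# values here are hypothetical, not taken from a real run config:
#
#     flatten_dict({"optimizer": {"lr": 0.001}, "data": {"paths": ["a", "b"]}})
#     # -> {"optimizer.lr": 0.001, "data.paths": ["a", "b"]}
#     # With include_lists=True, lists are expanded as well:
#     # -> {"optimizer.lr": 0.001, "data.paths.0": "a", "data.paths.1": "b"}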
31 | def flatten_dict(dictionary, parent_key="", separator=".", include_lists=False):
32 |     """
33 |     Flatten a nested dictionary into a single-level dictionary.
34 | 
35 |     Args:
36 |         dictionary (dict): The nested dictionary to be flattened.
37 |         parent_key (str, optional): The parent key to be prepended to the keys of the flattened dictionary. Defaults to "".
38 |         separator (str, optional): The separator to be used between the parent key and the keys of the flattened dictionary. Defaults to ".".
39 |         include_lists (bool, optional): Whether to convert lists to dictionaries with integer keys. Defaults to False.
40 | 
41 |     Returns:
42 |         dict: The flattened dictionary.
43 | 
44 |     """
45 |     d: Dict[str, Any] = {}
46 |     for key, value in dictionary.items():
47 |         new_key = parent_key + separator + key if parent_key else key
48 |         # convert lists to dicts with integer keys
49 |         if isinstance(value, list) and include_lists:
50 |             value = {f"{i}": v for i, v in enumerate(value)}
51 |         if isinstance(value, MutableMapping):
52 |             d.update(**flatten_dict(value, new_key, separator=separator, include_lists=include_lists))
53 |         else:
54 |             d[new_key] = value
55 |     return d
56 | 
57 | 
58 | def parse_run_path(run_path: str) -> str:
59 |     """For convenience, we allow run paths as well as URLs."""
60 |     run_path = run_path.strip("/")
61 |     if run_path_re.match(run_path):
62 |         return run_path
63 | 
64 |     m = run_path_url.match(run_path)
65 |     if m is not None:
66 |         entity, project, run_id = m.groups()
67 |         return f"{entity}/{project}/{run_id}"
68 | 
69 |     raise ValueError(f"Could not parse '{run_path}'")
70 | 
71 | 
72 | def format_counter_paths(data_paths, log):
73 |     """
74 |     Format a Counter containing file paths into a readable log output.
75 |     Shows full paths with aligned counts and percentages.
76 | 
77 |     Args:
78 |         data_paths (Counter): Counter object containing path counts
79 |         log (logging.Logger): Logger instance to output the formatted results
80 |     """
81 |     if not data_paths:
82 |         log.info("Counter is empty")
83 |         return
84 | 
85 |     # Find the largest count for padding
86 |     max_count_width = len(str(max(data_paths.values())))
87 | 
88 |     # Sort by count in descending order
89 |     sorted_items = data_paths.most_common()
90 |     total_count = sum(data_paths.values())
91 | 
92 |     log.info(f"Total entries: {total_count}")
93 |     log.info("-" * 120)  # Made longer to accommodate full paths
94 | 
95 |     for path, count in sorted_items:
96 |         # Format the percentage
97 |         percentage = (count / total_count) * 100
98 | 
99 |         # Create the formatted string with aligned counts and percentages
100 |         formatted_line = f"{count:>{max_count_width},d} items ({percentage:5.1f}%) | {path}"
101 | 
102 |         log.info(formatted_line)
103 | 
104 |     log.info("-" * 120)  # Made longer to accommodate full paths
105 | 
106 | 
107 | @click.command()
108 | @click.argument(
109 |     "run_path",
110 |     type=str,
111 | )
112 | def main(run_path: str):
113 |     import wandb
114 | 
115 |     api = wandb.Api()
116 |     run = api.run(parse_run_path(run_path))
117 | 
118 |     config_raw = run._attrs["rawconfig"]
119 | 
120 |     # flattening the dict will make diffs easier
121 |     config = flatten_dict(config_raw)
122 | 
123 |     # first, data.paths can be grouped and counted.
124 | data_paths = Counter([os.path.dirname(path) for path in config["data.paths"]]) 125 | 126 | format_counter_paths(data_paths, log) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /src/cookbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/__init__.py -------------------------------------------------------------------------------- /src/cookbook/aliases.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from os import PathLike 3 | from pathlib import Path 4 | from typing import Any, List, Optional, Union 5 | 6 | from olmo_core.data.types import NumpyDatasetDType 7 | from olmo_core.launch.beaker import BeakerLaunchConfig 8 | from olmo_core.train.common import Duration 9 | from pydantic import BaseModel, field_validator 10 | 11 | from cookbook.model.config import ModelConfigIdentifier 12 | from cookbook.model.evaluators import DownstreamEvaluator 13 | 14 | DownstreamEvaluatorType = Union[str, DownstreamEvaluator] 15 | PathType = Union[Path, PathLike[Any], str] 16 | 17 | try: 18 | from beaker import Priority # pyright: ignore 19 | except ImportError: 20 | Priority = str 21 | 22 | 23 | class SourceConfig(BaseModel): 24 | name: str 25 | paths: list[str] 26 | target_ratio: Optional[float] = None 27 | repetition_factor: float = 1.0 28 | max_source_ratio: float = 1.0 29 | 30 | 31 | class SourceInstance(BaseModel): 32 | name: str 33 | paths: list[str] 34 | ratio: float 35 | repetition_factor: float = 1.0 36 | 37 | 38 | class DatasetConfig(BaseModel): 39 | sources: list[SourceConfig] 40 | dtype: NumpyDatasetDType = NumpyDatasetDType.uint32 41 | processes: int = 16 42 | seed: int = 42 43 | 44 | 45 | class MetricBackend(Enum): 46 | wandb = "wandb" 47 | comet = "comet" 48 | 49 | 50 | class MetricsConfig(BaseModel): 51 | project: str = "olmo-cookbook" 52 | workspace: str = "ai2" 53 | entity: str = "ai2-llm" 54 | backends: list[MetricBackend] = [MetricBackend.wandb] 55 | 56 | 57 | class SchedulerType(Enum): 58 | COSINE = "cosine" 59 | COS_LINEAR = "cos_linear" 60 | LINEAR = "linear" 61 | WSD = "wsd" 62 | 63 | @classmethod 64 | def values(cls): 65 | return [e.value for e in cls] 66 | 67 | @classmethod 68 | def keys(cls): 69 | return [e.name for e in cls] 70 | 71 | 72 | class AnnealConfig(BaseModel): 73 | enabled: bool = True 74 | initial_lr: Optional[float] = None 75 | 76 | 77 | class ExperimentConfig(BaseModel, extra="forbid"): 78 | name: str 79 | description: str 80 | budget: str 81 | workspace: str 82 | nodes: int 83 | gpus: int 84 | max_tokens: int 85 | sequence_length: int 86 | seed: int 87 | cluster: str 88 | tokenizer: str 89 | priority: Priority # pyright: ignore 90 | dataset: DatasetConfig 91 | model: ModelConfigIdentifier 92 | load_path: Optional[str] = None 93 | load_state: bool = True 94 | annealing: Optional[AnnealConfig] = None 95 | nccl_debug: bool = False 96 | activation_checkpointing: bool = False 97 | model_overrides: Optional[List[str]] = None 98 | scheduler_type: SchedulerType = SchedulerType.COS_LINEAR 99 | hard_stop: Optional[Duration] = None 100 | rank_microbatch_size: Optional[int] = None 101 | learning_rate: Optional[float] = None 102 | global_batch_size: Optional[int] = None 103 | lm_evaluator: bool = False 104 | downstream_evaluators: list[DownstreamEvaluatorType] 
= [] # type: ignore 105 | max_target_sequence_length: int = 8192 106 | metrics_config: Optional[MetricsConfig] = MetricsConfig() 107 | preemptible: bool = True 108 | shared_filesystem: bool = False 109 | weka: bool = False 110 | eval_interval: int = 200 111 | save_interval: int = 1000 112 | warmup_steps: Optional[int] = None 113 | path: Path 114 | 115 | @field_validator("model", mode="before") 116 | @classmethod 117 | def validate_model(cls, value): 118 | """Convert string to ModelConfigIdentifier if needed.""" 119 | if isinstance(value, str): 120 | return ModelConfigIdentifier(value) 121 | return value 122 | 123 | @field_validator("annealing") 124 | @classmethod 125 | def validate_annealing(cls, value, info): 126 | """Validate that if annealing is True, then load_path must not be None.""" 127 | if value is not None and info.data.get("load_path") is None: 128 | raise ValueError("If annealing is enabled, load_path must be specified.") 129 | return value 130 | 131 | 132 | class ExperimentInstance(BaseModel): 133 | name: str 134 | sources: list[SourceInstance] 135 | 136 | 137 | class ExperimentGroup(BaseModel): 138 | config: ExperimentConfig 139 | group_id: str 140 | instances: list[ExperimentInstance] 141 | 142 | 143 | class LaunchGroup(BaseModel): 144 | instances: list[BeakerLaunchConfig] 145 | 146 | 147 | def validate_sources(sources: list[SourceConfig]): 148 | """Validate a list of source configurations.""" 149 | target_ratio_present = any(source.target_ratio is not None for source in sources) 150 | 151 | for source in sources: 152 | if target_ratio_present and source.target_ratio is None: 153 | raise ValueError("If any source has target_ratio set, all sources must have target_ratio set.") 154 | -------------------------------------------------------------------------------- /src/cookbook/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/data/__init__.py -------------------------------------------------------------------------------- /src/cookbook/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import List, Union 4 | from urllib.parse import urlparse 5 | 6 | import gcsfs 7 | import s3fs 8 | from olmo_core.data.source_mixture import ( 9 | SourceMixtureConfig, 10 | SourceMixtureDatasetConfig, 11 | ) 12 | from olmo_core.data.types import NumpyDatasetDType 13 | 14 | from cookbook.aliases import SourceInstance 15 | from cookbook.utils.data import expand_globs 16 | 17 | 18 | @dataclass 19 | class MixtureBuilder: 20 | sources: List[SourceInstance] 21 | max_tokens: int 22 | sequence_length: int 23 | seed: int 24 | dtype: NumpyDatasetDType 25 | processes: int = 1 26 | cached_fs: dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = field( 27 | default_factory=lambda: dict( 28 | s3=s3fs.S3FileSystem(), 29 | weka=s3fs.S3FileSystem( 30 | client_kwargs={"endpoint_url": os.environ["WEKA_ENDPOINT_URL"]}, profile="WEKA" 31 | ), 32 | gs=gcsfs.GCSFileSystem(), 33 | ) 34 | ) 35 | 36 | def build(self) -> SourceMixtureDatasetConfig: 37 | source_configs: List[SourceMixtureConfig] = [] 38 | for source in self.sources: 39 | globs = [path for path in source.paths if "*" in path] 40 | paths = [path for path in source.paths if path not in globs] 41 | 42 | # Check if all paths have the same URL scheme 43 | schemes = 
{urlparse(path).scheme for path in paths + globs} 44 | if len(schemes) > 1: 45 | raise ValueError(f"All paths for source {source.name} must have the same scheme. Found: {schemes}") 46 | elif len(schemes) == 0: 47 | raise ValueError(f"No paths found for source {source.name}") 48 | 49 | scheme = schemes.pop() 50 | 51 | expanded = paths + expand_globs(self.cached_fs.get(scheme, self.cached_fs["s3"]), globs) 52 | 53 | if len(expanded) == 0: 54 | raise ValueError(f"No paths found for source {source.name}") 55 | 56 | source_configs.append( 57 | SourceMixtureConfig( 58 | source_name=source.name, 59 | paths=expanded, 60 | target_ratio=source.ratio, 61 | max_repetition_ratio=source.repetition_factor, 62 | ) 63 | ) 64 | 65 | return SourceMixtureDatasetConfig( 66 | source_configs=source_configs, 67 | max_tokens=self.max_tokens, 68 | sequence_length=self.sequence_length, 69 | seed=self.seed, 70 | dtype=self.dtype, 71 | processes=self.processes, 72 | ) 73 | -------------------------------------------------------------------------------- /src/cookbook/data/mixes/stackexchange.txt: -------------------------------------------------------------------------------- 1 | #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) 2 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy 3 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy 4 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy 5 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy 6 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy 7 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy 8 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy 9 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy 10 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy 11 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy 12 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy 13 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy 14 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy 15 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy 16 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy 17 | s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy 18 | -------------------------------------------------------------------------------- /src/cookbook/eval/cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | import shutil 5 | from dataclasses import dataclass 6 | from typing import Generic, TypeVar 7 | 8 | import smart_open 9 | from platformdirs import user_cache_dir 10 | 11 | T = TypeVar("T") 12 | V = TypeVar("V") 13 | 14 | 15 | @dataclass(frozen=True) 16 | class DatalakeCacheResult(Generic[T]): 17 | success: bool 18 | value: T | None 19 | 20 | 21 | # Singleton instance storage 22 | _DATALAKE_CACHE_INSTANCE = None 23 | 24 | 25 | @dataclass 26 | class 
DatalakeCache(Generic[T]): 27 | cache_dir: str 28 | invalidate: bool 29 | do_not_cache: bool 30 | 31 | def __init__(self, invalidate: bool = False, do_not_cache: bool = False): 32 | self.invalidate = ( 33 | invalidate 34 | if invalidate is not False 35 | else (os.environ.get("DATALAKE_CACHE_INVALIDATE", "false").lower() == "true") 36 | ) 37 | 38 | self.do_not_cache = ( 39 | do_not_cache 40 | if do_not_cache is not False 41 | else (os.environ.get("DATALAKE_DO_NOT_CACHE", "false").lower() == "true") 42 | ) 43 | 44 | # Set cache_dir 45 | self.cache_dir = user_cache_dir("datalake", "olmo-cookbook") 46 | 47 | if self.invalidate and os.path.exists(self.cache_dir): 48 | shutil.rmtree(self.cache_dir, ignore_errors=True) 49 | 50 | # Check if path exists but is a file instead of a directory 51 | if os.path.exists(self.cache_dir) and not os.path.isdir(self.cache_dir): 52 | try: 53 | os.remove(self.cache_dir) 54 | except FileNotFoundError: 55 | pass 56 | 57 | if not os.path.exists(self.cache_dir): 58 | os.makedirs(self.cache_dir, exist_ok=True) 59 | 60 | def _make_cache_path(self, **kwargs) -> str: 61 | cache_key = hashlib.sha256(json.dumps(kwargs).encode()).hexdigest() 62 | return os.path.join(self.cache_dir, f"{cache_key}.json.gz") 63 | 64 | def get(self, **kwargs) -> DatalakeCacheResult[T]: 65 | if self.do_not_cache: 66 | return DatalakeCacheResult(success=False, value=None) 67 | 68 | if os.path.exists(cache_file := self._make_cache_path(**kwargs)) and not self.invalidate: 69 | with smart_open.open(cache_file, "rt", encoding="utf-8") as f: 70 | return DatalakeCacheResult(success=True, value=json.load(f)) 71 | 72 | return DatalakeCacheResult(success=False, value=None) 73 | 74 | def set(self, value: T, **kwargs) -> DatalakeCacheResult[T]: 75 | if self.do_not_cache: 76 | return DatalakeCacheResult(success=False, value=None) 77 | 78 | if not os.path.exists(cache_file := self._make_cache_path(**kwargs)) or self.invalidate: 79 | with smart_open.open(cache_file, "wt", encoding="utf-8") as f: 80 | json.dump(value, f) 81 | 82 | return DatalakeCacheResult(success=True, value=value) 83 | 84 | def delete(self, **kwargs) -> None: 85 | if os.path.exists(cache_file := self._make_cache_path(**kwargs)): 86 | os.remove(cache_file) 87 | 88 | 89 | def get_datalake_cache(invalidate: bool = False, do_not_cache: bool = False) -> DatalakeCache: 90 | """Get or create a singleton instance of DatalakeCache.""" 91 | global _DATALAKE_CACHE_INSTANCE 92 | 93 | if _DATALAKE_CACHE_INSTANCE is None: 94 | kwargs = {} 95 | if invalidate is not None: 96 | kwargs["invalidate"] = invalidate 97 | if do_not_cache is not None: 98 | kwargs["do_not_cache"] = do_not_cache 99 | _DATALAKE_CACHE_INSTANCE = DatalakeCache(**kwargs) 100 | 101 | return _DATALAKE_CACHE_INSTANCE 102 | -------------------------------------------------------------------------------- /src/cookbook/eval/conversion_from_hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import shutil 4 | import subprocess 5 | from typing import Optional 6 | 7 | from cookbook.cli.utils import ( 8 | PythonEnv, 9 | add_secret_to_beaker_workspace, 10 | discover_weka_mount, 11 | install_beaker_py, 12 | install_olmo_core, 13 | install_transformers, 14 | make_destination_dir, 15 | remove_conflicting_packages, 16 | ) 17 | from cookbook.constants import ( 18 | OLMO_CORE_CONVERT_FROM_HF_SCRIPT, 19 | OLMO_CORE_V2_COMMIT_HASH, 20 | TRANSFORMERS_COMMIT_HASH, 21 | ) 22 | from cookbook.utils.clusters import get_matching_clusters 23 | 
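# Hypothetical local invocation (the path below is illustrative; every other
# argument falls back to the defaults declared in the signature):
#
#     convert_hf_to_olmo_core_v2(input_dir="/weka/checkpoints/my-hf-model")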
24 | 25 | def convert_hf_to_olmo_core_v2( 26 | input_dir: str, 27 | output_dir: Optional[str] = None, 28 | output_suffix: str = "olmo_core", 29 | olmo_core_v2_commit_hash: str = OLMO_CORE_V2_COMMIT_HASH, 30 | olmo_core_v2_experiment_json_path: Optional[str] = None, 31 | olmo_core_v2_model_arch: Optional[str] = None, 32 | olmo_core_v2_tokenizer: Optional[str] = None, 33 | transformers_git_url: Optional[str] = None, 34 | transformers_commit_hash: str = TRANSFORMERS_COMMIT_HASH, 35 | transformers_model_id: Optional[str] = None, 36 | transformers_revision: str = "main", 37 | skip_validation: bool = False, 38 | debug_validation: bool = False, 39 | device: Optional[str] = None, 40 | env: Optional[PythonEnv] = None, 41 | ): 42 | env = env or PythonEnv.null() 43 | 44 | directories_to_clean_up = [] 45 | 46 | output_dir = make_destination_dir(input_dir, output_suffix, output_dir) 47 | 48 | try: 49 | print("Starting conversion of HF model...") 50 | 51 | olmo_code_dir = install_olmo_core(env=env, commit_hash=olmo_core_v2_commit_hash) 52 | directories_to_clean_up.append(olmo_code_dir) 53 | 54 | huggingface_code_dir = install_transformers(transformers_commit_hash, env, git_url=transformers_git_url) 55 | directories_to_clean_up.append(huggingface_code_dir) 56 | 57 | print("Converting Huggingface weights to OLMo core V2 format...") 58 | os.makedirs(output_dir, exist_ok=True) 59 | cmd = [ 60 | env.python, 61 | OLMO_CORE_CONVERT_FROM_HF_SCRIPT, 62 | f"--checkpoint-input-path {input_dir}", 63 | f"--output-dir {output_dir}", 64 | f"--revision {transformers_revision}", 65 | (f"--config-path {olmo_core_v2_experiment_json_path}" if olmo_core_v2_experiment_json_path else ""), 66 | (f"--model-arch {olmo_core_v2_model_arch}" if olmo_core_v2_model_arch else ""), 67 | (f"--tokenizer {olmo_core_v2_tokenizer}" if olmo_core_v2_tokenizer else ""), 68 | (f"--model-id {transformers_model_id}" if transformers_model_id else ""), 69 | (f"--device {device}" if device else ""), 70 | ("--skip-validation" if skip_validation else ""), 71 | ("--debug" if debug_validation else ""), 72 | ] 73 | print(f"Running command: {' '.join(cmd)} from commit hash: {olmo_core_v2_commit_hash}") 74 | 75 | try: 76 | subprocess.run( 77 | shlex.split(" ".join(cmd)), 78 | check=True, 79 | cwd=olmo_code_dir, 80 | env=env.path(), 81 | capture_output=True, 82 | text=True, 83 | ) 84 | except subprocess.CalledProcessError as e: 85 | raise RuntimeError(f"Conversion failed with output: \n{e.output}\nStderr: \n{e.stderr}") from e 86 | 87 | print(f"Completed conversion of HF model. 
OLMo core v2 model at {output_dir}.")
88 | 
89 |     finally:
90 |         for directory in directories_to_clean_up:
91 |             print(f"Cleaning up {directory}...")
92 |             shutil.rmtree(directory, ignore_errors=True)
93 | 
94 | 
95 | def run_checkpoint_conversion_from_hf(
96 |     beaker_allow_dirty: bool,
97 |     beaker_budget: str,
98 |     beaker_cluster: str,
99 |     beaker_dry_run: bool,
100 |     beaker_gpus: int,
101 |     beaker_priority: str,
102 |     beaker_workspace: str,
103 |     beaker_preemptible: bool,
104 |     huggingface_token: Optional[str],
105 |     input_dir: str,
106 |     output_dir: Optional[str],
107 |     output_suffix: str,
108 |     olmo_core_v2_commit_hash: str,
109 |     olmo_core_v2_experiment_json_path: Optional[str],
110 |     olmo_core_v2_model_arch: Optional[str],
111 |     olmo_core_v2_tokenizer: Optional[str],
112 |     huggingface_transformers_git_url: Optional[str],
113 |     huggingface_transformers_commit_hash: str,
114 |     huggingface_transformers_model_id: Optional[str],
115 |     huggingface_transformers_revision: str,
116 |     use_beaker: bool,
117 |     use_system_python: bool,
118 |     python_venv_name: str,
119 |     python_venv_force: bool,
120 |     skip_validation: bool,
121 |     debug_validation: bool,
122 |     torch_device: Optional[str],
123 | ):
124 |     env = (
125 |         PythonEnv.create(name=python_venv_name, force=python_venv_force)
126 |         if not use_system_python
127 |         else PythonEnv.null()
128 |     )
129 | 
130 |     if use_beaker:
131 |         print("Installing beaker and gantry clients...")
132 |         install_beaker_py(env=env)
133 | 
134 |         assert input_dir.startswith("/"), "Input directory must be fully specified"
135 |         if output_dir:
136 |             assert output_dir.startswith("/"), "Output directory must be fully specified"
137 |         if olmo_core_v2_experiment_json_path:
138 |             assert olmo_core_v2_experiment_json_path.startswith("/"), "Experiment JSON path must be fully specified"
139 | 
140 |         weka_mounts = [
141 |             mount
142 |             for mount in (
143 |                 discover_weka_mount(input_dir),
144 |                 discover_weka_mount(output_dir),
145 |             )
146 |             if mount is not None
147 |         ]
148 | 
149 |         gantry_flags = []
150 | 
151 |         for weka_path in set(weka_mounts):
152 |             gantry_flags.append(f"--weka {weka_path}:/{weka_path}")
153 | 
154 |         if huggingface_token is not None:
155 |             secret_name = add_secret_to_beaker_workspace(
156 |                 secret_name="HF_TOKEN",
157 |                 secret_value=huggingface_token,
158 |                 workspace=beaker_workspace,
159 |                 env=env,  # type: ignore
160 |             )
161 |             if secret_name:
162 |                 gantry_flags.append(f"--env-secret HF_TOKEN={secret_name}")
163 | 
164 |         for cluster in set(get_matching_clusters(beaker_cluster)):
165 |             gantry_flags.append(f"--cluster {cluster}")
166 | 
167 |         remote_command = [
168 |             "pip install uv && uv pip install . --system &&",
169 |             "olmo-cookbook-eval convert-from-hf",
170 |             f"{input_dir}",
171 |             (f"--output-dir {output_dir}" if output_dir else ""),
172 |             f"--output-suffix {output_suffix}",
173 |             f"--olmo-core-v2-commit-hash {olmo_core_v2_commit_hash}",
174 |             (
175 |                 f"--olmo-core-v2-experiment-json-path {olmo_core_v2_experiment_json_path}"
176 |                 if olmo_core_v2_experiment_json_path
177 |                 else ""
178 |             ),
179 |             (f"--olmo-core-v2-model-arch {olmo_core_v2_model_arch}" if olmo_core_v2_model_arch else ""),
180 |             (f"--olmo-core-v2-tokenizer {olmo_core_v2_tokenizer}" if olmo_core_v2_tokenizer else ""),
181 |             (f"--huggingface-transformers-git-url {huggingface_transformers_git_url}" if huggingface_transformers_git_url else ""),
182 |             f"--huggingface-transformers-commit-hash {huggingface_transformers_commit_hash}",
183 |             (
184 |                 f"--huggingface-transformers-model-id {huggingface_transformers_model_id}"
185 |                 if huggingface_transformers_model_id
186 |                 else ""
187 |             ),
188 |             f"--huggingface-transformers-revision {huggingface_transformers_revision}",
189 |             "--use-system-python",
190 |             ("--skip-validation" if skip_validation else ""),
191 |             ("--debug-validation" if debug_validation else ""),
192 |             (f"--torch-device {torch_device}" if torch_device else ""),
193 |         ]
194 |         remote_command_str = " ".join(remote_command)
195 | 
196 |         gantry_command = [
197 |             "gantry run",
198 |             f"--description 'Converting HF checkpoint at {input_dir}'",
199 |             ("--allow-dirty" if beaker_allow_dirty else ""),
200 |             "--no-python",
201 |             f"--workspace {beaker_workspace}",
202 |             f"--priority {beaker_priority}",
203 |             f"--gpus {beaker_gpus}",
204 |             ("--preemptible" if beaker_preemptible else ""),
205 |             f"--budget {beaker_budget}",
206 |             "--yes",
207 |             ("--dry-run" if beaker_dry_run else ""),
208 |             " ".join(gantry_flags),
209 |             f"-- /bin/bash -c '{remote_command_str}'",
210 |         ]
211 |         gantry_command_str = " ".join(gantry_command)
212 | 
213 |         print(f"Submitting to beaker with command: {gantry_command_str}")
214 |         return subprocess.run(shlex.split(gantry_command_str), check=True, env=env.path())
215 | 
216 |     remove_conflicting_packages(env=env)
217 | 
218 |     return convert_hf_to_olmo_core_v2(
219 |         input_dir=input_dir,
220 |         output_dir=output_dir,
221 |         output_suffix=output_suffix,
222 |         olmo_core_v2_commit_hash=olmo_core_v2_commit_hash,
223 |         olmo_core_v2_experiment_json_path=olmo_core_v2_experiment_json_path,
224 |         olmo_core_v2_model_arch=olmo_core_v2_model_arch,
225 |         olmo_core_v2_tokenizer=olmo_core_v2_tokenizer,
226 |         transformers_git_url=huggingface_transformers_git_url,
227 |         transformers_commit_hash=huggingface_transformers_commit_hash,
228 |         transformers_model_id=huggingface_transformers_model_id,
229 |         transformers_revision=huggingface_transformers_revision,
230 |         skip_validation=skip_validation,
231 |         debug_validation=debug_validation,
232 |         device=torch_device,
233 |         env=env,
234 |     )
235 | 
-------------------------------------------------------------------------------- /src/cookbook/model/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/model/__init__.py
-------------------------------------------------------------------------------- /src/cookbook/model/config.py: --------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | from typing import Any, Optional
4 | 
5 | import olmo_core.train.train_module as train_module
6 | from olmo_core.config import Config
7
| from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig 8 | from olmo_core.nn.transformer import ( 9 | TransformerBlockType, 10 | TransformerConfig, 11 | ) 12 | from olmo_core.optim import OptimConfig 13 | from olmo_core.train import TrainerConfig 14 | 15 | 16 | class Tokenizers(Enum): 17 | dolma2 = TokenizerConfig.dolma2() 18 | gpt_neox = TokenizerConfig.gpt_neox_olmo_dolma_v1_5() 19 | superbpe_experimental = TokenizerConfig( 20 | vocab_size=180021, 21 | identifier="allenai/superbpe-experimental_v0.1.0", 22 | eos_token_id=180000, 23 | pad_token_id=180001, 24 | ) 25 | dolma2_180k = TokenizerConfig( 26 | vocab_size=180021, 27 | identifier="allenai/dolma2-180k-experimental-0.0.1", 28 | eos_token_id=180000, 29 | pad_token_id=180001, 30 | ) 31 | 32 | 33 | @dataclass 34 | class ModelTrainConfig(Config): 35 | model: TransformerConfig 36 | optim: OptimConfig 37 | dataset: NumpyDatasetConfig 38 | data_loader: NumpyDataLoaderConfig 39 | trainer: TrainerConfig 40 | train_module: train_module.TransformerTrainModuleConfig 41 | init_seed: int = 12536 42 | 43 | 44 | @dataclass 45 | class DefaultOptimizerProperties: 46 | betas: tuple = (0.9, 0.95) 47 | eps: float = 1e-8 48 | weight_decay: float = 0.1 49 | 50 | 51 | @dataclass 52 | class DefaultTransformerProperties: 53 | block_type: TransformerBlockType = TransformerBlockType.reordered_norm 54 | decay_embeddings: bool = False 55 | layer_norm_eps: float = 1e-6 56 | qk_norm: bool = True 57 | rope_theta: int = 500_000 58 | 59 | 60 | class ModelConfigIdentifier: 61 | """ 62 | A dynamic registry for model identifiers that auto-initializes when used. 63 | """ 64 | 65 | _registry: dict[str, str] = {} 66 | _initialized = False 67 | 68 | def __init__(self, identifier): 69 | # Auto-initialize the first time this class is used 70 | if not ModelConfigIdentifier._initialized: 71 | ModelConfigIdentifier._initialize_identifiers() 72 | 73 | if identifier not in ModelConfigIdentifier._registry: 74 | raise ValueError( 75 | f"'{identifier}' is not a valid model identifier. 
" 76 | f"Available models: {', '.join(ModelConfigIdentifier._registry.keys())}" 77 | ) 78 | 79 | self.value = identifier 80 | self.name = identifier 81 | 82 | def __str__(self) -> str: 83 | return self.value 84 | 85 | def __repr__(self) -> str: 86 | return f"ModelConfigIdentifier({self.value!r})" 87 | 88 | def __eq__(self, other) -> bool: 89 | if isinstance(other, str): 90 | return self.value == other 91 | elif isinstance(other, ModelConfigIdentifier): 92 | return self.value == other.value 93 | return False 94 | 95 | @classmethod 96 | def _get_model_methods(cls, target_class) -> list[str]: 97 | """Get all classmethods of a class that might represent model configurations.""" 98 | return [ 99 | attr 100 | for attr in dir(target_class) 101 | if callable(getattr(target_class, attr)) 102 | and not attr.startswith("_") 103 | and attr not in ["from_dict", "from_json", "from_model_identifier", "values", "keys"] 104 | ] 105 | 106 | @classmethod 107 | def _initialize_identifiers(cls) -> None: 108 | """Initialize the model identifier registry with methods from TransformerConfig and WrappedTransformerConfig.""" 109 | # Add default models 110 | cls._registry["default"] = "default" 111 | 112 | # Add methods from WrappedTransformerConfig 113 | for method_name in cls._get_model_methods(WrappedTransformerConfig): 114 | cls._registry[method_name] = method_name 115 | 116 | # Add methods from TransformerConfig 117 | for method_name in cls._get_model_methods(TransformerConfig): 118 | if method_name not in cls._registry: 119 | cls._registry[method_name] = method_name 120 | 121 | cls._initialized = True 122 | 123 | @classmethod 124 | def keys(cls) -> list[str]: 125 | """Return all valid model identifier keys.""" 126 | if not cls._initialized: 127 | cls._initialize_identifiers() 128 | return list(cls._registry.keys()) 129 | 130 | @classmethod 131 | def values(cls) -> list[str]: 132 | """Return all valid model identifier values.""" 133 | if not cls._initialized: 134 | cls._initialize_identifiers() 135 | return list(cls._registry.values()) 136 | 137 | @classmethod 138 | def __get_pydantic_core_schema__(cls, _source_type, _handler): 139 | from pydantic_core import core_schema 140 | 141 | def validate_identifier(value, info): 142 | # Ensure registry is initialized 143 | if not cls._initialized: 144 | cls._initialize_identifiers() 145 | 146 | # Handle existing instances 147 | if isinstance(value, cls): 148 | return value 149 | 150 | # Handle string values 151 | if not isinstance(value, str): 152 | raise ValueError(f"Expected string or {cls.__name__}, got {type(value)}") 153 | 154 | # Validate against registry 155 | if value not in cls._registry: 156 | valid_values = ", ".join(cls._registry.keys()) 157 | raise ValueError( 158 | f"'{value}' is not a valid model identifier. 
" f"Available models: {valid_values}" 159 | ) 160 | 161 | return cls(value) 162 | 163 | return core_schema.with_info_plain_validator_function( 164 | validate_identifier, 165 | serialization=core_schema.plain_serializer_function_ser_schema(lambda instance: instance.value), 166 | metadata={ 167 | "type": "enum-like", 168 | "values": list(cls.keys()), 169 | }, 170 | ) 171 | 172 | 173 | class WrappedTransformerConfig: 174 | @classmethod 175 | def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig: 176 | """ 177 | OLMo 30m 178 | """ 179 | return getattr(TransformerConfig, "llama_like")( 180 | d_model=256, 181 | n_heads=8, 182 | n_layers=4, 183 | vocab_size=tokenizer.padded_vocab_size(), 184 | rope_theta=DefaultTransformerProperties.rope_theta, 185 | layer_norm_eps=DefaultTransformerProperties.layer_norm_eps, 186 | qk_norm=DefaultTransformerProperties.qk_norm, 187 | block_name=DefaultTransformerProperties.block_type, 188 | ) 189 | 190 | @classmethod 191 | def from_model_identifier( 192 | cls, 193 | model_identifier: ModelConfigIdentifier, 194 | tokenizer: TokenizerConfig = Tokenizers.dolma2.value, 195 | ) -> TransformerConfig: 196 | """ 197 | Create a TransformerConfig from a ModelConfigIdentifier. 198 | 199 | This method supports all models defined in the ModelConfigIdentifier enum by 200 | mapping them to appropriate TransformerConfig class methods. 201 | 202 | Args: 203 | model_identifier: The model identifier to create a config for 204 | tokenizer: The tokenizer config to use 205 | model_overrides: Optional overrides for the model config 206 | 207 | Returns: 208 | A TransformerConfig instance for the specified model 209 | 210 | Raises: 211 | ValueError: If the model identifier isn't supported in either cookbook or olmo-core 212 | """ 213 | model_name = model_identifier.value 214 | 215 | # First, check if we have a custom config override for this model 216 | if hasattr(cls, model_name): 217 | return getattr(cls, model_name)(tokenizer) 218 | 219 | # Then, check if the TransformerConfig class has a method for this model 220 | if hasattr(TransformerConfig, model_name): 221 | return getattr(TransformerConfig, model_name)( 222 | vocab_size=tokenizer.padded_vocab_size(), 223 | ) 224 | 225 | raise ValueError( 226 | f"Model identifier '{model_identifier}' is not supported in either cookbook or olmo-core." 
227 | f" Available models: {', '.join(ModelConfigIdentifier.keys())}" 228 | ) 229 | 230 | 231 | DEFAULT_LR_MAP = { 232 | "olmo2_1B": 1.8e-3, 233 | "olmo2_1B_v2": 1.8e-3, 234 | } 235 | -------------------------------------------------------------------------------- /src/cookbook/model/evaluators.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict, List 3 | 4 | from olmo_eval import list_tasks 5 | 6 | OLMO2_DEV_1B_TASKS = [ 7 | # OLMES Core 9(-ish) RC 8 | "arc_challenge_test_rc_5shot", 9 | "arc_easy_test_rc_5shot", 10 | "hellaswag_rc_5shot", # 1K subset of HellaSwag 11 | "winogrande_val_rc_5shot", # Helpful after 750M-5xC scale 12 | "csqa_val_rc_5shot", 13 | "piqa_val_rc_5shot", 14 | "socialiqa_val_rc_5shot", 15 | # MMLU RC 16 | "mmlu_stem_val_rc_5shot", 17 | "mmlu_humanities_val_rc_5shot", 18 | "mmlu_social_sciences_val_rc_5shot", 19 | "mmlu_other_val_rc_5shot", 20 | "mmlu_stem_test_rc_5shot", 21 | "mmlu_humanities_test_rc_5shot", 22 | "mmlu_social_sciences_test_rc_5shot", 23 | "mmlu_other_test_rc_5shot", 24 | # Gen tasks BPB 25 | "gsm8k_gold_bpb_5shot", 26 | "minerva_math_algebra_gold_bpb_0shot", 27 | "minerva_math_counting_and_probability_gold_bpb_0shot", 28 | "minerva_math_geometry_gold_bpb_0shot", 29 | "minerva_math_intermediate_algebra_gold_bpb_0shot", 30 | "minerva_math_number_theory_gold_bpb_0shot", 31 | "minerva_math_prealgebra_gold_bpb_0shot", 32 | "minerva_math_precalculus_gold_bpb_0shot", 33 | "codex_humaneval_gold_bpb_0shot", 34 | "codex_mbpp_gold_bpb_0shot", 35 | # Sanity check for MCQA ability 36 | "copycolors_10way", 37 | # Basic Skills rc 5shot 38 | "basic_skills_arithmetic_rc_5shot", 39 | "basic_skills_coding_rc_5shot", 40 | "basic_skills_common_knowledge_rc_5shot", 41 | "basic_skills_logical_reasoning_rc_5shot", 42 | "basic_skills_pattern_rc_5shot", 43 | "basic_skills_string_operations_rc_5shot", 44 | ] 45 | 46 | TASK_GROUPS: Dict[str, List[str]] = { 47 | "all": list(list_tasks()), 48 | "olmo2_dev_1b": OLMO2_DEV_1B_TASKS 49 | } 50 | 51 | 52 | ALL_TASKS_MAP = {task.upper(): task for task in list_tasks()} 53 | 54 | DownstreamEvaluator = Enum( 55 | "DownstreamEvaluator", 56 | { 57 | item[0].upper(): item[1] if isinstance(item[1], list) else [item[1]] 58 | for item in {**TASK_GROUPS, **ALL_TASKS_MAP}.items() 59 | }, 60 | ) 61 | 62 | 63 | def get_tasks_for_groups(groups: List[str]) -> List[str]: 64 | """Return all tasks in a group""" 65 | tasks = [] 66 | for group in groups: 67 | if group in TASK_GROUPS: 68 | tasks.extend(TASK_GROUPS[group]) 69 | elif group.upper() in ALL_TASKS_MAP: 70 | tasks.append(ALL_TASKS_MAP[group.upper()]) 71 | else: 72 | raise ValueError(f"Group or task '{group}' not found") 73 | 74 | tasks = list(set(tasks)) 75 | tasks.sort() 76 | 77 | return tasks 78 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-hlr.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code-hlr" 2 | description: "Love2Code model, first stab at a config" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | learning_rate: 1.8e-3 12 | model: "olmo2_1B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | cluster: ai2/jupiter-cirrascale-2 16 | weka: true 17 | dataset: 18 | sources: 19 | - 
name: the-stack-v2-ai2v0 20 | target_ratio: 0.85 21 | paths: 22 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-001-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-004-00000.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-006-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-007-00000.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00000.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00001.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00002.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-009-00000.npy 30 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-010-00000.npy 31 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-011-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-012-00000.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-014-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-015-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-016-00000.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-017-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-018-00000.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00000.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00001.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-022-00003.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00001.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00002.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-025-00001.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-027-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00000.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00001.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00002.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00003.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-029-00001.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-030-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-031-00000.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-032-00001.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-034-00000.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-035-00000.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-037-00001.npy 56 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-038-00000.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-040-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00000.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00001.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00002.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-043-00000.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-045-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-046-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-047-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00000.npy 66 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00002.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00000.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00002.npy 69 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00003.npy 70 | - name: dclm-codeprose-v0 71 | target_ratio: 0.15 72 | paths: 73 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-00-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-01-00000.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-02-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-03-00000.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-04-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-05-00000.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-06-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-07-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-08-00000.npy 82 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-09-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-10-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-11-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-12-00000.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-13-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-14-00000.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-15-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-16-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-17-00000.npy 91 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-18-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-19-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-20-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-21-00000.npy 95 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-22-00000.npy 96 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-23-00000.npy 97 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-24-00000.npy 98 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-25-00000.npy 99 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-26-00000.npy 100 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-27-00000.npy 101 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-28-00000.npy 102 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-29-00000.npy 103 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-30-00000.npy 104 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-starcoder1-noprose.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code-starcoder-no-prose" 2 | description: "Love2Code model, but with starcoder1 data and no prose" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: high 14 | cluster: ai2/jupiter-cirrascale-2 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: starcoder1 19 | target_ratio: 1.0 20 | paths: 21 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-00-00000.npy 22 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-01-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-01-00001.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-02-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-02-00001.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-03-00000.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-03-00001.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-04-00000.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-05-00000.npy 30 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-05-00001.npy 31 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-06-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-06-00001.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-07-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-08-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-08-00001.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-09-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-09-00001.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-10-00000.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-11-00000.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-11-00001.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-12-00000.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-12-00001.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-13-00000.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-14-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-15-00000.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-15-00001.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-16-00000.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-17-00000.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-18-00000.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-19-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-19-00001.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-20-00000.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-20-00001.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-21-00000.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-22-00000.npy 56 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-22-00001.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-23-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-23-00001.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-24-00000.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-25-00000.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-25-00001.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-26-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-27-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-28-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-28-00001.npy 66 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-29-00000.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-29-00001.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-30-00000.npy 69 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-31-00000.npy 70 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-31-00001.npy 71 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-32-00000.npy 72 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-32-00001.npy 73 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-33-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-33-00001.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-34-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-34-00001.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-35-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-35-00001.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-36-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-37-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-37-00001.npy 82 | - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-38-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-39-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-40-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-40-00001.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-41-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-41-00001.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-42-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-43-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-44-00000.npy 91 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-45-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-46-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-47-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/allenai/dolma2-tokenizer/part-48-00000.npy 95 | -------------------------------------------------------------------------------- /src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-5xC-love2code" 2 | description: "Love2Code model, first stab at a config" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 113_184_153_600 # 5xC multiplier 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: urgent 14 | cluster: ai2/jupiter-cirrascale-2 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: the-stack-v2-ai2v0 19 | target_ratio: 0.85 20 | paths: 21 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-001-00000.npy 22 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-004-00000.npy 23 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-006-00000.npy 24 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-007-00000.npy 25 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00000.npy 26 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00001.npy 27 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-008-00002.npy 28 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-009-00000.npy 29 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-010-00000.npy 30 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-011-00000.npy 31 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-012-00000.npy 32 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-014-00000.npy 33 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-015-00000.npy 34 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-016-00000.npy 35 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-017-00000.npy 36 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-018-00000.npy 37 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00000.npy 38 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-020-00001.npy 39 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-022-00003.npy 40 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00001.npy 41 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-024-00002.npy 42 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-025-00001.npy 43 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-027-00000.npy 44 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00000.npy 45 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00001.npy 46 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00002.npy 47 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-028-00003.npy 48 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-029-00001.npy 49 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-030-00000.npy 50 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-031-00000.npy 51 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-032-00001.npy 52 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-034-00000.npy 53 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-035-00000.npy 54 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-037-00001.npy 55 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-038-00000.npy 56 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-040-00000.npy 57 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00000.npy 58 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00001.npy 59 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-042-00002.npy 60 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-043-00000.npy 61 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-045-00000.npy 62 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-046-00000.npy 63 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-047-00000.npy 64 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00000.npy 65 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-048-00002.npy 66 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00000.npy 67 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00002.npy 68 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/stack2_v0_trunc/part-050-00003.npy 69 | - name: dclm-codeprose-v0 70 | target_ratio: 0.15 71 | paths: 72 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-00-00000.npy 73 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-01-00000.npy 74 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-02-00000.npy 75 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-03-00000.npy 76 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-04-00000.npy 77 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-05-00000.npy 78 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-06-00000.npy 79 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-07-00000.npy 80 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-08-00000.npy 81 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-09-00000.npy 82 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-10-00000.npy 83 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-11-00000.npy 84 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-12-00000.npy 85 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-13-00000.npy 86 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-14-00000.npy 87 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-15-00000.npy 88 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-16-00000.npy 89 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-17-00000.npy 90 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-18-00000.npy 91 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-19-00000.npy 92 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-20-00000.npy 93 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-21-00000.npy 94 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-22-00000.npy 95 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-23-00000.npy 96 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-24-00000.npy 97 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-25-00000.npy 98 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-26-00000.npy 99 | - 
weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-27-00000.npy 100 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-28-00000.npy 101 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-29-00000.npy 102 | - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-30-00000.npy 103 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-1b-dclm-dolma2-anneal-10b.yaml: -------------------------------------------------------------------------------- 1 | name: "learn2code-linear-nowup-anneal-10B" 2 | description: "OLMo2 1b anneal to 10B Tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_1B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | learning_rate: 1.8e-3 17 | cluster: ai2/jupiter-cirrascale-2 18 | rank_microbatch_size: 32768 19 | scheduler_type: linear 20 | warmup_steps: 0 21 | annealing: true 22 | model_overrides: 23 | - block.feed_forward.hidden_size=5632 24 | load_path: weka://oe-training-default/ai2-llm/checkpoints/ai2-tylerm/olmo-cookbook-1b-5xC-love2code-no-prose-hlr-00203459/step53971/ 25 | load_state: false 26 | weka: true 27 | dataset: 28 | sources: 29 | - name: dclm-baseline-20pct-dolma2 30 | target_ratio: 1.0 31 | paths: 32 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 33 | 34 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-code-dolma2-anneal-10b-augusta.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-learn2code-linear-nowup-anneal-10B" 2 | description: "OLMo2 7b anneal to 10B Tokens on code data" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: false 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: python 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/the-stack-v2/love2code/v0/heuristic_filtered_minhash_plpartition/python/dolma2-tokenizer/*.npy 31 | - name: rust 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/the-stack-v2/love2code/v0/heuristic_filtered_minhash_plpartition/rust/dolma2-tokenizer/*.npy 35 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-dclm-only-anneal-10b-control.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_dclm-only_control" 2 | description: "OLMo2 7b anneal to 10B Tokens 
for dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: dclm-baseline-olmo2 28 | target_ratio: 1.0 29 | paths: 30 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-finemath3p-anneal-10b-50split-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_finemath-3plus-dclm" 2 | description: "OLMo2 7b anneal to 10B Tokens for finemath-3plus + dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: finemath-3plus 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy 31 | - name: dclm-baseline-olmo2 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-finemath3p-anneal-10b-50split.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_finemath-3plus" 2 | description: "OLMo2 7b anneal to 10B Tokens for finemath-3plus" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: finemath-3plus 28 | target_ratio: 0.5 29 | paths: 30 
| - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy 31 | - name: base # Survivors | 5.59B tokens 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v2_subsamples/madlad_ablations_v1/ingredients/survivors/tokens/*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b-50split-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat-dclm" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links + dclm" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | - name: dclm-baseline-olmo2 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b-50split.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | - name: base # Survivors | 5.59B tokens 32 | target_ratio: 0.5 33 | paths: 34 | - s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v2_subsamples/madlad_ablations_v1/ingredients/survivors/tokens/*.npy 35 | 36 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/anneal/train-7b-wiki-concat-anneal-10b.yaml: 
-------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-anneal_wiki-concat" 2 | description: "OLMo2 7b anneal to 10B Tokens for structured-wikipedia-concat with links" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 2048 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: structure-wiki-concat-with-links 28 | target_ratio: 1.0 29 | paths: 30 | - s3://ai2-llm/preprocessed/structured-wikipedia/concat_with_links/allenai/dolma2-tokenizer/*.npy 31 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-1xC-dclm.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo-cookbook-1b-1xC-dclm-001" 2 | description: "Example olmo-cookbook recipe" 3 | budget: "ai2/oe-data" 4 | workspace: "ai2/dolma2" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 100_000_000 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_1B" 12 | tokenizer: "dolma2" 13 | priority: high 14 | cluster: ai2/saturn-cirrascale 15 | weka: true 16 | dataset: 17 | sources: 18 | - name: dclm-baseline-ft7pct-fw2 19 | target_ratio: 1.0 20 | paths: 21 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy 22 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy 23 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy 24 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy 25 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy 26 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy 27 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy 28 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy 29 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy 30 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy 31 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy 32 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy 33 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy 34 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy 35 | - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy 36 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy 37 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy 38 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy 39 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy 40 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy 41 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy 42 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy 43 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy 44 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy 45 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy 46 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy 47 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy 48 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy 49 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy 50 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy 51 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy 52 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy 53 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy 54 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy 55 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy 56 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy 57 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy 58 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy 59 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy 60 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy 61 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy 62 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy 63 | - 
s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy 64 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy 65 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy 66 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy 67 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy 68 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy 69 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy 70 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy 71 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy 72 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy 73 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy 74 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy 75 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy 76 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy 77 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy 78 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy 79 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy 80 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy 81 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy 82 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy 83 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy 84 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy 85 | - s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy 86 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-180k-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-180k-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | scheduler_type: wsd 19 | 
cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-dolma2-180k 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-180k.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-180k" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | weka: true 20 | # downstream_evaluators: 21 | # - olmo2_dev_1b 22 | dataset: 23 | sources: 24 | - name: dclm-baseline-20pct-dolma2-180k 25 | target_ratio: 1.0 26 | paths: 27 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 28 | 29 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-augusta.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-augusta-test" 2 | description: "" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 1_000_000_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | cluster: ai2/augusta-google-1 17 | activation_checkpointing: true 18 | eval_interval: 10 19 | dataset: 20 | sources: 21 | - name: gs-test 22 | target_ratio: 1.0 23 | paths: 24 | - gs://ai2-llm/preprocessed/dclm/love2code_codeprose/codeprose/*.npy 25 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-dolma2-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_1B_v2" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | scheduler_type: wsd 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | rank_microbatch_size: 32768 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-dolma2 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-dolma2.yaml: 
-------------------------------------------------------------------------------- 1 | name: "olmo2-1b-dclm-dolma2" 2 | description: "OLMo2 1b@5xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_1B_v2" 13 | tokenizer: "dolma2" 14 | priority: high 15 | eval_interval: 250 16 | cluster: ai2/jupiter-cirrascale-2 17 | rank_microbatch_size: 32768 18 | weka: true 19 | dataset: 20 | sources: 21 | - name: dclm-baseline-20pct-dolma2 22 | target_ratio: 1.0 23 | paths: 24 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 25 | 26 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-superbpe-wsd.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-superbpe-wsd" 2 | description: "OLMo2 1b@5xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 3072000 10 | rank_microbatch_size: 24000 11 | sequence_length: 3000 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6100 18 | scheduler_type: wsd 19 | learning_rate: 1.8e-3 20 | cluster: ai2/jupiter-cirrascale-2 21 | weka: true 22 | # downstream_evaluators: 23 | # - olmo2_dev_1b 24 | dataset: 25 | sources: 26 | - name: dclm-baseline-20pct-superbpe 27 | target_ratio: 1.0 28 | paths: 29 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 30 | 31 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-1b-5xC-dclm-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-1b-5xC-dclm-hlr-superbpe" 2 | description: "OLMo2 1b@5xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 16384 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6100 18 | learning_rate: 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | # downstream_evaluators: 22 | # - olmo2_dev_1b 23 | dataset: 24 | sources: 25 | - name: dclm-baseline-20pct-superbpe 26 | target_ratio: 1.0 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 29 | 30 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-dolma2-180k.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-hlr-dolma2-180k" 2 | description: "OLMo2 7b@1xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 144_284_139_520 # 
7_214_206_976 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "dolma2_180k" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6880 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-dolma2-180k 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer-180k/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-dolma2" 2 | description: "OLMo2 7b@1xC dclm-baseline" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 137_751_511_040 # 6,887,575,552 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6568 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-dolma2 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/dolma2-tokenizer/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo2/train-7b-1xC-dclm-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b-1xC-dclm-hlr-superbpe" 2 | description: "OLMo2 7b@1xC dclm-baseline with superbpe tokens" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 16 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 144_284_139_520 # 7_214_206_976 * 20 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 8192 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_7B" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | warmup_steps: 6880 18 | learning_rate: 6.8e-4 # sqrt scaling from 1.8e-3 19 | cluster: ai2/jupiter-cirrascale-2 20 | weka: true 21 | dataset: 22 | sources: 23 | - name: dclm-baseline-20pct-superbpe 24 | target_ratio: 1.0 25 | paths: 26 | - weka://oe-training-default/ai2-llm/preprocessed/dclm/baseline_type_topic_classified_20pct/allenai/superbpe-experimental-0.1.0/**/**/part-0*-00000.npy 27 | 28 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3-evals/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation practices for OLMo 3 development 2 | 3 | 4 | ## In-loop evaluation 5 | 6 | For **OLMo 3 7B** integration tests, use this set of evals, baked into the OLMo 3 7B config in OLMo-Core: https://github.com/allenai/OLMo-core/blob/a91a82e6b8b37103f738e190cdddd6278f2c7f1f/src/scripts/train/OLMo3-7B.py#L113. In summary: 7 | * Don't slow down the training run: 8 | * Prune to a minimal set of tasks and use a fast version of MC. This is captured in OLMo Core as the `fast` set.
See PR: https://github.com/allenai/OLMo-core/pull/282 9 | * Focus on BPB + MC, ignoring RC: 10 | * BPB should spot any major issues early in training, before MC takeoff 11 | * Still track MC because we want to make sure there is sensible metric takeoff. For an example of OLMo 3 7B MC metric takeoff slightly after 150B tokens, see https://wandb.ai/ai2-llm/olmo3/reports/OLMo-3-vs-OLMo-2--VmlldzoxMjc2MTA4Mw 12 | 13 | 14 | For **OLMo 2 1B 5xC** or **OLMo 2 7B annealing** runs, which we are still using for data ablations, the broad recommendation is to rely more on offline eval, which always has the latest state of evals. 15 | 16 | Some more notes about in-loop eval: 17 | * If you want to add an in-loop eval, the repo is here: https://github.com/allenai/OLMo-in-loop-evals 18 | * When selecting metrics in Wandb, be very careful about whether you are selecting `{dev|test}`, `{rc|mc}`, `{length-normalized-accuracy|length-normalized-accuracy v2}`, `{BPB|BPB v2}`, `{5shot|5shot_fast}`. 19 | * The `v2` tasks fix the length normalization; the original versions are reported for backwards compatibility. 20 | * `5shot_fast` and `5shot` will give the same numbers; the `_fast` implementation uses one forward pass for the `A/B/C/D` tokens in MCQA tasks. 21 | 22 | 23 | ## Offline evaluation (1B or smaller) 24 | 25 | For all OLMo models (+ external baselines), this is a running list of evals we care about. 26 | 27 | For **OLMo 2 1B 5xC** runs, it's still good practice to look at both BPB & RC numbers, which usually track each other; MC numbers typically haven't broken through noise at this point. 28 | 29 | The command to run an eval looks like: 30 | 31 | ```bash 32 | CHECKPOINT="/oe-training-default/ai2-llm/checkpoints/mayeec/olmo-cookbook-core-v2-1bv2-5xC-dclm-baseline-topic-classified-sample-natural-28f8e9a9/step61000-hf" 33 | CLUSTER="l40" 34 | NUM_GPUS=1 35 | PARTITION=8 36 | PRIORITY="high" 37 | MODEL_ARGS="dtype=bfloat16" 38 | DASHBOARD="olmo-3-evals" 39 | WORKSPACE="ai2/olmo-3-evals" 40 | 41 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 42 | --tasks "olmo3:dev:1b:main" \ 43 | --priority "$PRIORITY" \ 44 | --cluster "$CLUSTER" \ 45 | --num-gpus "$NUM_GPUS" \ 46 | --model-backend vllm \ 47 | --model-args "$MODEL_ARGS" \ 48 | --partition-size "$PARTITION" \ 49 | --dashboard "$DASHBOARD" \ 50 | --workspace "$WORKSPACE" 51 | 52 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 53 | --tasks "olmo3:dev:1b:main:hf" \ 54 | --priority "$PRIORITY" \ 55 | --cluster "$CLUSTER" \ 56 | --num-gpus "$NUM_GPUS" \ 57 | --model-backend hf \ 58 | --model-args "$MODEL_ARGS" \ 59 | --partition-size "$PARTITION" \ 60 | --dashboard "$DASHBOARD" \ 61 | --workspace "$WORKSPACE" 62 | ``` 63 | 64 | Notes: 65 | * Task names are collected here: https://github.com/allenai/olmo-cookbook/blob/main/src/cookbook/eval/named_tasks.py 66 | 67 | *How long does it take?* 68 | * `olmo3:dev:1b:main` is a full suite of 20 tasks, each with multiple metrics, and some tasks are families with multiple subtasks. In total, this is around 150 metrics. It takes about 2 hours to run all of them with `--partition-size 8` and `num-gpus 1` on a single L40 (launches 5 jobs). 69 | * `olmo3:dev:1b:main:hf` consists of two masked PPL evals. It takes about 1 hour to run both on a single L40. 70 | 71 | To pull dashboard results (use `--format json` to see full results): 72 | 73 | ```bash 74 | olmo-cookbook-eval results \ 75 | --dashboard olmo-3-evals \ 76 | --tasks olmo3:dev:1b:main \ 77 | --tasks olmo3:dev:1b:main:hf \ 78 | --format json | jq '.' | less 79 | ``` 80 | 81 | 82 | *Notes* 83 | * If you want to check whether the datalake uploading job ran, substitute your Beaker experiment ID into this URL (see the example below): `https://oe-eval-datalake.allen.ai/greenlake/metadata/01JWMGNY3G3R5N91NW9TCKF6FB`.
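84 | 85 | For example, a quick check from the command line might look like the following (a minimal sketch; it assumes `curl` and `jq` are installed, and the experiment ID shown is just a placeholder for your own): 86 | 87 | ```bash 88 | # Hypothetical check: fetch the datalake metadata record for a Beaker experiment ID. 89 | curl -s "https://oe-eval-datalake.allen.ai/greenlake/metadata/01JWMGNY3G3R5N91NW9TCKF6FB" | jq '.' 90 | ``` 91 |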
92 | ## Offline evaluation (7B or larger) 93 | 94 | To launch the base evals for large training runs, we have a separate set. We assume these models are better able to follow task formats, so this set removes BPB and includes MC: 95 | 96 | ```sh 97 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 98 | --tasks "olmo3:dev:7b:main" \ 99 | --model-backend vllm \ 100 | ... 101 | ``` 102 | 103 | And pull your results with: 104 | 105 | ```bash 106 | olmo-cookbook-eval results \ 107 | --dashboard olmo-3-evals \ 108 | --tasks olmo3:dev:7b:main \ 109 | --format json | jq '.' | less 110 | ``` 111 | 112 | *How long does it take?* 113 | * `olmo3:dev:7b:main` takes roughly 30 minutes to run in full with `--partition-size 1` and `num-gpus 2` on L40s. 114 | 115 | ## Offline evaluation (midtraining) 116 | 117 | We have an additional set of adapt evals formatted for mid-trained models; these are meant to be run **in addition** to the base evals. Please make sure to add the extra arguments shown below to use a basic chat template for base models: 118 | 119 | ```sh 120 | olmo-cookbook-eval evaluate "$CHECKPOINT" \ 121 | --tasks olmo3:dev:midtrain:v0 \ 122 | --model-backend vllm \ 123 | --no-compute-gold-bpb \ 124 | --model-args chat_template=basic_answer \ 125 | --use-gantry \ 126 | --gantry-args env-secret="OPENAI_API_KEY=openai_api_key" \ 127 | --task-args chat_overrides="{\"generation_kwargs\": {\"stop_sequences\": [\"Problem:\", \"Answer:\", \"Question:\", \"\", \"<|eot_id|>\"]}}" 128 | ... 129 | ``` 130 | 131 | And pull your results with: 132 | 133 | ```bash 134 | olmo-cookbook-eval results \ 135 | --dashboard olmo-3-evals \ 136 | --tasks olmo3:dev:midtrain:v0 \ 137 | --format json | jq '.' | less 138 | ``` 139 | 140 | Please refer to #oe-midtraining for more documentation. 141 | 142 | *How long does it take?* 143 | * `olmo3:dev:midtrain:v0` takes roughly 20 minutes using `--partition-size 1` and `num-gpus 2` with L40s. 144 | 145 | ## FAQs 146 | 147 | 1. **Why leave out GSM8k for base?** The task is odd and appears to be moved mainly by hill-climbing with mid-training data. Minerva seems to cover a greater range. 148 | 149 | 2. **BPB vs RC vs MC?** This is still debated among the team, but the eventual goal should be to move toward BPB numbers that we trust for our experiments, while monitoring MC (or our final target end-task format) on 7B+ runs for metric breakthrough moments & final scores. 150 | 151 | ### RC vs. MC 152 | 153 | For 7B+ runs, our development evals replace the RC formulation of multiple-choice tasks with the MC formulation. 154 | 155 | Summarizing our conversation with the team: we considered three options for calculating both formats and aggregating the results (a toy illustration follows the list): 156 | 157 | 1. Calculate `max(rc, mc)` -- This isn't desirable. Imagine two ablations -- one consistently has a very high RC and the other a very high MC. This is not a behavior we want from our metric. 158 | 2. Calculate `avg(rc, mc)` -- This isn't desirable. At the small scale (when models get random-chance MC) we are artificially penalizing performance. At the large scale (when models get lower RC than MC, because RC does not let the model see the distractor options and is therefore a slightly more difficult task config) we are artificially penalizing performance. 159 | 3. Keep `mc` only -- We choose this. There is agreement that MC is a better task format, and that the issues caused by aggregation are not worth the benefit of accounting for two task formats.
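160 | 161 | As a toy illustration of how the first two options misbehave (hypothetical scores, not numbers from any real run): 162 | 163 | ```python 164 | rc, mc = 0.62, 0.26  # e.g. a small-scale model: informative RC, near-random-chance MC 165 | print(max(rc, mc))    # 0.62 -- rewards whichever format happens to be high 166 | print((rc + mc) / 2)  # 0.44 -- artificially penalizes at both small and large scale 167 | print(mc)             # 0.26 -- what we keep: the single format we ultimately target 168 | ``` 169 |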
170 | Additionally, we observed empirically that the MC tasks better ranked models w.r.t. training compute. For more discussion, see [Figure 1 of the OLMES paper](https://arxiv.org/pdf/2406.08446?page=7). 171 | 172 | ## TODOs 173 | 174 | 1. Want MMLU subcategories to also be pulled as part of the dashboard pull. 175 | 2. Want to add more evals. There are 3 themes: 176 | * Existing evals that others use, but we don't. Candidates include LAMBADA, NQ, SQuAD, TriviaQA, MedMCQA, MedQA, etc. 177 | * Existing evals that need to be fixed in some way. Candidates include converting Gen2MC format, data augmentation for PiQA, SIQA, CSQA, OBQA, Winogrande, simplified versions of hard tasks like SimpleQA, etc. 178 | * New evals that capture something we aren't evaluating today but think it is important to capture. Candidates include legal document tasks, science IE/Summ tasks, structured input (tables) reasoning tasks, perplexity over gold reasoning chains, etc. 179 | 3. Want to add some stats testing, or some notion of noise. 180 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo2-7b_10b-microanneal_web-code-reasoning" 2 | description: "OLMo2 7b microanneal to 10B Tokens on web + code + reasoning data" 3 | budget: "ai2/oe-base" 4 | workspace: "ai2/olmo-3-microanneals" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 10_000_000_000 9 | global_batch_size: 2097152 10 | sequence_length: 4096 11 | seed: 1337 12 | model: "olmo2_7B" 13 | tokenizer: "dolma2" 14 | priority: urgent 15 | eval_interval: 250 16 | cluster: ai2/augusta-google-1 17 | rank_microbatch_size: 8192 18 | scheduler_type: linear 19 | warmup_steps: 0 20 | annealing: 21 | enabled: true 22 | initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA 23 | load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ 24 | load_state: false 25 | dataset: 26 | sources: 27 | - name: web 28 | target_ratio: 0.5 29 | paths: 30 | - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy 31 | - name: code 32 | target_ratio: 0.4 33 | paths: 34 | - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy 35 | - name: reasoning 36 | target_ratio: 0.1 37 | paths: 38 | - gs://ai2-llm/preprocessed/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_natural.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.014765319631599511 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.011240832719837508 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.030561549604003226 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.02716702787570161 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.01097654207606029 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.06613278917361155 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight":
0.008628080822748758 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.07745446670203346 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.019951347751195767 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.051721512912070444 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.07827944177883765 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.033290089550114574 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.02602595762618607 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.004413683853722294 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.07559574213897882 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.16428824015945423 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.042119899145531485 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.11054985278398685 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.03756325792313331 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.02068057269775392 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.03122990602438789 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.03400017727030217 89 | }, 90 | { 91 | "domain": "transportation", 92 | "weight": 0.013721828887038707 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.009641880891709906 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_pstar_001.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.00026734253853351287 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.0008791647328983347 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.0003445385964403161 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.002800955150175096 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.00035335013072883227 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.001974562965447597 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight": 0.00004713420964224449 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.0016387611257735 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.07688426723105127 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.0005008265605007255 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.15269241254530494 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.0009123174419980062 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.050373880217286074 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.020135591340543595 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.0017867234094703585 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.0010592059864753477 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.001744181597964507 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.41123062705335955 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.01865278057032931 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.00024003225357094727 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.2546200267035965 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.00023863135040023465 89 | }, 90 | { 91 | 
"domain": "transportation", 92 | "weight": 0.00020485519629886065 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.00041783109221028316 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dclm_pstar_002.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain": "adult_content", 4 | "weight": 0.0000497999482706765 5 | }, 6 | { 7 | "domain": "art_and_design", 8 | "weight": 0.0008890156319389095 9 | }, 10 | { 11 | "domain": "crime_and_law", 12 | "weight": 0.016186555467287898 13 | }, 14 | { 15 | "domain": "education_and_jobs", 16 | "weight": 0.000810837888896179 17 | }, 18 | { 19 | "domain": "electronics_and_hardware", 20 | "weight": 0.0007234640858286977 21 | }, 22 | { 23 | "domain": "entertainment", 24 | "weight": 0.007313197890536659 25 | }, 26 | { 27 | "domain": "fashion_and_beauty", 28 | "weight": 0.0000411900615375631 29 | }, 30 | { 31 | "domain": "finance_and_business", 32 | "weight": 0.0002518954652144411 33 | }, 34 | { 35 | "domain": "food_and_dining", 36 | "weight": 0.003309783527164619 37 | }, 38 | { 39 | "domain": "games", 40 | "weight": 0.0017901306583973666 41 | }, 42 | { 43 | "domain": "health", 44 | "weight": 0.08071548886090384 45 | }, 46 | { 47 | "domain": "history_and_geography", 48 | "weight": 0.0004153629844012423 49 | }, 50 | { 51 | "domain": "home_and_hobbies", 52 | "weight": 0.008164976242805017 53 | }, 54 | { 55 | "domain": "industrial", 56 | "weight": 0.04250079499662069 57 | }, 58 | { 59 | "domain": "literature", 60 | "weight": 0.12352451996832144 61 | }, 62 | { 63 | "domain": "politics", 64 | "weight": 0.001356369544468529 65 | }, 66 | { 67 | "domain": "religion", 68 | "weight": 0.0005276023343452912 69 | }, 70 | { 71 | "domain": "science_math_and_technology", 72 | "weight": 0.3859178478770352 73 | }, 74 | { 75 | "domain": "social_life", 76 | "weight": 0.022827908810393156 77 | }, 78 | { 79 | "domain": "software", 80 | "weight": 0.0008462566212385966 81 | }, 82 | { 83 | "domain": "software_development", 84 | "weight": 0.2563807300394595 85 | }, 86 | { 87 | "domain": "sports_and_fitness", 88 | "weight": 0.04400562004176248 89 | }, 90 | { 91 | "domain": "transportation", 92 | "weight": 0.0011793381768489277 93 | }, 94 | { 95 | "domain": "travel_and_tourism", 96 | "weight": 0.0002713128763231578 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/mixes/dist-plot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | from pathlib import Path 6 | 7 | # Define the paths to the JSON files 8 | json_files = ["dclm_natural.json", "dclm_pstar_001.json", "dclm_pstar_002.json"] 9 | 10 | # Load and combine data from all JSON files 11 | all_data = [] 12 | for json_file in json_files: 13 | with open(json_file, "r") as f: 14 | data = json.load(f) 15 | 16 | # Extract filename without extension for mix label 17 | mix_name = Path(json_file).stem 18 | 19 | # Add mix information to each domain entry 20 | for item in data: 21 | item["mix"] = mix_name 22 | all_data.append(item) 23 | 24 | # Convert to DataFrame 25 | df = pd.DataFrame(all_data) 26 | 27 | # Set up the plot style 28 | plt.figure(figsize=(16, 10)) 29 | sns.set_style("whitegrid") 30 | 31 | # Create histogram with color coding by mix and better spacing 32 
| ax = sns.barplot(data=df, x="domain", y="weight", hue="mix", palette="Set2") 33 | 34 | # Add spacing between domain groups 35 | ax.tick_params(axis="x", which="major", pad=10) 36 | 37 | # Customize the plot 38 | plt.title("Domain Weight Distribution Across DCLM Mixes", fontsize=16, fontweight="bold", pad=20) 39 | plt.xlabel("Domain", fontsize=12, labelpad=15) 40 | plt.ylabel("Weight", fontsize=12, labelpad=15) 41 | plt.xticks(rotation=45, ha="right", fontsize=10) 42 | plt.yticks(fontsize=10) 43 | 44 | # Add a dashed vertical line at the left edge of each domain group for better readability 45 | for i in range(len(df["domain"].unique())): 46 | plt.axvline(x=i - 0.5, color="gray", linestyle="--", alpha=0.3, linewidth=0.8) 47 | 48 | # Improve legend positioning and styling 49 | plt.legend(title="Mix", bbox_to_anchor=(1.02, 1), loc="upper left", frameon=True, fancybox=True, shadow=True) 50 | 51 | # Adjust layout to prevent label cutoff with more padding 52 | plt.tight_layout(pad=2.0) 53 | 54 | # Show the plot 55 | plt.show() 56 | 57 | # Print summary statistics 58 | print("\nSummary Statistics by Mix:") 59 | print(df.groupby("mix")["weight"].agg(["count", "mean", "std", "min", "max"])) 60 | 61 | # Print top domains by weight for each mix 62 | print("\nTop 5 domains by weight for each mix:") 63 | for mix in df["mix"].unique(): 64 | mix_data = df[df["mix"] == mix].nlargest(5, "weight") 65 | print(f"\n{mix}:") 66 | for _, row in mix_data.iterrows(): 67 | print(f" {row['domain']}: {row['weight']:.4f}") 68 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-001-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-001-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline p* mix" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.00026734253853351287 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.0008791647328983347 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.0003445385964403161 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.002800955150175096 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.00035335013072883227 45 | repetition_factor: 2.0 46 | paths: 47 | -
"s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.001974562965447597 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.00004713420964224449 55 | repetition_factor: 2.0 56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.0016387611257735 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 0.07688426723105127 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.0005008265605007255 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.15269241254530494 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.0009123174419980062 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.050373880217286074 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.020135591340543595 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.0017867234094703585 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.0010592059864753477 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.001744181597964507 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.41123062705335955 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.01865278057032931 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.00024003225357094727 120 | 
repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.2546200267035965 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 0.00023863135040023465 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.00020485519629886065 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: travel_and_tourism 139 | target_ratio: 0.00041783109221028316 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-002-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-002-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline p* mix" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.0000497999482706765 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.0008890156319389095 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.016186555467287898 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.000810837888896179 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.0007234640858286977 45 | repetition_factor: 2.0 46 | paths: 47 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.007313197890536659 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.0000411900615375631 55 | repetition_factor: 2.0 
56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.0002518954652144411 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 0.003309783527164619 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.0017901306583973666 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.08071548886090384 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.0004153629844012423 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.008164976242805017 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.04250079499662069 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.12352451996832144 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.001356369544468529 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.0005276023343452912 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.3859178478770352 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.022827908810393156 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.0008462566212385966 120 | repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.2563807300394595 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 
0.04400562004176248 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.0011793381768489277 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: travel_and_tourism 139 | target_ratio: 0.0002713128763231578 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/olmo3/pstar/train-1b-5xC-pstar-natural-dclm-dolma2.yaml: -------------------------------------------------------------------------------- 1 | name: "olmo3-1b-5xC-dclm-pstar-natural-hlr-dolma2" 2 | description: "OLMo3 1b@5xC dclm-baseline FT classifier natural distribution" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 127_939_584_000 9 | global_batch_size: 2097152 10 | rank_microbatch_size: 32768 11 | sequence_length: 4096 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "dolma2" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/augusta-google-1 19 | downstream_evaluators: 20 | - olmo2_dev_1b 21 | dataset: 22 | sources: 23 | - name: adult_content 24 | target_ratio: 0.014765319631599511 25 | repetition_factor: 2.0 26 | paths: 27 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/adult_content/**/*.npy" 28 | - name: art_and_design 29 | target_ratio: 0.011240832719837508 30 | repetition_factor: 2.0 31 | paths: 32 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/art_and_design/**/*.npy" 33 | - name: crime_and_law 34 | target_ratio: 0.030561549604003226 35 | repetition_factor: 2.0 36 | paths: 37 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/crime_and_law/**/*.npy" 38 | - name: education_and_jobs 39 | target_ratio: 0.02716702787570161 40 | repetition_factor: 2.0 41 | paths: 42 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/education_and_jobs/**/*.npy" 43 | - name: electronics_and_hardware 44 | target_ratio: 0.01097654207606029 45 | repetition_factor: 2.0 46 | paths: 47 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/electronics_and_hardware/**/*.npy" 48 | - name: entertainment 49 | target_ratio: 0.06613278917361155 50 | repetition_factor: 2.0 51 | paths: 52 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/entertainment/**/*.npy" 53 | - name: fashion_and_beauty 54 | target_ratio: 0.008628080822748758 55 | repetition_factor: 2.0 56 | paths: 57 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/fashion_and_beauty/**/*.npy" 58 | - name: finance_and_business 59 | target_ratio: 0.07745446670203346 60 | repetition_factor: 2.0 61 | paths: 62 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/finance_and_business/**/*.npy" 63 | - name: food_and_dining 64 | target_ratio: 
0.019951347751195767 65 | repetition_factor: 2.0 66 | paths: 67 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/food_and_dining/**/*.npy" 68 | - name: games 69 | target_ratio: 0.051721512912070444 70 | repetition_factor: 2.0 71 | paths: 72 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/games/**/*.npy" 73 | - name: health 74 | target_ratio: 0.07827944177883765 75 | repetition_factor: 2.0 76 | paths: 77 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/health/**/*.npy" 78 | - name: history_and_geography 79 | target_ratio: 0.033290089550114574 80 | repetition_factor: 2.0 81 | paths: 82 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/history_and_geography/**/*.npy" 83 | - name: home_and_hobbies 84 | target_ratio: 0.02602595762618607 85 | repetition_factor: 2.0 86 | paths: 87 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/home_and_hobbies/**/*.npy" 88 | - name: industrial 89 | target_ratio: 0.004413683853722294 90 | repetition_factor: 2.0 91 | paths: 92 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/industrial/**/*.npy" 93 | - name: literature 94 | target_ratio: 0.07559574213897882 95 | repetition_factor: 2.0 96 | paths: 97 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/literature/**/*.npy" 98 | - name: politics 99 | target_ratio: 0.16428824015945423 100 | repetition_factor: 2.0 101 | paths: 102 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/politics/**/*.npy" 103 | - name: religion 104 | target_ratio: 0.042119899145531485 105 | repetition_factor: 2.0 106 | paths: 107 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/religion/**/*.npy" 108 | - name: science_math_and_technology 109 | target_ratio: 0.11054985278398685 110 | repetition_factor: 2.0 111 | paths: 112 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/science_math_and_technology/**/*.npy" 113 | - name: social_life 114 | target_ratio: 0.03756325792313331 115 | repetition_factor: 2.0 116 | paths: 117 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/social_life/**/*.npy" 118 | - name: software 119 | target_ratio: 0.02068057269775392 120 | repetition_factor: 2.0 121 | paths: 122 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software/**/*.npy" 123 | - name: software_development 124 | target_ratio: 0.03122990602438789 125 | repetition_factor: 2.0 126 | paths: 127 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/software_development/**/*.npy" 128 | - name: sports_and_fitness 129 | target_ratio: 0.03400017727030217 130 | repetition_factor: 2.0 131 | paths: 132 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/sports_and_fitness/**/*.npy" 133 | - name: transportation 134 | target_ratio: 0.013721828887038707 135 | repetition_factor: 2.0 136 | paths: 137 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/transportation/**/*.npy" 138 | - name: 
travel_and_tourism 139 | target_ratio: 0.009641880891709906 140 | repetition_factor: 2.0 141 | paths: 142 | - "s3://ai2-llm/preprocessed/dclm/baseline_topic_ft_lr05_ng2_n3M6_ova_20pct/allenai/dolma2-tokenizer/travel_and_tourism/**/*.npy" 143 | -------------------------------------------------------------------------------- /src/cookbook/recipes/spring2code/scaling/spring2code-190m-5xC-weka-top15-bpb-hlr-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "spring2code-190m-5xC-top15-hlr-superbpe" 2 | description: "learn2code 190M@5xC top15 languages from the-stack-v2 + dclm prose v2" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 1 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 19_000_000_000 9 | sequence_length: 2048 10 | seed: 1337 11 | model: "olmo2_190M" 12 | tokenizer: "superbpe_experimental" 13 | priority: high 14 | eval_interval: 100 15 | global_batch_size: 1_048_576 # Half of the 1B 16 | learning_rate: 1.8e-3 17 | cluster: ai2/jupiter-cirrascale-2 18 | weka: true 19 | downstream_evaluators: 20 | - codex_humaneval_gold_bpb_0shot 21 | - codex_mbpp_gold_bpb_0shot 22 | dataset: 23 | sources: 24 | - name: the-stack-v2-top15-ai2v0-minhash-10pct-superbpe 25 | target_ratio: 0.85 26 | paths: 27 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/love2code-top15-minhash-plpartition-10pct/allenai/superbpe-olmo3-experimental/**/*.npy 28 | - name: dclm-codeprose-v2-superbpe 29 | target_ratio: 0.15 30 | paths: 31 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/dclm-prose-v2/all/allenai/superbpe-olmo3-experimental/**/*.npy 32 | -------------------------------------------------------------------------------- /src/cookbook/recipes/spring2code/scaling/spring2code-1b-5xC-weka-top15-bpb-hlr-superbpe.yaml: -------------------------------------------------------------------------------- 1 | name: "spring2code-1b-5xC-top15-hlr-superbpe" 2 | description: "learn2code 1b@5xC top15 languages from the-stack-v2 + dclm prose v2" 3 | budget: "ai2/oe-training" 4 | workspace: "ai2/oe-data" 5 | nodes: 4 6 | gpus: 8 7 | preemptible: true 8 | max_tokens: 113_184_153_600 9 | rank_microbatch_size: 16_384 # Larger tokens require more memory, so we use a smaller micro batch size.
10 | load_path: /weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/spring2code-1b-5xC-top15-hlr-superbpe-2e15a7da/step13900/ 11 | sequence_length: 2048 12 | seed: 1337 13 | model: "olmo2_1B_v2" 14 | tokenizer: "superbpe_experimental" 15 | priority: high 16 | eval_interval: 250 17 | learning_rate: 1.8e-3 18 | cluster: ai2/jupiter-cirrascale-2 19 | weka: true 20 | downstream_evaluators: 21 | - codex_humaneval_gold_bpb_0shot 22 | - codex_mbpp_gold_bpb_0shot 23 | dataset: 24 | sources: 25 | - name: the-stack-v2-top15-ai2v0-minhash-10pct-superbpe 26 | target_ratio: 0.85 27 | paths: 28 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/love2code-top15-minhash-plpartition-10pct/allenai/superbpe-olmo3-experimental/**/*.npy 29 | - name: dclm-codeprose-v2-superbpe 30 | target_ratio: 0.15 31 | paths: 32 | - weka://oe-training-default/ai2-llm/preprocessed/learn2code/dclm-prose-v2/all/allenai/superbpe-olmo3-experimental/**/*.npy 33 | -------------------------------------------------------------------------------- /src/cookbook/remote/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/remote/__init__.py -------------------------------------------------------------------------------- /src/cookbook/remote/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | from tempfile import TemporaryDirectory 5 | from typing import Any, Generator 6 | 7 | from cookbook.cli.utils import PythonEnv 8 | 9 | from .base import BaseAuthentication, LocatedPath 10 | from .gantry_launcher import GantryLauncher 11 | 12 | 13 | def copy_prefix( 14 | src_path: str, 15 | dst_path: str, 16 | src_credentials: BaseAuthentication | None = None, 17 | dst_credentials: BaseAuthentication | None = None, 18 | *args: Any, 19 | **kwargs: Any, 20 | ): 21 | src_loc = LocatedPath.from_str(src_path) 22 | dst_loc = LocatedPath.from_str(dst_path) 23 | 24 | if src_loc.prot == "gs": 25 | if dst_loc.prot in ("weka", "file"): 26 | from .gcp import GoogleCloudToken, download_gcs_prefix 27 | 28 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 29 | download_gcs_prefix(src_loc.remote, dst_loc.local, credentials=src_credentials, *args, **kwargs) 30 | elif dst_loc.prot == "gs": 31 | from .gcp import GoogleCloudToken, download_gcs_prefix, upload_gcs_prefix 32 | 33 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 34 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 35 | with TemporaryDirectory() as tmp_dir: 36 | download_gcs_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 37 | upload_gcs_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 38 | 39 | elif dst_loc.prot == "s3": 40 | from .aws import AwsCredentials, upload_s3_prefix 41 | from .gcp import GoogleCloudToken, download_gcs_prefix 42 | 43 | assert src_credentials is None or isinstance(src_credentials, GoogleCloudToken) 44 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 45 | with TemporaryDirectory() as tmp_dir: 46 | download_gcs_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 47 | upload_s3_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 48 | 49 | elif src_loc.prot == "s3": 50 | if dst_loc.prot in 
("weka", "file"): 51 | from .aws import AwsCredentials, download_s3_prefix 52 | 53 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 54 | download_s3_prefix(src_loc.remote, dst_loc.local, credentials=src_credentials, *args, **kwargs) 55 | elif dst_loc.prot == "s3": 56 | from .aws import AwsCredentials, download_s3_prefix, upload_s3_prefix 57 | 58 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 59 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 60 | with TemporaryDirectory() as tmp_dir: 61 | download_s3_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 62 | upload_s3_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 63 | elif dst_loc.prot == "gs": 64 | from .aws import AwsCredentials, download_s3_prefix 65 | from .gcp import GoogleCloudToken, upload_gcs_prefix 66 | 67 | assert src_credentials is None or isinstance(src_credentials, AwsCredentials) 68 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 69 | with TemporaryDirectory() as tmp_dir: 70 | download_s3_prefix(src_loc.remote, tmp_dir, credentials=src_credentials, *args, **kwargs) 71 | upload_gcs_prefix(tmp_dir, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 72 | 73 | elif src_loc.prot in ("weka", "file"): 74 | if dst_loc.prot in ("weka", "file"): 75 | # local copy 76 | shutil.copytree(src_loc.local, dst_loc.local) 77 | elif dst_loc.prot == "gs": 78 | from .gcp import GoogleCloudToken, upload_gcs_prefix 79 | 80 | assert dst_credentials is None or isinstance(dst_credentials, GoogleCloudToken) 81 | upload_gcs_prefix(src_loc.local, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 82 | elif dst_loc.prot == "s3": 83 | from .aws import AwsCredentials, upload_s3_prefix 84 | 85 | assert dst_credentials is None or isinstance(dst_credentials, AwsCredentials) 86 | upload_s3_prefix(src_loc.local, dst_loc.remote, credentials=dst_credentials, *args, **kwargs) 87 | 88 | else: 89 | raise ValueError(f"{src_loc.prot.upper()} -> {dst_loc.prot.upper()}: not recognized") 90 | 91 | 92 | def push_credentials(gantry_launcher: GantryLauncher, *paths: str): 93 | for path in paths: 94 | loc = LocatedPath.from_str(path) 95 | if loc.prot == "gs": 96 | from .gcp import GoogleCloudToken 97 | 98 | gct = GoogleCloudToken.make() 99 | gantry_launcher.add_env_secret(f"COOKBOOK_AUTH_{loc.hash[:6]}", gct.to_json(), overwrite=True) 100 | elif loc.prot == "s3": 101 | from .aws import AwsCredentials 102 | 103 | aws_creds = AwsCredentials.make() 104 | gantry_launcher.add_env_secret(f"COOKBOOK_AUTH_{loc.hash[:6]}", aws_creds.to_json(), overwrite=True) 105 | 106 | 107 | def pull_credentials(*paths: str) -> Generator[BaseAuthentication | None, Any, Any]: 108 | for path in paths: 109 | loc = LocatedPath.from_str(path) 110 | if loc.prot == "gs": 111 | from .gcp import GoogleCloudToken 112 | 113 | yield GoogleCloudToken.from_json(os.environ[f"COOKBOOK_AUTH_{loc.hash[:6]}"]) 114 | elif loc.prot == "s3": 115 | from .aws import AwsCredentials 116 | 117 | yield AwsCredentials.from_json(os.environ[f"COOKBOOK_AUTH_{loc.hash[:6]}"]) 118 | else: 119 | yield None 120 | 121 | 122 | def main(): 123 | parser = argparse.ArgumentParser("Move prefixes between storage systems") 124 | parser.add_argument("src_path", type=str, help="Source path") 125 | parser.add_argument("dst_path", type=str, help="Destination path") 126 | parser.add_argument("--num-workers", type=int, default=10, help="Number 
of workers") 127 | parser.add_argument("--google-cloud-token", type=str, default=None, help="Google Cloud token") 128 | parser.add_argument("--allow-dirty", action="store_true", help="Allow dirty operations") 129 | parser.add_argument("--budget", type=str, default="ai2/oe-base", help="Budget") 130 | parser.add_argument("--cluster", type=str, default="aus", help="Clusters to run on") 131 | parser.add_argument("--dry-run", action="store_true", help="Dry run") 132 | parser.add_argument("--gpus", type=int, default=0, help="Number of GPUs") 133 | parser.add_argument("--priority", type=str, default="high", help="Priority") 134 | parser.add_argument("--preemptible", action="store_true", help="Preemptible") 135 | parser.add_argument("--workspace", type=str, default="ai2/oe-data", help="Workspace") 136 | parser.add_argument("--local-only", action="store_true", help="Local only") 137 | parser.add_argument( 138 | "--credentials_env_name", type=str, default="COOKBOOK_REMOTE_CREDENTIALS", help="Credentials env name" 139 | ) 140 | args = parser.parse_args() 141 | 142 | if os.environ.get("BEAKER_EXPERIMENT_ID") or args.local_only: 143 | # only pull credentials if running on beaker 144 | source_credentials, destination_credentials = ( 145 | pull_credentials(args.src_path, args.dst_path) if not args.local_only else (None, None) 146 | ) 147 | 148 | copy_prefix( 149 | src_path=args.src_path, 150 | dst_path=args.dst_path, 151 | src_credentials=source_credentials, 152 | dst_credentials=destination_credentials, 153 | num_workers=args.num_workers, 154 | ) 155 | 156 | else: 157 | # running locally, submit to beaker 158 | env = PythonEnv.create("copy-prefix") 159 | bw = GantryLauncher( 160 | allow_dirty=args.allow_dirty, 161 | budget=args.budget, 162 | cluster=args.cluster, 163 | dry_run=args.dry_run, 164 | gpus=args.gpus, 165 | priority=args.priority, 166 | preemptible=args.preemptible, 167 | workspace=args.workspace, 168 | env=env, 169 | ) 170 | 171 | # adds mount if necessary 172 | bw.add_mount(args.src_path) 173 | bw.add_mount(args.dst_path) 174 | 175 | push_credentials(bw, args.src_path, args.dst_path) 176 | 177 | bw.run( 178 | command=f"python -m cookbook.remote '{args.src_path}' '{args.dst_path}'", 179 | description=f"Copying {args.src_path} to {args.dst_path}", 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | main() 185 | -------------------------------------------------------------------------------- /src/cookbook/remote/aws.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from typing import TYPE_CHECKING 6 | from urllib.parse import urlparse 7 | 8 | import boto3 9 | from tqdm import tqdm 10 | 11 | from ..cli.utils import get_aws_access_key_id, get_aws_secret_access_key 12 | from .base import AuthenticationError, BaseAuthentication 13 | 14 | if TYPE_CHECKING: 15 | from mypy_boto3_s3.client import S3Client 16 | 17 | 18 | @dataclass(frozen=True) 19 | class AwsCredentials(BaseAuthentication): 20 | access_key_id: str 21 | secret_access_key: str 22 | 23 | @classmethod 24 | def make(cls) -> "AwsCredentials": 25 | access_key_id = get_aws_access_key_id() 26 | secret_access_key = get_aws_secret_access_key() 27 | if access_key_id is None or secret_access_key is None: 28 | raise AuthenticationError("No AWS credentials found") 29 | return cls(access_key_id=access_key_id, secret_access_key=secret_access_key) 30 | 31 | def 
apply(self) -> boto3.Session: 32 | """Apply the credentials so that it can be used for remote operations.""" 33 | return boto3.Session(aws_access_key_id=self.access_key_id, aws_secret_access_key=self.secret_access_key) 34 | 35 | 36 | def list_objects_with_paginator(bucket_name: str, prefix: str, client: "S3Client"): 37 | """ 38 | List all objects in an S3 bucket using boto3's paginator. 39 | This automatically handles pagination for you. 40 | """ 41 | # Create a paginator for list_objects_v2 42 | paginator = client.get_paginator("list_objects_v2") 43 | 44 | # Configure the pagination parameters 45 | page_iterator = paginator.paginate( 46 | Bucket=bucket_name, 47 | Prefix=prefix, 48 | PaginationConfig={ 49 | "MaxItems": None, # Return all items 50 | "PageSize": 1000, # Number of items per page (max 1000) 51 | }, 52 | ) 53 | 54 | # Iterate through all pages 55 | for page in page_iterator: 56 | if "Contents" in page: 57 | for obj in page["Contents"]: 58 | yield bucket_name, obj["Key"] 59 | 60 | 61 | def download_s3_prefix( 62 | remote_path: str, 63 | local_path: str | Path, 64 | session: boto3.Session | None = None, 65 | num_workers: int | None = None, 66 | credentials: AwsCredentials | None = None, 67 | ): 68 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 69 | assert protocol.startswith("s3"), "Only S3 and S3A protocols are supported" 70 | 71 | client = (credentials.apply() if credentials else (session or boto3.Session())).client("s3") 72 | 73 | # Create a local directory if it doesn't exist 74 | local_path = Path(local_path) 75 | futures = [] 76 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 77 | for bucket, key in list_objects_with_paginator(bucket_name, prefix, client): 78 | local_file_path = local_path / Path(key).relative_to(Path(prefix)) 79 | 80 | def _download_file( 81 | _bucket: str, 82 | _key: str, 83 | _local_file_path: Path, 84 | _client: "S3Client", 85 | ): 86 | _local_file_path.parent.mkdir(parents=True, exist_ok=True) 87 | _client.download_file(_bucket, _key, str(_local_file_path)) 88 | 89 | futures.append( 90 | executor.submit( 91 | _download_file, 92 | _bucket=bucket, 93 | _key=key, 94 | _local_file_path=local_file_path, 95 | _client=client, 96 | ) 97 | ) 98 | 99 | for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading prefix"): 100 | try: 101 | future.result() 102 | except Exception as e: 103 | for future_to_cancel in futures: 104 | future_to_cancel.cancel() 105 | raise e 106 | 107 | 108 | def upload_s3_prefix( 109 | local_path: str | Path, 110 | remote_path: str, 111 | session: boto3.Session | None = None, 112 | num_workers: int | None = None, 113 | credentials: AwsCredentials | None = None, 114 | ): 115 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 116 | assert protocol.startswith("s3"), "Only S3 and S3A protocols are supported" 117 | 118 | client = (credentials.apply() if credentials else (session or boto3.Session())).client("s3") 119 | local_path = Path(local_path).absolute() 120 | 121 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 122 | futures = [] 123 | for dp, _, files in os.walk(str(local_path)): 124 | for fp_str in files: 125 | fp = Path(dp) / fp_str 126 | if not fp.is_file(): 127 | continue 128 | 129 | def _upload_file( 130 | _fp: Path, 131 | _bucket: str, 132 | _key: str, 133 | _client: "S3Client", 134 | ): 135 | _client.upload_file(str(_fp), _bucket, _key) 136 | 137 | futures.append( 138 | 
executor.submit( 139 | _upload_file, 140 | _fp=fp, 141 | _bucket=bucket_name, 142 | _key=f"{prefix}/{fp.relative_to(local_path)}", 143 | _client=client, 144 | ) 145 | ) 146 | 147 | for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading prefix"): 148 | try: 149 | future.result() 150 | except Exception as e: 151 | for future_to_cancel in futures: 152 | future_to_cancel.cancel() 153 | raise e 154 | -------------------------------------------------------------------------------- /src/cookbook/remote/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import asdict, dataclass 3 | from hashlib import md5 4 | from pathlib import Path 5 | from typing import Any, Generic, Literal, Optional, TypeAlias, TypeVar, Union 6 | from urllib.parse import urlparse 7 | 8 | from typing_extensions import Self 9 | 10 | from cookbook.constants import WEKA_MOUNTS 11 | 12 | JSON_VALID_TYPES: TypeAlias = Union[str, int, float, bool, list, dict] 13 | 14 | 15 | C = TypeVar("C") 16 | 17 | 18 | @dataclass(frozen=True) 19 | class BaseAuthentication(Generic[C]): 20 | """Base class for all remote authentication classes.""" 21 | 22 | @classmethod 23 | def from_dict(cls, obj: dict[str, JSON_VALID_TYPES]) -> "Self": 24 | """Convert a dictionary to a BaseAuthentication instance.""" 25 | return cls(**obj) 26 | 27 | @classmethod 28 | def _check_dict_types(cls, obj: dict[str, JSON_VALID_TYPES]) -> None: 29 | """Check if the dictionary contains only valid types.""" 30 | for key, value in obj.items(): 31 | if not isinstance(key, str): 32 | raise ValueError(f"Invalid key type: {key!r} (expected str)") 33 | if not isinstance(value, JSON_VALID_TYPES): 34 | raise ValueError(f"Invalid value type: {value!r} (expected {JSON_VALID_TYPES})") 35 | if isinstance(value, dict): 36 | cls._check_dict_types(value) 37 | 38 | def to_dict(self) -> dict[str, JSON_VALID_TYPES]: 39 | """Convert a BaseAuthentication instance to a dictionary.""" 40 | self._check_dict_types(obj := asdict(self)) 41 | return obj 42 | 43 | @classmethod 44 | def from_json(cls, obj: str) -> "Self": 45 | """Convert a JSON string to a BaseAuthentication instance.""" 46 | obj = json.loads(obj) 47 | if not isinstance(obj, dict): 48 | raise ValueError(f"Invalid JSON object: {obj}") 49 | return cls.from_dict(obj) 50 | 51 | def to_json(self) -> str: 52 | """Convert a BaseAuthentication instance to a JSON string.""" 53 | return json.dumps(self.to_dict()) 54 | 55 | @classmethod 56 | def make(cls) -> "Self": 57 | """Create a new credentials instance to be used for remote operations.""" 58 | raise NotImplementedError("Subclasses must implement this method") 59 | 60 | def apply(self, *args: Any, **kwargs: Any) -> C: 61 | """Apply the credentials so that it can be used for remote operations.""" 62 | raise NotImplementedError("Subclasses must implement this method") 63 | 64 | 65 | class AuthenticationError(RuntimeError): 66 | """Error raised when authentication fails.""" 67 | 68 | ... 
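# A minimal sketch (hedged, not part of the public API) of how the pieces above
# fit together for a concrete subclass such as AwsCredentials in aws.py:
#
#     creds = AwsCredentials.make()             # read credentials from the local env
#     payload = creds.to_json()                 # JSON string, safe to ship as a secret
#     restored = AwsCredentials.from_json(payload)
#     client = restored.apply().client("s3")    # boto3.Session -> S3 client
#
# make() and apply() are the two methods every subclass must implement.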
69 | 70 | 71 | @dataclass(frozen=True) 72 | class LocatedPath: 73 | prot: Literal["gs", "s3", "weka", "file"] 74 | path: str 75 | 76 | @property 77 | def hash(self) -> str: 78 | h = md5() 79 | h.update(self.prot.encode()) 80 | h.update(self.path.encode()) 81 | return h.hexdigest() 82 | 83 | @classmethod 84 | def weka_path(cls, path: str | Path) -> Optional["Self"]: 85 | parsed = urlparse(str(path)) 86 | termination = "/" if str(path).endswith("/") else "" 87 | 88 | if parsed.scheme == "weka": 89 | if parsed.netloc not in WEKA_MOUNTS: 90 | raise ValueError(f"Invalid Weka bucket: {parsed.netloc}") 91 | return cls(prot="weka", path=f"/{parsed.netloc.strip('/')}/{parsed.path.lstrip('/')}{termination}") 92 | 93 | # the first part is usually '/' 94 | _, *parts = Path(path).parts 95 | 96 | if parts[0] in WEKA_MOUNTS: 97 | return cls(prot="weka", path="/" + "/".join(parts).strip("/") + termination) 98 | elif parts[0] == "weka" and parts[1] in WEKA_MOUNTS: 99 | return cls(prot="weka", path="/" + "/".join(parts[1:]).strip("/") + termination) 100 | 101 | return None 102 | 103 | @classmethod 104 | def local_path(cls, path: str | Path) -> Optional["Self"]: 105 | parsed = urlparse(str(path)) 106 | termination = "/" if str(path).endswith("/") else "" 107 | if parsed.scheme == "file": 108 | return cls(prot="file", path=f"/{parsed.netloc.strip('/')}/{parsed.path.lstrip('/')}{termination}") 109 | 110 | if parsed.scheme == "": 111 | return cls(prot="file", path=str(path)) 112 | 113 | return None 114 | 115 | @classmethod 116 | def s3_path(cls, path: str | Path) -> Optional["Self"]: 117 | parsed = urlparse(str(path)) 118 | if parsed.scheme.startswith("s3"): 119 | return cls(prot="s3", path=parsed.netloc.strip("/") + "/" + parsed.path.lstrip("/")) 120 | return None 121 | 122 | @classmethod 123 | def gcs_path(cls, path: str | Path) -> Optional["Self"]: 124 | parsed = urlparse(str(path)) 125 | if parsed.scheme in ("gs", "gcs"): 126 | return cls(prot="gs", path=parsed.netloc.strip("/") + "/" + parsed.path.lstrip("/")) 127 | return None 128 | 129 | @classmethod 130 | def from_str(cls, path: str | Path) -> "Self": 131 | if p := cls.weka_path(path): 132 | return p 133 | elif p := cls.local_path(path): 134 | return p 135 | elif p := cls.s3_path(path): 136 | return p 137 | elif p := cls.gcs_path(path): 138 | return p 139 | raise ValueError(f"Invalid path: {path}") 140 | 141 | @property 142 | def local(self) -> Path: 143 | if self.prot in ("weka", "file"): 144 | return Path(self.path) 145 | 146 | raise ValueError(f"Path is not local: {self.path}") 147 | 148 | @property 149 | def remote(self) -> str: 150 | if self.prot == "file": 151 | raise ValueError(f"Path is not remote: {self.path}") 152 | return f"{self.prot}://{self.path.lstrip('/')}" 153 | 154 | @property 155 | def bucket(self) -> str: 156 | remote = self.remote 157 | url = urlparse(remote) 158 | return url.netloc 159 | 160 | @property 161 | def prefix(self) -> str: 162 | remote = self.remote 163 | url = urlparse(remote) 164 | return url.path.lstrip("/") 165 | -------------------------------------------------------------------------------- /src/cookbook/remote/gantry_launcher.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess 3 | from dataclasses import InitVar, dataclass 4 | 5 | from cookbook.cli.utils import ( 6 | PythonEnv, 7 | add_secret_to_beaker_workspace, 8 | install_beaker_py, 9 | ) 10 | from cookbook.utils.clusters import get_matching_clusters 11 | 12 | from .base import LocatedPath 
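# Hedged usage sketch, mirroring how cookbook.remote.__main__ drives this
# launcher; the values below are illustrative, not enforced defaults:
#
#     bw = GantryLauncher(
#         allow_dirty=False, budget="ai2/oe-base", cluster="aus", dry_run=False,
#         gpus=0, priority="high", preemptible=True, workspace="ai2/oe-data",
#     )
#     bw.add_mount("weka://oe-training-default/some/prefix")  # hypothetical path
#     bw.run(command="python -m cookbook.remote ...", description="copy prefix")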
13 | 14 | 15 | @dataclass 16 | class GantryLauncher: 17 | allow_dirty: bool 18 | budget: str 19 | cluster: str 20 | dry_run: bool 21 | gpus: int 22 | priority: str 23 | preemptible: bool 24 | workspace: str 25 | env: InitVar[PythonEnv | None] = None 26 | 27 | def __post_init__(self, env: PythonEnv | None): 28 | self._env = env or PythonEnv.null() 29 | self._flags = [] 30 | 31 | # setup beaker-py 32 | install_beaker_py(env=self._env) 33 | 34 | for cluster in set(get_matching_clusters(self.cluster)): 35 | self._flags.append(f"--cluster {cluster}") 36 | 37 | def add_mount(self, path: str): 38 | located_path = LocatedPath.from_str(path) 39 | if located_path.prot == "weka": 40 | self._flags.append(f"--weka {located_path.bucket}:/{located_path.bucket}") 41 | 42 | def add_env_secret(self, key: str, value: str, overwrite: bool = False): 43 | secret_name = add_secret_to_beaker_workspace( 44 | secret_name=key, 45 | secret_value=value, 46 | workspace=self.workspace, 47 | env=self._env, # pyright: ignore 48 | overwrite=overwrite, 49 | ) 50 | self._flags.append(f"--env-secret {key}={secret_name}") 51 | 52 | def run( 53 | self, 54 | command: str, 55 | description: str, 56 | extra_flags: dict[str, str] | None = None, 57 | ) -> subprocess.CompletedProcess: 58 | 59 | extra_flags = extra_flags or {} 60 | 61 | gantry_command = [ 62 | "gantry run", 63 | f"--description '{description}'", 64 | ("--allow-dirty" if self.allow_dirty else ""), 65 | "--no-python", 66 | f"--workspace {self.workspace}", 67 | f"--priority {self.priority}", 68 | f"--gpus {self.gpus}", 69 | ("--preemptible" if self.preemptible else ""), 70 | f"--budget {self.budget}", 71 | "--yes", 72 | ("--dry-run" if self.dry_run else ""), 73 | " ".join(self._flags), 74 | " ".join(f"--{k} {v}" for k, v in extra_flags.items()), 75 | f"-- /bin/bash -c 'pip install uv && uv pip install . 
--system && {command}'", 76 | ] 77 | gantry_command_str = " ".join(gantry_command) 78 | 79 | print(f"Submitting to beaker with command: {gantry_command_str}") 80 | return subprocess.run(shlex.split(gantry_command_str), check=True, env=self._env.path()) 81 | -------------------------------------------------------------------------------- /src/cookbook/remote/gcp.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from urllib.parse import urlparse 7 | 8 | from google.auth import default 9 | from google.auth.transport.requests import Request 10 | from google.cloud import storage 11 | from google.oauth2.credentials import Credentials 12 | from tqdm import tqdm 13 | 14 | from .base import JSON_VALID_TYPES, AuthenticationError, BaseAuthentication 15 | 16 | 17 | @dataclass(frozen=True) 18 | class GoogleCloudToken(BaseAuthentication): 19 | token: str 20 | project_id: str 21 | expiry: datetime.datetime | None 22 | 23 | @classmethod 24 | def from_dict(cls, obj: dict[str, JSON_VALID_TYPES]) -> "GoogleCloudToken": 25 | parsed_obj = { 26 | "token": obj["token"], 27 | "expiry": datetime.datetime.fromisoformat(e) if isinstance(e := obj.get("expiry", None), str) else e, 28 | "project_id": obj["project_id"], 29 | } 30 | return super().from_dict(parsed_obj) 31 | 32 | def to_dict(self) -> dict[str, JSON_VALID_TYPES]: 33 | obj = { 34 | "token": self.token, 35 | "expiry": self.expiry.isoformat() if self.expiry else None, 36 | "project_id": self.project_id, 37 | } 38 | return obj 39 | 40 | @classmethod 41 | def make(cls) -> "GoogleCloudToken": 42 | """Generate a short-lived token for GCS access.""" 43 | credentials, project_id = default() 44 | if not credentials.valid: # pyright: ignore 45 | credentials.refresh(Request()) # pyright: ignore 46 | 47 | return cls(token=credentials.token, project_id=project_id, expiry=credentials.expiry) # pyright: ignore 48 | 49 | def apply(self) -> storage.Client: 50 | """Apply the credentials so that it can be used for remote operations.""" 51 | 52 | if self.expiry is not None and datetime.datetime.utcnow() > self.expiry: # google-auth expiry is naive UTC 53 | raise AuthenticationError("Token expired!") 54 | 55 | credentials = Credentials(self.token) 56 | return storage.Client(credentials=credentials, project=self.project_id) 57 | 58 | 59 | def download_gcs_prefix( 60 | remote_path: str, 61 | local_path: str | Path, 62 | client: storage.Client | None = None, 63 | num_workers: int | None = None, 64 | credentials: GoogleCloudToken | None = None, 65 | ): 66 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 67 | assert protocol in ("gs", "gcs"), "Only GCS and GS protocols are supported" 68 | 69 | client = credentials.apply() if credentials else (client or storage.Client()) 70 | 71 | local_path = Path(local_path) 72 | blobs = client.list_blobs(bucket_name, prefix=prefix) 73 | futures = [] 74 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 75 | for blob in blobs: 76 | local_file_path = local_path / Path(blob.name).relative_to(Path(prefix)) 77 | 78 | def _download_file( 79 | _blob: storage.Blob, 80 | _local_file_path: Path, 81 | _expiration_time: datetime.datetime | None, 82 | ): 83 | if _expiration_time is not None and datetime.datetime.utcnow() > _expiration_time: # naive UTC, as above 84 | raise RuntimeError("Token expired!") 85 | 86 | _local_file_path.parent.mkdir(parents=True,
exist_ok=True) 87 | _blob.download_to_filename(str(_local_file_path)) 88 | 89 | futures.append( 90 | executor.submit( 91 | _download_file, 92 | _blob=blob, 93 | _local_file_path=local_file_path, 94 | _expiration_time=credentials.expiry if credentials else None, 95 | ) 96 | ) 97 | 98 | for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading prefix"): 99 | try: 100 | future.result() 101 | except Exception as e: 102 | for future_to_cancel in futures: 103 | future_to_cancel.cancel() 104 | raise e 105 | 106 | 107 | def upload_gcs_prefix( 108 | local_path: str | Path, 109 | remote_path: str, 110 | client: storage.Client | None = None, 111 | num_workers: int | None = None, 112 | credentials: GoogleCloudToken | None = None, 113 | ): 114 | protocol, bucket_name, prefix = (p := urlparse(remote_path)).scheme, p.netloc, p.path.lstrip("/") 115 | assert protocol in ("gs", "gcs"), "Only GCS and GS protocols are supported" 116 | 117 | client = credentials.apply() if credentials else (client or storage.Client()) 118 | local_path = Path(local_path).absolute() 119 | 120 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 121 | futures = [] 122 | for dp, _, files in os.walk(str(local_path)): 123 | for fp_str in files: 124 | fp = Path(dp) / fp_str 125 | if not fp.is_file(): 126 | continue 127 | 128 | bucket = client.bucket(bucket_name) 129 | blob = bucket.blob(f"{prefix}/{fp.relative_to(local_path)}") 130 | 131 | def _upload_file( 132 | _fp: Path, 133 | _blob: storage.Blob, 134 | _expiration_time: datetime.datetime | None, 135 | ): 136 | if _expiration_time is not None and datetime.datetime.utcnow() > _expiration_time: # naive UTC, as above 137 | raise RuntimeError("Token expired!") 138 | 139 | _blob.upload_from_filename(str(_fp)) 140 | 141 | futures.append( 142 | executor.submit( 143 | _upload_file, 144 | _fp=fp, 145 | _blob=blob, 146 | _expiration_time=credentials.expiry if credentials else None, 147 | ) 148 | ) 149 | 150 | for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading prefix"): 151 | try: 152 | future.result() 153 | except Exception as e: 154 | for future_to_cancel in futures: 155 | future_to_cancel.cancel() 156 | raise e 157 | -------------------------------------------------------------------------------- /src/cookbook/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import click 5 | from olmo_core.train import prepare_training_environment, teardown_training_environment 6 | from torch.distributed.elastic.multiprocessing.errors import record 7 | 8 | from cookbook.utils.config import build_train_config 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @click.group() 14 | def cli(): 15 | pass 16 | 17 | 18 | @cli.command() 19 | @click.option( 20 | "--run-name", 21 | "-n", 22 | type=str, 23 | help="Name of the run", 24 | required=True, 25 | ) 26 | @click.option( 27 | "--group-id", 28 | "-g", 29 | type=str, 30 | help="Group ID for the experiment", 31 | ) 32 | @click.option( 33 | "--beaker-user", 34 | "-u", 35 | type=str, 36 | help="Beaker user", 37 | ) 38 | @click.option( 39 | "--config-path", 40 | "-C", 41 | type=click.Path(exists=True), 42 | required=True, 43 | help="Relative path to the experiment configuration file.", 44 | ) 45 | @record 46 | def train( 47 | run_name: str, 48 | group_id: str, 49 | beaker_user: str, 50 | config_path: Path, 51 | ): 52 | trainer = build_train_config(config_path, run_name, group_id, beaker_user) 53 | 54 | if trainer is None: 55
| logger.error("Failed to build training config! Exiting...") 56 | raise click.Abort() 57 | 58 | trainer.fit() 59 | 60 | 61 | if __name__ == "__main__": 62 | try: 63 | prepare_training_environment() 64 | cli() 65 | finally: 66 | teardown_training_environment() 67 | -------------------------------------------------------------------------------- /src/cookbook/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/src/cookbook/utils/__init__.py -------------------------------------------------------------------------------- /src/cookbook/utils/clusters.py: -------------------------------------------------------------------------------- 1 | from cookbook.constants import BEAKER_KNOWN_CLUSTERS, NEW_CLUSTER_ALIASES 2 | 3 | 4 | def get_matching_clusters(cluster: str) -> list[str]: 5 | """ 6 | Resolve a cluster alias to the actual Beaker cluster name(s); known aliases 7 | expand to their canonical names, and unknown names are returned as-is. 8 | """ 9 | if cluster in NEW_CLUSTER_ALIASES: 10 | cluster = NEW_CLUSTER_ALIASES[cluster] 11 | 12 | if cluster in BEAKER_KNOWN_CLUSTERS: 13 | return BEAKER_KNOWN_CLUSTERS[cluster] 14 | 15 | return [cluster] 16 | 17 | 18 | def is_gcs_cluster(cluster: str) -> bool: 19 | """ 20 | Check whether a cluster has native GCS support, meaning jobs running on it can 21 | reach GCS without Google credentials being pushed to them. 22 | """ 23 | 24 | canonical_names = get_matching_clusters(cluster) 25 | 26 | if all(cluster_name in BEAKER_KNOWN_CLUSTERS["goog"] for cluster_name in canonical_names): 27 | return True 28 | 29 | return False 30 | 31 | 32 | def get_known_clusters() -> list[str]: 33 | """ 34 | Return all clusters known to OLMo Cookbook.
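Canonical names are flattened from BEAKER_KNOWN_CLUSTERS, de-duplicated, and returned in sorted order.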
35 | """ 36 | all_clusters = [c for cs in BEAKER_KNOWN_CLUSTERS.values() for c in cs] 37 | return sorted(set(all_clusters)) 38 | -------------------------------------------------------------------------------- /src/cookbook/utils/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import os 4 | from pathlib import Path 5 | from typing import List, Tuple, Union, cast 6 | from urllib.parse import urlparse 7 | 8 | import gcsfs 9 | import s3fs 10 | import yaml 11 | from olmo_core.io import normalize_path 12 | from olmo_core.launch.beaker import ( 13 | BeakerEnvSecret, 14 | BeakerEnvVar, 15 | BeakerLaunchConfig, 16 | BeakerWekaBucket, 17 | ) 18 | from olmo_core.train.callbacks import ConfigSaverCallback, WandBCallback 19 | from olmo_core.utils import seed_all 20 | 21 | from cookbook.aliases import ( 22 | ExperimentConfig, 23 | ExperimentGroup, 24 | ExperimentInstance, 25 | SourceConfig, 26 | SourceInstance, 27 | ) 28 | from cookbook.model.builder import TransformerConfigBuilder 29 | from cookbook.utils.data import normalize_source_paths 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | def config_from_path(config: Path) -> ExperimentConfig: 35 | with open(config, "r") as f: 36 | data = yaml.safe_load(f) 37 | 38 | return ExperimentConfig(**data, path=config) 39 | 40 | 41 | def mk_source_instances( 42 | sources: list[SourceConfig], priors: Tuple[dict[str, float], int] | None = None 43 | ) -> list[SourceInstance]: 44 | if priors: 45 | ratios_by_source, total_tokens = priors 46 | else: 47 | ratios_by_source = {} 48 | 49 | instances = [] 50 | for source in sources: 51 | ratio = source.target_ratio or ratios_by_source[source.name] 52 | instances.append( 53 | SourceInstance( 54 | name=source.name, 55 | paths=source.paths, 56 | ratio=ratio, 57 | repetition_factor=source.repetition_factor, 58 | ) 59 | ) 60 | 61 | return instances 62 | 63 | 64 | def mk_experiments( 65 | config: ExperimentConfig, group_id: str, priors: Tuple[dict[str, float], int] 66 | ) -> list[ExperimentInstance]: 67 | """Generate source instances from a config.""" 68 | return [ 69 | ExperimentInstance( 70 | name=f"{config.name}-{group_id}", 71 | sources=mk_source_instances(config.dataset.sources, priors), 72 | ) 73 | ] 74 | 75 | 76 | def mk_experiment_group( 77 | config: ExperimentConfig, priors: Tuple[dict[str, float], int], group_id: str 78 | ) -> ExperimentGroup: 79 | """Build an experiment group from an experiment config.""" 80 | 81 | return ExperimentGroup( 82 | config=config, 83 | group_id=group_id, 84 | instances=mk_experiments(config, group_id, priors), 85 | ) 86 | 87 | 88 | def mk_instance_cmd( 89 | instance: ExperimentInstance, config: ExperimentConfig, group_id: str, beaker_user: str 90 | ) -> List[str]: 91 | """Build a command for launching an experiment instance.""" 92 | 93 | return [ 94 | "src/cookbook/train.py", 95 | "train", 96 | "-n", 97 | instance.name, 98 | "-g", 99 | group_id, 100 | "-u", 101 | beaker_user, 102 | "-C", 103 | str(config.path), 104 | ] 105 | 106 | 107 | _REMOTE_FS_CACHE: dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] | None = None 108 | 109 | 110 | def remote_fs_cache() -> dict[str, Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]]: 111 | global _REMOTE_FS_CACHE 112 | if _REMOTE_FS_CACHE is not None: 113 | return _REMOTE_FS_CACHE 114 | 115 | _REMOTE_FS_CACHE = dict( 116 | s3=s3fs.S3FileSystem(), 117 | weka=s3fs.S3FileSystem(client_kwargs={"endpoint_url": os.environ["WEKA_ENDPOINT_URL"]}, profile="WEKA"), 
118 | gs=gcsfs.GCSFileSystem(), 119 | ) 120 | 121 | return _REMOTE_FS_CACHE 122 | 123 | 124 | def build_train_config(config_path: Path, run_name: str, group_id: str, beaker_user: str, dry_run: bool = False): 125 | """ 126 | Launch a training run with the given parameters. 127 | """ 128 | 129 | base_config = config_from_path(config_path) 130 | load_path_fs = None 131 | 132 | if dry_run: 133 | source_paths = base_config.dataset.sources 134 | if base_config.load_path: 135 | try: 136 | load_path_fs = remote_fs_cache()[urlparse(base_config.load_path).scheme] 137 | except KeyError: 138 | raise ValueError(f"Unsupported load path scheme: {base_config.load_path}") 139 | 140 | # When we have a weka path locally we need to treat it like a remote s3 141 | # path and strip the special weka prefix and bucket name 142 | base_config.load_path = normalize_path(base_config.load_path.replace("weka://", "s3://")) 143 | 144 | else: 145 | source_paths = normalize_source_paths(base_config.dataset.sources, expand=True) 146 | 147 | if base_config.load_path: 148 | # When we have a weka path remotely on beaker we need to treat it like a local path since the bucket is mounted 149 | base_config.load_path = normalize_path(base_config.load_path.replace("weka://", "/weka/")) 150 | 151 | source_instances = mk_source_instances(source_paths, None) 152 | dp_world_size = base_config.nodes * base_config.gpus 153 | 154 | config = TransformerConfigBuilder( 155 | beaker_user=beaker_user, 156 | cluster=base_config.cluster, 157 | downstream_evaluators=base_config.downstream_evaluators, 158 | dtype=base_config.dataset.dtype, 159 | eval_interval=base_config.eval_interval, 160 | group_id=group_id.strip(), 161 | lm_evaluator=base_config.lm_evaluator, 162 | max_dp_world_size=dp_world_size, 163 | max_target_sequence_length=base_config.max_target_sequence_length, 164 | max_tokens=base_config.max_tokens, 165 | model_identifier=base_config.model, 166 | run_name=run_name.strip(), 167 | save_interval=base_config.save_interval, 168 | seed=base_config.seed, 169 | sequence_length=base_config.sequence_length, 170 | sources=source_instances, 171 | tokenizer=base_config.tokenizer, 172 | metrics_config=base_config.metrics_config, 173 | weka=base_config.weka, 174 | rank_microbatch_size=base_config.rank_microbatch_size, 175 | global_batch_size=base_config.global_batch_size, 176 | load_path=base_config.load_path, 177 | warmup_steps=base_config.warmup_steps, 178 | learning_rate=base_config.learning_rate, 179 | scheduler_type=base_config.scheduler_type, 180 | annealing=base_config.annealing, 181 | hard_stop=base_config.hard_stop, 182 | model_overrides=base_config.model_overrides, 183 | activation_checkpointing=base_config.activation_checkpointing, 184 | load_path_fs=load_path_fs, 185 | ).build() 186 | 187 | seed_all(config.init_seed) 188 | config_dict = config.as_config_dict() 189 | trainer = None 190 | 191 | if not dry_run: 192 | dataset = config.dataset.build() 193 | model = config.model.build(init_device="meta") 194 | train_module = config.train_module.build(model) 195 | data_loader = config.data_loader.build(dataset, dp_process_group=train_module.dp_process_group) 196 | trainer = config.trainer.build(train_module, data_loader) 197 | 198 | # If we have a load path and there is no checkpoint in the save folder, load the checkpoint from the load path. 
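# In other words, a checkpoint already present in the save folder takes precedence over an explicit load_path.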
199 | if not trainer.maybe_load_checkpoint(trainer.save_folder) and base_config.load_path: 200 | logger.info( 201 | f"Loading checkpoint from {base_config.load_path} and load_trainer_state: {base_config.load_state}" 202 | ) 203 | trainer.load_checkpoint(base_config.load_path, load_trainer_state=base_config.load_state) 204 | 205 | cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict 206 | cast(ConfigSaverCallback, trainer.callbacks["config_saver"]).config = config_dict 207 | 208 | logger.info("Configuration:") 209 | # We log estimated step count here when dry_run is enabled because we're not able to build the trainer on non-CUDA devices 210 | if dry_run: 211 | logger.info( 212 | f"Estimated training steps: {math.ceil(base_config.max_tokens / config.data_loader.global_batch_size):,}" 213 | ) 214 | logger.info(config) 215 | 216 | return trainer 217 | 218 | 219 | def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLaunchConfig]: 220 | """Build a beaker launch config from an experiment group.""" 221 | 222 | weka_buckets: List[BeakerWekaBucket] = [] 223 | if group.config.weka: 224 | weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) 225 | 226 | return [ 227 | BeakerLaunchConfig( 228 | name=f"{experiment.name}", 229 | description=group.config.description, 230 | task_name=experiment.name, 231 | cmd=mk_instance_cmd(experiment, group.config, group.group_id, beaker_user), 232 | clusters=[group.config.cluster], 233 | num_nodes=group.config.nodes, 234 | num_gpus=group.config.gpus, 235 | shared_filesystem=group.config.weka, 236 | allow_dirty=True, 237 | weka_buckets=weka_buckets, 238 | budget=group.config.budget or "ai2/oe-base", 239 | workspace=group.config.workspace, 240 | preemptible=group.config.preemptible, 241 | beaker_image="petew/olmo-core-tch270cu128", 242 | priority=group.config.priority, 243 | env_vars=[BeakerEnvVar(name="NCCL_DEBUG", value="INFO" if group.config.nccl_debug else "WARN")], 244 | env_secrets=[ 245 | BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), 246 | BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), 247 | BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), 248 | BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), 249 | BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), 250 | BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), 251 | BeakerEnvSecret(name="GOOGLE_CLOUD_PROJECT", secret="GOOGLE_CLOUD_PROJECT"), 252 | ], 253 | setup_steps=[ 254 | 'git clone "$REPO_URL"', 255 | "conda shell.bash activate base", 256 | "cd olmo-cookbook", 257 | 'git checkout "$GIT_REF"', 258 | "git submodule update --init --recursive", 259 | "pip install -e '.[all]'", 260 | "pip freeze", 261 | # Move AWS credentials from env to relevant files 262 | "mkdir -p ~/.aws", 263 | "printenv AWS_CONFIG > ~/.aws/config", 264 | "printenv AWS_CREDENTIALS > ~/.aws/credentials", 265 | ], 266 | ) 267 | for experiment in group.instances 268 | ] 269 | -------------------------------------------------------------------------------- /src/cookbook/utils/data.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import hashlib 3 | import json 4 | import logging 5 | import os 6 | import pathlib 7 | from collections import defaultdict 8 | from typing import Any, List, Optional, Tuple, Union 9 | from urllib.parse import urlparse 10 | 11 | 
import gcsfs 12 | import s3fs 13 | from olmo_core.aliases import PathOrStr 14 | from olmo_core.data.types import NumpyDatasetDType 15 | from olmo_core.io import get_file_size, is_url, normalize_path 16 | from olmo_core.utils import OLMoEnvironmentError 17 | from tqdm import tqdm 18 | 19 | from cookbook.aliases import SourceConfig 20 | 21 | logger = logging.getLogger(__name__) 22 | logging.getLogger("botocore").setLevel(logging.WARNING) 23 | 24 | 25 | def _bytes_to_tokens(num_bytes: int, dtype: NumpyDatasetDType) -> int: 26 | """ 27 | Convert bytes to tokens based on the dtype. 28 | """ 29 | npdtype = dtype.as_np_dtype() 30 | return num_bytes // npdtype(int(0)).itemsize 31 | 32 | 33 | def _count_tokens_for_file(path: PathOrStr, dtype: NumpyDatasetDType) -> int: 34 | return _bytes_to_tokens(get_file_size(path), dtype) 35 | 36 | 37 | def get_token_counts_and_ratios( 38 | source_configs: list[SourceConfig], dtype: NumpyDatasetDType, use_cache: bool 39 | ) -> Tuple[dict[str, float], int]: 40 | config_hash = hashlib.md5( 41 | json.dumps( 42 | [(sc.name, sc.paths) for sc in source_configs], 43 | sort_keys=True, 44 | ).encode("utf-8") 45 | ).hexdigest() 46 | 47 | cache_path = pathlib.Path(f"/tmp/olmo-cookbook/priors_cache_{config_hash}.json") 48 | if use_cache: 49 | try: 50 | with open(cache_path, "r") as f: 51 | logger.info( 52 | "Source distribution cache found, using cached values! This can be disabled by setting use_cache=False." 53 | ) 54 | obj = json.load(f) 55 | return (obj["relative_sizes"], obj["total_tokens"]) 56 | except FileNotFoundError: 57 | logger.info("No cache file found, calculating from source files...") 58 | 59 | token_counts = defaultdict(int) 60 | 61 | filesystems = {} 62 | 63 | # Pre-check each source for mixed schemes and create appropriate filesystem clients 64 | for source in source_configs: 65 | schemes = {urlparse(path).scheme for path in source.paths} 66 | 67 | # Check for mixed schemes within a source 68 | if len(schemes) > 1 and any(scheme for scheme in schemes): 69 | raise OLMoEnvironmentError( 70 | f"Mixed URL schemes in source '{source.name}': {schemes}. Each source must use a consistent scheme." 
71 | ) 72 | 73 | # Get the scheme (or None for local paths) 74 | scheme = next(iter(schemes)) if schemes and next(iter(schemes)) else "local" 75 | 76 | if scheme not in filesystems: 77 | filesystems[scheme] = get_filesystem_for_scheme(scheme) 78 | 79 | with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor: 80 | for source in source_configs: 81 | # Get the appropriate filesystem for this source 82 | scheme = next(iter({urlparse(path).scheme for path in source.paths}), "local") 83 | fs = filesystems.get(scheme) 84 | 85 | globs = [path for path in source.paths if "*" in path] 86 | paths = [path for path in source.paths if path not in globs] 87 | source.paths = (paths + expand_globs(fs, globs)) if globs else paths 88 | 89 | futures = { 90 | executor.submit(_count_tokens_for_file, path, dtype): source 91 | for source in source_configs 92 | for path in source.paths 93 | } 94 | 95 | for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 96 | source_future = futures[future] 97 | try: 98 | result = future.result() 99 | token_counts[source_future.name] += result 100 | except Exception as e: 101 | logger.warning(f"Error processing {source_future.name}: {str(e)}") 102 | token_counts[source_future.name] = 0 103 | 104 | # Calculate relative sizes 105 | total_tokens = sum(token_counts.values()) 106 | 107 | if total_tokens == 0: 108 | raise Exception("Error processing config: no tokens found!") 109 | 110 | relative_sizes = {path: count / total_tokens for path, count in token_counts.items()} 111 | 112 | if use_cache: 113 | os.makedirs(os.path.dirname(cache_path), exist_ok=True) 114 | with open(cache_path, "w") as f: 115 | json.dump({"relative_sizes": relative_sizes, "total_tokens": total_tokens}, f) 116 | 117 | return (relative_sizes, total_tokens) 118 | 119 | 120 | def expand_globs( 121 | fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = None, sources: Optional[List[str]] = None 122 | ) -> List[str]: 123 | results = [] 124 | 125 | for source in sources or []: 126 | if is_url(source): 127 | results.extend(_expand_remote(source, fs)) 128 | else: 129 | results.extend(_expand_local(source)) 130 | 131 | # Filter the globs from the expanded list 132 | return [r for r in results if "*" not in r] 133 | 134 | 135 | def _expand_local(pattern: str) -> List[str]: 136 | """ 137 | Expand a local glob pattern. 138 | """ 139 | from glob import glob 140 | 141 | logger.info(f"Expanding '{pattern}'...") 142 | matches = sorted(glob(pattern, recursive=True)) 143 | 144 | if not matches: 145 | raise FileNotFoundError(pattern) 146 | 147 | return [normalize_path(match) for match in matches] 148 | 149 | 150 | def _expand_remote(pattern: str, fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]]) -> List[str]: 151 | """ 152 | Expand a remote glob pattern.
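For example, a hypothetical pattern like "s3://my-bucket/data/*.npy" is expanded with fs.glob and each match is returned with its original scheme re-applied.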
153 | """ 154 | if not fs: 155 | fs = s3fs.S3FileSystem() 156 | 157 | parsed = urlparse(pattern) 158 | logger.info(f"Expanding remote glob '{pattern}'...") 159 | 160 | if parsed.scheme == "s3": 161 | return [f"s3://{obj}" for obj in fs.glob(pattern)] 162 | elif parsed.scheme == "weka": 163 | return [f"weka://{obj}" for obj in fs.glob(pattern.replace("weka://", "s3://"))] 164 | elif parsed.scheme == "gs": 165 | return [f"gs://{obj}" for obj in fs.glob(pattern)] 166 | elif parsed.scheme == "r2": 167 | raise NotImplementedError("'r2' types are not currently supported") 168 | elif parsed.scheme in ("http", "https"): 169 | raise NotImplementedError("'http' types are not currently supported") 170 | elif parsed.scheme == "file": 171 | raise NotImplementedError("Remote 'file' types are not currently supported") 172 | else: 173 | raise NotImplementedError(f"Glob expansion is not currently supported for '{parsed.scheme}' files") 174 | 175 | 176 | def normalize_source_paths(sources: List[SourceConfig], expand: bool = False) -> List[SourceConfig]: 177 | """ 178 | Normalize the paths in a SourceConfig object. 179 | """ 180 | normalized = [] 181 | 182 | for source in sources: 183 | source_paths = [] 184 | schemes = set() 185 | 186 | for path in source.paths: 187 | if is_url(path): 188 | parsed = urlparse(path) 189 | schemes.add(parsed.scheme) 190 | if parsed.scheme == "s3": 191 | source_paths.append(path) 192 | elif parsed.scheme == "weka": 193 | source_paths.append(normalize_path(path.replace("weka://", "/weka/"))) 194 | elif parsed.scheme == "gs": 195 | source_paths.append(path) 196 | elif parsed.scheme == "r2": 197 | raise NotImplementedError("'r2' types are not currently supported") 198 | elif parsed.scheme in ("http", "https"): 199 | raise NotImplementedError("'http' types are not currently supported") 200 | else: 201 | raise OLMoEnvironmentError(f"Unsupported URL scheme: {parsed.scheme}") 202 | else: 203 | source_paths.append(normalize_path(path)) 204 | schemes.add("local") 205 | 206 | # Get filesystem if we're expanding globs and paths exist 207 | fs = None 208 | if expand and source_paths: 209 | scheme = next(iter(schemes)) if schemes else "local" 210 | fs = get_filesystem_for_scheme(scheme) 211 | 212 | normalized.append( 213 | SourceConfig( 214 | name=source.name, 215 | paths=expand_globs(fs=fs, sources=source_paths) if expand else source_paths, 216 | target_ratio=source.target_ratio, 217 | repetition_factor=source.repetition_factor, 218 | max_source_ratio=source.max_source_ratio, 219 | ) 220 | ) 221 | 222 | return normalized 223 | 224 | 225 | def get_filesystem_for_scheme(scheme: str): 226 | """ 227 | Get the appropriate filesystem for a given URL scheme. 
228 | 229 | Args: 230 | scheme: The URL scheme (e.g., 's3', 'gs', 'local', 'weka') 231 | 232 | Returns: 233 | The appropriate filesystem object for the scheme or None for local paths 234 | 235 | Raises: 236 | OLMoEnvironmentError: If the scheme is not supported or not configured correctly 237 | NotImplementedError: If the scheme is recognized but not currently supported 238 | """ 239 | if scheme in ("s3", "weka"): 240 | client_kwargs = {} 241 | profile_name = os.environ.get("AWS_PROFILE", None) 242 | 243 | if scheme == "weka": 244 | profile_name = "WEKA" 245 | client_kwargs["endpoint_url"] = os.environ.get("WEKA_ENDPOINT_URL") 246 | 247 | return s3fs.S3FileSystem(client_kwargs={**client_kwargs}, profile=profile_name) 248 | 249 | elif scheme == "gs": 250 | try: 251 | gs_project = os.environ.get("GOOGLE_CLOUD_PROJECT", None) 252 | 253 | if not gs_project: 254 | raise OLMoEnvironmentError("GOOGLE_CLOUD_PROJECT environment variable is not set!") 255 | 256 | try: 257 | return gcsfs.GCSFileSystem(token="google_default") 258 | except Exception as e: 259 | logger.warning( 260 | f"Failed to create GCS filesystem with default credentials: {str(e)}. Retrying with metadata server..." 261 | ) 262 | return gcsfs.GCSFileSystem() 263 | 264 | except Exception as e: 265 | raise OLMoEnvironmentError( 266 | f"Failed to create GCS filesystem: {str(e)}. Ensure GOOGLE_APPLICATION_CREDENTIALS_JSON and GOOGLE_CLOUD_PROJECT are set correctly." 267 | ) 268 | 269 | elif scheme in ("r2", "http", "https"): 270 | raise NotImplementedError(f"'{scheme}' scheme is not currently supported") 271 | 272 | elif scheme == "local": 273 | return None # No remote filesystem needed for local paths 274 | 275 | else: 276 | raise OLMoEnvironmentError(f"Unsupported URL scheme: {scheme}") 277 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/eval/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/remote/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmo-cookbook/0311f0a7d9c1ba4b233738d16682afe4139692a0/tests/cookbook/remote/__init__.py -------------------------------------------------------------------------------- /tests/cookbook/remote/test_remote.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from unittest import TestCase 4 | 5 | from cookbook.remote.base import LocatedPath 6 | 7 | 8 | class TestLocatedPath(TestCase): 9 | def test_located_path_gcs(self): 10 | self.assertEqual(LocatedPath.from_str("gs://bucket/prefix"), LocatedPath(prot="gs", 
path="bucket/prefix")) 11 | self.assertEqual(LocatedPath.from_str("gcs://bucket/prefix"), LocatedPath(prot="gs", path="bucket/prefix")) 12 | self.assertEqual( 13 | LocatedPath.from_str("gs://bucket/more/prefix/"), LocatedPath(prot="gs", path="bucket/more/prefix/") 14 | ) 15 | 16 | def test_located_path_s3(self): 17 | self.assertEqual(LocatedPath.from_str("s3://bucket/prefix"), LocatedPath(prot="s3", path="bucket/prefix")) 18 | self.assertEqual( 19 | LocatedPath.from_str("s3n://bucket/prefix/"), LocatedPath(prot="s3", path="bucket/prefix/") 20 | ) 21 | 22 | def test_located_path_weka(self): 23 | self.assertEqual( 24 | LocatedPath.from_str("/weka/oe-training-default/prefix"), 25 | LocatedPath(prot="weka", path="/oe-training-default/prefix"), 26 | ) 27 | self.assertEqual( 28 | LocatedPath.from_str("/oe-training-default/prefix/"), 29 | LocatedPath(prot="weka", path="/oe-training-default/prefix/"), 30 | ) 31 | 32 | with self.assertRaises(ValueError): 33 | LocatedPath.from_str("weka://non-existent-bucket/prefix") 34 | 35 | def test_located_path_local(self): 36 | # Test absolute local paths 37 | self.assertEqual(LocatedPath.from_str("/home/user/data"), LocatedPath(prot="file", path="/home/user/data")) 38 | self.assertEqual(LocatedPath.from_str("/tmp/data/"), LocatedPath(prot="file", path="/tmp/data/")) 39 | 40 | # Test with Path objects 41 | self.assertEqual( 42 | LocatedPath.from_str(Path("/usr/local/bin")), LocatedPath(prot="file", path="/usr/local/bin") 43 | ) 44 | 45 | # Test single-level paths 46 | self.assertEqual(LocatedPath.from_str("/home"), LocatedPath(prot="file", path="/home")) 47 | 48 | def test_located_path_to_str(self): 49 | # Test conversion back to string 50 | path = LocatedPath(prot="file", path="home/user/data") 51 | self.assertEqual(path.local, Path("home/user/data")) 52 | 53 | path_with_trailing_slash = LocatedPath(prot="file", path="tmp/data/") 54 | self.assertEqual(path_with_trailing_slash.local, Path("tmp/data/")) 55 | 56 | def test_located_path_invalid(self): 57 | with self.assertRaises(ValueError): 58 | LocatedPath.from_str("azure://bucket/prefix") 59 | 60 | def test_local_command(self): 61 | with self.assertRaises(ValueError): 62 | LocatedPath.from_str("s3://bucket/prefix").local 63 | 64 | with self.assertRaises(ValueError): 65 | LocatedPath.from_str("gs://bucket/prefix").local 66 | 67 | self.assertEqual(LocatedPath.from_str("file://home/user/data").local, Path("/home/user/data")) 68 | self.assertEqual(LocatedPath.from_str("/home/user/data").local, Path("/home/user/data")) 69 | self.assertEqual( 70 | LocatedPath.from_str("weka://oe-data-default/prefix").local, Path("/oe-data-default/prefix") 71 | ) 72 | self.assertEqual(LocatedPath.from_str("/oe-data-default/prefix").local, Path("/oe-data-default/prefix")) 73 | self.assertEqual( 74 | LocatedPath.from_str("/weka/oe-training-default/prefix").local, Path("/oe-training-default/prefix") 75 | ) 76 | 77 | def test_remote_command(self): 78 | with self.assertRaises(ValueError): 79 | LocatedPath.from_str("file://home/user/data").remote 80 | 81 | with self.assertRaises(ValueError): 82 | LocatedPath.from_str("/home/user/data").remote 83 | 84 | self.assertEqual(LocatedPath.from_str("s3://bucket/prefix").remote, "s3://bucket/prefix") 85 | self.assertEqual(LocatedPath.from_str("gs://bucket/prefix").remote, "gs://bucket/prefix") 86 | self.assertEqual( 87 | LocatedPath.from_str("/oe-training-default/prefix").remote, "weka://oe-training-default/prefix" 88 | ) 89 | self.assertEqual( 90 | 
LocatedPath.from_str("/weka/oe-training-default/prefix").remote, "weka://oe-training-default/prefix" 91 | ) 92 | --------------------------------------------------------------------------------