├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── pull_request_template.md └── workflows │ ├── pre-commit.yaml │ └── run-tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── conf ├── archive │ ├── old-benchmarking │ │ ├── gpt2-benchmark-config.yaml │ │ ├── gpt2-intensive-config.yaml │ │ └── gpt2-toy-config.yaml │ ├── partial-checkpointing │ │ └── gpt2-mistral-medium-gcheck-config.yaml │ └── v1 │ │ ├── gpt2-debug-config.yaml │ │ ├── gpt2-mistral-medium-config.yaml │ │ ├── gpt2-mistral-medium-gcp-config.yaml │ │ ├── gpt2-mistral-mini-config.yaml │ │ ├── gpt2-mistral-small-gcp-config.yaml │ │ ├── gpt2-scaling-config.yaml │ │ └── tutorial-gpt2-micro.yaml ├── datasets │ ├── openwebtext.yaml │ ├── shakespeare.yaml │ ├── wikitext103.yaml │ └── wikitext2.yaml ├── deepspeed │ ├── debug-conf.json │ ├── hostfile │ ├── z1-conf.json │ ├── z1-offload-conf.json │ ├── z2-debug-conf.json │ ├── z2-medium-conf.json │ ├── z2-offload-conf.json │ ├── z2-small-conf.json │ ├── z3-conf.json │ └── z3-offload-conf.json ├── mistral-medium.yaml ├── mistral-micro.yaml ├── mistral-small.yaml ├── models │ ├── mistral-medium.yaml │ ├── mistral-micro.json │ ├── mistral-micro.yaml │ └── mistral-small.yaml ├── train_schema.py ├── trainers │ ├── benchmark.yaml │ ├── gpt2-medium.yaml │ ├── gpt2-small-short.yaml │ ├── gpt2-small.yaml │ └── intensive.yaml └── tutorial-shakespeare-gpt2-micro.yaml ├── docs ├── LICENSE ├── Makefile ├── README.md ├── _static │ ├── pydata-custom.css │ └── readthedocs-custom.css ├── _templates │ ├── custom-class-template.rst │ ├── custom-module-template.rst │ └── layout.html ├── api.rst ├── conf.py ├── contributing.rst ├── fork.png ├── getting_started.rst ├── getting_started │ ├── config.rst │ ├── download.rst │ ├── evaluate.rst │ ├── install.rst │ ├── train-output.txt │ ├── train.rst │ └── wandb_example.png ├── hugging_face_differences.rst ├── index.rst ├── mistral_components.png ├── scripts │ └── build_download_tables.py └── tutorials │ ├── cluster_basics.png │ ├── deepspeed.rst │ ├── gcp_plus_kubernetes.rst │ ├── generate.rst │ ├── gke_standard.png │ ├── kubernetes_menu.png │ ├── multi-gpu.rst │ ├── node_pool.png │ ├── node_pool_gpu.png │ ├── resume.rst │ └── tutorial_cluster.png ├── environments ├── Dockerfile ├── environment-cpu.yaml ├── environment-gpu.yaml ├── environment-m1.yaml └── export.py ├── gcp ├── Dockerfile ├── job-gpt2-micro.yaml ├── pod-gpu.yaml ├── pod.yaml └── run-demo-job.sh ├── generate_text.ipynb ├── mistral_models.json ├── mypy.ini ├── pyproject.toml ├── scripts ├── README.md ├── benchmarking │ ├── dial-in │ │ ├── mistral-gpt2-medium.sh │ │ └── mistral-gpt2-small.sh │ ├── intensive-benchmarking │ │ ├── ddp-multi.sh │ │ ├── deepspeed-multi.sh │ │ └── fairscale-multi.sh │ └── standard-benchmarking │ │ ├── README.md │ │ ├── ddp-multi.sh │ │ ├── ddp-single.sh │ │ ├── deepspeed-multi.sh │ │ ├── deepspeed-single.sh │ │ ├── ds-evaluation-bsz.sh │ │ ├── fairscale-multi.sh │ │ ├── fairscale-single.sh │ │ └── vanilla.sh ├── debugging │ ├── resuming │ │ └── resume-single-node.sh │ └── sanity │ │ └── mistral-sanity-gpt2-small.sh ├── forget-me-not.sh ├── mistral-gcp-gpt2-medium.sh ├── mistral-gcp-gpt2-small.sh ├── mistral-gpt2-medium.sh ├── mistral-gpt2-small.sh └── run │ ├── ddp.sh │ ├── deepspeed.sh │ ├── fairscale.sh │ ├── multi-node.sh │ └── single-node.sh ├── setup ├── conda-requirements.txt ├── pip-requirements.txt ├── setup.sh └── test-requirements.txt ├── src 
├── __init__.py ├── args │ ├── __init__.py │ └── training_args.py ├── core │ ├── __init__.py │ ├── callbacks.py │ └── trainer.py ├── corpora │ ├── __init__.py │ ├── auto.py │ ├── detokenization.py │ ├── indexer.py │ └── tokenization_utils.py ├── models │ ├── __init__.py │ └── auto_clm.py ├── overwatch │ ├── __init__.py │ └── overwatch.py └── util │ ├── __init__.py │ ├── paths.py │ └── registry.py ├── tests ├── README.md ├── __init__.py ├── conf │ ├── datasets │ │ ├── wikitext103.yaml │ │ ├── wikitext2-detokenized.yaml │ │ └── wikitext2.yaml │ ├── deepspeed │ │ ├── z1-conf.json │ │ └── z2-small-conf.json │ ├── models │ │ ├── gpt2-micro.json │ │ ├── gpt2-micro.yaml │ │ └── gpt2-small.yaml │ ├── train-diff.yaml │ ├── train.yaml │ └── trainers │ │ ├── gpt2-small-diff.yaml │ │ └── gpt2-small.yaml ├── run_deepspeed_tests.py ├── setup │ └── pip-requirements.txt ├── test_args.py ├── test_checkpoint.py ├── test_eval_loss_is_defined.py ├── test_fp.py ├── test_indexed_dataset.py ├── test_online_benchmark_trainer.py ├── test_seed.py └── test_valid_configs.py ├── train.py └── tutorials ├── custom-dataset ├── README.md └── shakespeare │ ├── shakespeare.train.jsonl │ └── shakespeare.validation.jsonl └── gcp-on-demand └── README.md /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 119 4 | ignore = E203, E501, W503, W605 5 | per-file-ignores = 6 | */__init__.py: F401 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the bug. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here (e.g. launching with DeepSpeed?, OS, library versions, hardware, etc...) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** Before you start, make sure to check out: ** 11 | * Documentation on our [Read The Docs](https://nlp.stanford.edu/mistral/) site. 12 | * [GitHub Issues](https://github.com/stanford-mercury/mistral/issues) 13 | 14 | These sources may already contain the answer to your question! 15 | 16 | If you still can't find an answer, erase this template and add your question. Please try to provide as much detail as possible so we can quickly and accurately respond! 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | A brief and concise description of what your pull request is trying to accomplish. 3 | 4 | ## Fixes Issues 5 | A list of issues/bugs with # references. (e.g., #123) 6 | 7 | ## Unit test coverage 8 | Are there unit tests in place to make sure your code is functioning correctly? 9 | 10 | ## Known breaking changes/behaviors 11 | Does this break anything in Mistral's existing user interface? If so, what is it and how is it addressed? 12 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - uses: actions/setup-python@v3 14 | - uses: pre-commit/action@v3.0.0 -------------------------------------------------------------------------------- /.github/workflows/run-tests.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Run Tests 3 | on: [push] 4 | jobs: 5 | Run-Mistral-Tests: 6 | runs-on: self-hosted 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 10 | - name: Check out repository code 11 | uses: actions/checkout@v2 12 | - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." 13 | - run: echo "🖥️ The workflow is now ready to test your code on the runner." 14 | - name: Setting up Conda Environment 15 | run: | 16 | echo "Setting up conda env for this test!" 
17 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 18 | bash setup/setup.sh mistral-${{github.sha}} 19 | conda activate mistral-${{github.sha}} 20 | - name: Installing test dependencies 21 | run: | 22 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 23 | conda activate mistral-${{github.sha}} 24 | pip install -r setup/test-requirements.txt 25 | - name: Setting up environment variables 26 | run: | 27 | echo 'Deactivating wandb' 28 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 29 | conda activate mistral-${{github.sha}} 30 | cd tests ; wandb disabled 31 | echo 'MISTRAL_TEST_DIR:' 32 | echo $MISTRAL_TEST_DIR 33 | - name: Run tests (single node/single GPU) 34 | run: | 35 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 36 | conda activate mistral-${{github.sha}} 37 | cd tests 38 | echo 'Clearing artifacts' 39 | rm -rf $MISTRAL_TEST_DIR/artifacts 40 | CUDA_VISIBLE_DEVICES=0 pytest --durations=0 41 | - name: Run tests (Deepspeed 2xGPUs) 42 | run: | 43 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 44 | conda activate mistral-${{github.sha}} 45 | export PYTHONPATH=${GITHUB_WORKSPACE} 46 | cd tests 47 | echo 'Clearing artifacts' 48 | rm -rf $MISTRAL_TEST_DIR/artifacts 49 | python run_deepspeed_tests.py 50 | - name: Delete conda environment 51 | if: always() 52 | run: | 53 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 54 | conda deactivate 55 | conda env remove -n mistral-${{github.sha}} 56 | - run: echo "All tests finished!" 57 | - run: echo "🍏 This job's status is ${{ job.status }}." 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # JetBrains 132 | .idea/ 133 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: ".git" # TODO - add tox/nox files if we ever get around to implementing testing 4 | default_stages: 5 | - commit 6 | fail_fast: true 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.0.1 11 | hooks: 12 | - id: trailing-whitespace 13 | - id: end-of-file-fixer 14 | - id: check-yaml 15 | - id: check-toml 16 | - id: check-merge-conflict 17 | - id: check-added-large-files 18 | 19 | - repo: https://github.com/psf/black 20 | rev: 22.3.0 21 | hooks: 22 | - id: black 23 | 24 | - repo: https://github.com/timothycrosley/isort 25 | rev: 5.9.3 26 | hooks: 27 | - id: isort 28 | 29 | - repo: https://github.com/PyCQA/flake8 30 | rev: 3.9.2 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: [flake8-isort] 34 | 35 | - repo: https://github.com/pre-commit/mirrors-mypy 36 | rev: 'v0.960' 37 | hooks: 38 | - id: mypy 39 | args: [--ignore-missing-imports] 40 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Mistral 2 | 3 | Please see the full contribution guidelines on our [Read The Docs](https://nlp.stanford.edu/mistral/contributing.html) page. 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | .PHONY: help serialize-env check autoformat prune 3 | .DEFAULT: help 4 | 5 | # Create Valid Architectures 6 | ARCHITECTURES := cpu gpu 7 | 8 | # Generates a useful overview/help message for various make features - add to this as necessary! 9 | help: 10 | @echo "make serialize-env arch=" 11 | @echo " After (un)installing dependencies, dump environment.yaml for arch :: < cpu | gpu >." 12 | @echo "make prune" 13 | @echo " Pull all branches from git, and prune all local branches that are merged in origin." 14 | @echo "make check" 15 | @echo " Run code style and linting (black, flake, isort) *without* changing files!" 16 | @echo "make autoformat" 17 | @echo " Run code styling (black, isort) and update in place - committing with pre-commit also does this." 
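# Example invocations of the targets documented above (illustrative; assumes the repo's conda
# environment is already active):
#   make check                    # lint with isort/black/flake8, no files modified
#   make autoformat               # apply isort + black in place
#   make serialize-env arch=gpu   # or arch=cpu -- regenerate the environment YAML via environments/export.py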
18 | 19 | serialize-env: 20 | ifneq ($(filter $(arch),$(ARCHITECTURES)),) 21 | python environments/export.py -a $(arch) 22 | else 23 | @echo "Argument 'arch' is not set - try calling 'make serialize-env arch=' with ID = < cpu | gpu >." 24 | endif 25 | 26 | check: 27 | isort --check . 28 | black --check . 29 | flake8 . 30 | 31 | autoformat: 32 | isort --atomic . 33 | black . 34 | 35 | prune: 36 | @bash -c "git fetch -p"; 37 | @bash -c "for branch in $(git branch -vv | grep ': gone]' | awk '{print $1}'); do git branch -d $branch; done"; 38 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-benchmark-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-benchmark-config.yaml 2 | # Benchmarking GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and 3 | # full batch size (512). Support for Single-Node, Multi-Node, Mixed Precision, DDP, FairScale, and DeepSpeed. 4 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 5 | --- 6 | # Inherit Dataset, Tokenization, Model, and Training Details 7 | inherit: 8 | - datasets/openwebtext.yaml 9 | - models/gpt2-small.yaml 10 | - trainers/benchmark.yaml 11 | 12 | # Run ID -- defaults to `null`; override as you like! 13 | run_id: null 14 | 15 | # Weights & Biases 16 | wandb: mistral-benchmarking 17 | group: null 18 | 19 | # Artifacts & Caching 20 | artifacts: 21 | cache_dir: /scr-ssd/mercury/mistral/artifacts 22 | run_dir: /scr-ssd/mercury/mistral/runs 23 | 24 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 25 | effective_bsz: 512 26 | 27 | # Resume from Checkpoint 28 | resume: false 29 | resume_checkpoint: null 30 | 31 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 32 | local_rank: -1 33 | nnodes: -1 34 | nproc_per_node: -1 35 | 36 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 37 | num_gpus: -1 38 | num_nodes: -1 39 | world_size: -1 40 | 41 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 42 | log_level: 20 43 | 44 | # Random Seed 45 | seed: 21 46 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-intensive-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-intensive-config.yaml 2 | # Intensive Benchmarking GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and 3 | # full batch size (512). Support for Multi-Node Mixed Precision runs, for final round of benchmarking of DDP, 4 | # FairScale, and DeepSpeed. 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/openwebtext.yaml 11 | - models/gpt2-small.yaml 12 | - trainers/intensive.yaml 13 | 14 | # Run ID -- defaults to `null`; override as you like! 
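# As the header notes, values like the run ID can also be overridden at launch time rather than
# edited here; a hypothetical invocation (the exact flag spelling depends on train.py's CLI):
#   python train.py --config conf/archive/old-benchmarking/gpt2-intensive-config.yaml --run_id my-intensive-run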
15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-benchmarking 19 | group: intensive 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /scr-ssd/mercury/mistral/artifacts 24 | run_dir: /scr-ssd/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 34 | local_rank: -1 35 | nnodes: -1 36 | nproc_per_node: -1 37 | 38 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 39 | num_gpus: -1 40 | num_nodes: -1 41 | world_size: -1 42 | 43 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 44 | log_level: 20 45 | 46 | # Random Seed 47 | seed: 21 48 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-toy-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-toy-config.yaml 2 | # Toy GPT-2 Config, currently working with the WikiText-103 Dataset, GPT-2 Small Architecture, and Single-Node 3 | # Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files. 4 | --- 5 | # Inherit Dataset, Tokenization, Model, and Training Details 6 | inherit: 7 | - datasets/wikitext103.yaml 8 | - models/gpt2-small.yaml 9 | - trainers/toy.yaml 10 | 11 | # Run ID -- defaults to `null`; override as you like! 12 | run_id: null 13 | 14 | # Weights & Biases (Set os.environ["WANDB_PROJECT"]) 15 | wandb: mistral-debugging 16 | group: null 17 | 18 | # Artifacts & Caching 19 | artifacts: 20 | cache_dir: /u/scr/nlp/mercury/mistral/artifacts 21 | run_dir: /u/scr/nlp/mercury/mistral/runs 22 | 23 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 24 | bsz: 8 25 | 26 | # Resume from Checkpoint 27 | resume: false 28 | resume_checkpoint: null 29 | 30 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 31 | local_rank: -1 32 | nnodes: 1 33 | nproc_per_node: 8 34 | 35 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 36 | num_gpus: 8 37 | num_nodes: 1 38 | world_size: 8 39 | 40 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 41 | log_level: 20 42 | 43 | # Random Seed 44 | seed: 21 45 | -------------------------------------------------------------------------------- /conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-gcheck-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the WikiText Dataset, GPT-2 Medium Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4/8. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - ../../datasets/wikitext103.yaml 10 | - ../../models/gpt2-medium.yaml 11 | - ../../trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 
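# A note on the Weights & Biases keys a few lines below: the `wandb` value appears to act as the
# project name, so logging can also be steered (or silenced) through the wandb tooling itself --
# e.g. setting the WANDB_PROJECT environment variable, or running `wandb disabled` as the CI
# workflow above does. A minimal sketch mirroring the `wandb:` key:
#   import os
#   os.environ["WANDB_PROJECT"] = "mistral-debugging"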
14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-debugging 18 | group: gpt2-medium 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 33 | local_rank: -1 34 | nnodes: -1 35 | nproc_per_node: -1 36 | 37 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 38 | num_gpus: -1 39 | num_nodes: -1 40 | world_size: -1 41 | 42 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 43 | log_level: 20 44 | 45 | # Random Seed 46 | seed: 21 47 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-debug-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-sphinx-debug-config.yaml 2 | # Debugging GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and Single-Node 3 | # Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files. 4 | --- 5 | # Inherit Dataset, Tokenization, Model, and Training Details 6 | inherit: 7 | - datasets/openwebtext.yaml 8 | - models/gpt2-small.yaml 9 | - trainers/debug.yaml 10 | 11 | # Run ID -- defaults to `null`; override as you like! 12 | run_id: null 13 | 14 | # Weights & Biases 15 | wandb: mistral-sanity 16 | group: null 17 | 18 | # Artifacts & Caching 19 | artifacts: 20 | cache_dir: /scr-ssd/mercury/mistral/artifacts 21 | run_dir: /scr-ssd/mercury/mistral/runs 22 | 23 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 24 | effective_bsz: 512 25 | 26 | # Resume from Checkpoint 27 | resume: false 28 | resume_checkpoint: null 29 | 30 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 31 | # - Frequency (`freq`) at which to save checkpoints (# steps) 32 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 33 | checkpoint_frequency: 34 | - [10, 100] 35 | - [25, 1000] 36 | - [50, 2000] 37 | - [100, 4000] 38 | 39 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 40 | local_rank: -1 41 | nnodes: -1 42 | nproc_per_node: -1 43 | 44 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 45 | num_gpus: -1 46 | num_nodes: -1 47 | world_size: -1 48 | 49 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 50 | log_level: 20 51 | 52 | # Random Seed 53 | seed: 21 54 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-medium-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Medium 3 | # Architecture, and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4. 
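# A quick, illustrative decomposition of those numbers, using the two-node, 8-GPU-per-node layout
# implied by conf/deepspeed/hostfile and the formula quoted in the trainer configs
# [steps = effective_batch / (per_gpu_batch * gpus * nodes)]:
#   per_device_bsz, gpus_per_node, nodes = 4, 8, 2
#   gradient_accumulation_steps = 512 // (per_device_bsz * gpus_per_node * nodes)   # == 8
#   assert gradient_accumulation_steps * per_device_bsz * gpus_per_node * nodes == 512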
4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-medium.yaml 11 | - trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-medium 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-medium-gcp-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-gcp-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Medium 3 | # Architecture, and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4. 4 | # Written for Google Cloud! 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/openwebtext.yaml 11 | - models/gpt2-medium.yaml 12 | - trainers/gpt2-medium.yaml 13 | 14 | # Run ID -- make sure to override! 15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-gpt2 19 | group: gpt2-medium 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /home/data/mercury/mistral/artifacts 24 | run_dir: /home/data/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-mini-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-small-config.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-mini.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-small-gcp-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-small-gcp-config.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). 
Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. Written for Google Cloud! 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-small.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /home/data/mercury/mistral/artifacts 23 | run_dir: /home/data/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-scaling-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-scaling-config.yaml 2 | # GPT-2 Scaling Config for benchmarking memory footprint and training time for various GPT-2 Architectures, working 3 | # with the WikiText-103 Dataset (assuming data loading doesn't affect GPU Memory), sequence length of 1024, 4 | # and full batch size (512). 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/wikitext103.yaml 11 | - models/gpt2-small.yaml 12 | - trainers/gpt2-small.yaml 13 | 14 | # Run ID -- make sure to override! 15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-scaling 19 | group: null 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /scr-ssd/mercury/mistral/artifacts 24 | run_dir: /scr-ssd/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | -------------------------------------------------------------------------------- /conf/archive/v1/tutorial-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # tutorial-gpt2-micro.yaml 2 | # Demo GPT-2 Micro Training Config, currently working with the WikiText103 Dataset, GPT-2 Micro Architecture, 3 | # and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/gpt2-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: 23 | run_dir: 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
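# A minimal single-GPU launch of this tutorial config might look like the following (hypothetical --
# the flag names assume train.py's CLI and should be checked against the Read The Docs tutorial):
#   CUDA_VISIBLE_DEVICES=0 python train.py \
#       --config conf/archive/v1/tutorial-gpt2-micro.yaml \
#       --nnodes 1 --nproc_per_node 1 \
#       --run_id tutorial-gpt2-micro-demo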
26 | effective_bsz: 32 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 10] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | # Set Eval Stride 59 | online_eval: 60 | stride: 256 61 | -------------------------------------------------------------------------------- /conf/datasets/openwebtext.yaml: -------------------------------------------------------------------------------- 1 | # openwebtext.yaml 2 | # Configuration for OpenWebText Dataset (https://huggingface.co/datasets/openwebtext) 3 | --- 4 | dataset: 5 | id: openwebtext 6 | name: null 7 | validation_ratio: 0.0005 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 64 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/datasets/shakespeare.yaml: -------------------------------------------------------------------------------- 1 | # shakespeare.yaml 2 | # Configuration for Shakespeare dataset at tutorials/custom-dataset/shakespeare. 3 | --- 4 | dataset: 5 | id: shakespeare 6 | name: shakespeare 7 | dataset_dir: tutorials/custom-dataset/shakespeare 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/datasets/wikitext103.yaml: -------------------------------------------------------------------------------- 1 | # wikitext103.yaml 2 | # Configuration for WikiText-103 Dataset (https://huggingface.co/datasets/wikitext). 3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-103-raw-v1 7 | 8 | # Number of Preprocessing Workers 9 | num_proc: 4 10 | 11 | # Number of Evaluation Preprocessing Workers 12 | eval_num_proc: 4 13 | -------------------------------------------------------------------------------- /conf/datasets/wikitext2.yaml: -------------------------------------------------------------------------------- 1 | # wikitext2.yaml 2 | # Configuration for WikiText-2 Dataset (https://huggingface.co/datasets/wikitext). 
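# The `id` / `name` pair below are the usual Hugging Face `datasets` coordinates; outside of
# Mistral's own preprocessing pipeline, the raw corpus can be pulled with a one-liner (sketch):
#   from datasets import load_dataset
#   raw = load_dataset("wikitext", "wikitext-2-raw-v1")   # train / validation / test splits of plain text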
3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-2-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/deepspeed/debug-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "hysteresis": 2, 7 | "min_loss_scale": 1 8 | }, 9 | 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 5e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 5e8, 17 | "contiguous_gradients": true, 18 | "cpu_offload": true 19 | }, 20 | 21 | "optimizer": { 22 | "type": "AdamW", 23 | "params": { 24 | "lr": 3e-5, 25 | "betas": [ 0.8, 0.999 ], 26 | "eps": 1e-8, 27 | "weight_decay": 3e-7 28 | } 29 | }, 30 | "zero_allow_untested_optimizer": true, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /conf/deepspeed/hostfile: -------------------------------------------------------------------------------- 1 | sphinx1.stanford.edu slots=8 2 | sphinx2.stanford.edu slots=8 3 | -------------------------------------------------------------------------------- /conf/deepspeed/z1-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | }, 34 | 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto" 37 | } 38 | -------------------------------------------------------------------------------- /conf/deepspeed/z1-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-debug-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 
7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 4000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 40 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 2, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-medium-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.00015, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.00015, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 2, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-small-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.0006, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z3-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | 
"weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z3-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true, 33 | "cpu_offload_params": true, 34 | "cpu_offload_pin_memory": true 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /conf/mistral-medium.yaml: -------------------------------------------------------------------------------- 1 | # mistral-medium.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/mistral-medium.yaml 11 | - trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/mistral-micro.yaml: -------------------------------------------------------------------------------- 1 | # mistral-micro.yaml 2 | # Demo GPT-2 Micro Training Config, currently working with the WikiText103 Dataset, GPT-2 Micro Architecture, 3 | # and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext2.yaml 10 | - models/mistral-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 32 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 10] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | # Set Eval Stride 59 | online_eval: 60 | stride: 256 61 | -------------------------------------------------------------------------------- /conf/mistral-small.yaml: -------------------------------------------------------------------------------- 1 | # mistral-small.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 
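# An illustrative multi-GPU launch of one of these top-level configs through the DeepSpeed launcher
# (the train.py flag spellings are assumptions; the ZeRO-2 JSONs referenced live under conf/deepspeed/):
#   deepspeed --num_gpus 8 train.py --config conf/mistral-small.yaml \
#       --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id mistral-small-run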
4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/mistral-small.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/models/mistral-medium.yaml: -------------------------------------------------------------------------------- 1 | # mistral-medium.yaml 2 | # Configuration for the GT-2 Medium Model. 
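# The stability knobs in these model configs (`reorder_and_upcast_attn`, `scale_attn_by_inverse_layer_idx`)
# correspond to standard options on the Hugging Face GPT-2 config; a rough standalone sketch
# (assumes a recent `transformers`, and is not a substitute for src/models/auto_clm.py):
#   from transformers import GPT2Config, GPT2LMHeadModel
#   cfg = GPT2Config(reorder_and_upcast_attn=True, scale_attn_by_inverse_layer_idx=True, use_cache=False)
#   model = GPT2LMHeadModel(cfg)   # freshly initialized weights, trained from scratch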
3 | --- 4 | model: 5 | id: "mistral-medium" 6 | 7 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 8 | pretrained_tokenizer: true 9 | 10 | # Sequence Length 11 | seq_len: 1024 12 | 13 | # Stability 14 | reorder_and_upcast_attn: true 15 | scale_attn_by_inverse_layer_idx: true 16 | 17 | # Initialize Weights from File 18 | initial_weights: null 19 | -------------------------------------------------------------------------------- /conf/models/mistral-micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "MistralGPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.1, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.1, 9 | "eos_token_id": 50256, 10 | "initializer_range": 0.02, 11 | "layer_norm_epsilon": 1e-05, 12 | "model_type": "gpt2", 13 | "n_ctx": 256, 14 | "n_embd": 768, 15 | "n_head": 2, 16 | "n_inner": null, 17 | "n_layer": 2, 18 | "n_positions": 256, 19 | "resid_pdrop": 0.1, 20 | "summary_activation": null, 21 | "summary_first_dropout": 0.1, 22 | "summary_proj_to_labels": true, 23 | "summary_type": "cls_index", 24 | "summary_use_proj": true, 25 | "task_specific_params": { 26 | "text-generation": { 27 | "do_sample": true, 28 | "max_length": 50 29 | } 30 | }, 31 | "transformers_version": "4.5.0", 32 | "use_cache": false, 33 | "vocab_size": 50257 34 | } 35 | -------------------------------------------------------------------------------- /conf/models/mistral-micro.yaml: -------------------------------------------------------------------------------- 1 | # mistral-micro.yaml 2 | # Configuration for the GPT-2 Micro Model. 3 | --- 4 | model: 5 | # this example relies on a user specified config file 6 | id: "gpt2-small" 7 | 8 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 9 | pretrained_tokenizer: true 10 | 11 | # Sequence Length 12 | seq_len: 256 13 | 14 | # Stability 15 | reorder_and_upcast_attn: true 16 | scale_attn_by_inverse_layer_idx: true 17 | 18 | # Initialize Weights from File 19 | initial_weights: null 20 | 21 | # Configure Model From File 22 | config_path: conf/models/mistral-micro.json 23 | -------------------------------------------------------------------------------- /conf/models/mistral-small.yaml: -------------------------------------------------------------------------------- 1 | # mistral-small.yaml 2 | # Configuration for the GPT-2 Small Model. 3 | --- 4 | model: 5 | id: "mistral-small" 6 | 7 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 8 | pretrained_tokenizer: true 9 | 10 | # Sequence Length 11 | seq_len: 1024 12 | 13 | # Stability 14 | reorder_and_upcast_attn: true 15 | scale_attn_by_inverse_layer_idx: true 16 | 17 | # Initialize Weights from File 18 | initial_weights: null 19 | -------------------------------------------------------------------------------- /conf/trainers/benchmark.yaml: -------------------------------------------------------------------------------- 1 | # benchmark.yaml 2 | # Trainer config for Benchmarking, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
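# Concretely, once the runtime-filled placeholders (output_dir, seed, local_rank) are resolved, the
# block below is meant to be usable directly as keyword arguments -- a toy illustration of that contract:
#   from transformers import TrainingArguments
#   args = TrainingArguments(output_dir="runs/benchmark", per_device_train_batch_size=2,
#                            learning_rate=5.0e-5, max_steps=50, warmup_steps=2, seed=21)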
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM available... 16 | per_device_train_batch_size: 2 17 | per_device_eval_batch_size: 2 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 5.0e-5 27 | weight_decay: 0.01 28 | adam_beta1: 0.9 29 | adam_beta2: 0.999 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 50 37 | 38 | # LR Scheduling Parameters 39 | lr_scheduler_type: linear 40 | warmup_steps: 2 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 2 47 | 48 | # Saving and Evaluation Steps (only at the end) 49 | eval_steps: 10 50 | save_steps: 10 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: false 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 67 | local_rank: null 68 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-medium.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-medium.yaml 2 | # Trainer config for Full GPT-2 Medium, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 
12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware (4 w/o gradient checkpointing, 8 w/ partial checkpointing) 16 | per_device_train_batch_size: 4 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 23 | gradient_checkpointing: false 24 | 25 | 26 | # For Online Evaluation, only keep around the Losses 27 | prediction_loss_only: true 28 | 29 | # Learning Rate & Optimization Parameters, assumes AdamW 30 | learning_rate: 0.00015 31 | weight_decay: 0.1 32 | adam_beta1: 0.9 33 | adam_beta2: 0.999 34 | adam_epsilon: 1.0e-8 35 | 36 | # Gradient Norm 37 | max_grad_norm: 1.0 38 | 39 | # Maximum Training Steps (Overrides epochs!) 40 | max_steps: 400000 41 | 42 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 43 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 44 | warmup_steps: 4000 45 | 46 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 47 | run_name: null 48 | logging_dir: null 49 | logging_first_step: true 50 | logging_steps: 50 51 | 52 | # Saving and Evaluation Steps 53 | eval_steps: 1000 54 | save_steps: 1000 55 | 56 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 57 | ignore_data_skip: false 58 | 59 | # Seeds -- Should be Overwritten at Runtime! 60 | seed: null 61 | 62 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 63 | fp16: true 64 | sharded_ddp: null 65 | deepspeed: null 66 | 67 | # Dataloader Parallelism 68 | dataloader_num_workers: 0 69 | 70 | # Should be overwritten from the Top-Level Config or CLI! 71 | local_rank: null 72 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-small-short.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-short.yaml 2 | # Trainer Debugging config for GPT-2 Small. The max_steps is set to 4000 to quickly iterate and debug. 3 | --- 4 | inherit: 5 | - gpt2-small.yaml 6 | 7 | training_arguments: 8 | # Set these based on GPU RAM/your available hardware 9 | per_device_train_batch_size: 16 10 | per_device_eval_batch_size: 16 11 | 12 | # Learning Rate & Optimization Parameters, assumes AdamW 13 | adam_beta2: 0.95 14 | 15 | # Maximum Training Steps (Overrides epochs!) 16 | max_steps: 4000 17 | 18 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 19 | warmup_steps: 40 20 | 21 | # Saving and Evaluation Steps 22 | eval_steps: 100 23 | save_steps: 1000 24 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware 16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 23 | gradient_checkpointing: false 24 | 25 | # For Online Evaluation, only keep around the Losses 26 | prediction_loss_only: true 27 | 28 | # Learning Rate & Optimization Parameters, assumes AdamW 29 | learning_rate: 0.0006 30 | weight_decay: 0.1 31 | adam_beta1: 0.9 32 | adam_beta2: 0.999 33 | adam_epsilon: 1.0e-8 34 | 35 | # Gradient Norm 36 | max_grad_norm: 1.0 37 | 38 | # Maximum Training Steps (Overrides epochs!) 39 | max_steps: 400000 40 | 41 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 42 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 43 | warmup_steps: 4000 44 | 45 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 46 | run_name: null 47 | logging_dir: null 48 | logging_first_step: true 49 | logging_steps: 50 50 | 51 | # Saving and Evaluation Steps 52 | eval_steps: 1000 53 | save_steps: 1000 54 | 55 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 56 | ignore_data_skip: false 57 | 58 | # Seeds -- Should be Overwritten at Runtime! 59 | seed: null 60 | 61 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 62 | fp16: true 63 | sharded_ddp: null 64 | deepspeed: null 65 | 66 | # Dataloader Parallelism 67 | dataloader_num_workers: 0 68 | 69 | # Should be overwritten from the Top-Level Config or CLI! 70 | local_rank: null 71 | -------------------------------------------------------------------------------- /conf/trainers/intensive.yaml: -------------------------------------------------------------------------------- 1 | # intensive.yaml 2 | # Trainer config for Intensive Benchmarking, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM available... 
16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 5.0e-5 27 | weight_decay: 0.01 28 | adam_beta1: 0.9 29 | adam_beta2: 0.999 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 1000 37 | 38 | # LR Scheduling Parameters 39 | lr_scheduler_type: linear 40 | warmup_steps: 100 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 50 47 | 48 | # Saving and Evaluation Steps (only at the end) 49 | eval_steps: 100 50 | save_steps: 100 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: false 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 67 | local_rank: null 68 | -------------------------------------------------------------------------------- /conf/tutorial-shakespeare-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # tutorial-shakespeare-gpt2-micro.yaml 2 | # Demo GPT-2 Micro Training config, currently working with the example Shakespeare dataset, 3 | # GPT-2 Micro Architecture, and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/shakespeare.yaml 10 | - models/mistral-micro.yaml 11 | - trainers/gpt2-small-short.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: shakespeare-gpt2-micro 15 | 16 | # Weights & Biases 17 | wandb_api_key_path: ~/wandb_api_key.txt 18 | wandb: mistral-demo 19 | group: shakespeare-gpt2-micro 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: ~/cache 24 | run_dir: ~/checkpoints 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
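# Illustration (hypothetical launch setup, not part of the original values): assuming 1 node with 2 GPUs
# and per_device_train_batch_size: 2, hitting this effective batch size would require
# gradient_accumulation_steps = effective_bsz / (per_device_bsz * gpus * nodes) = 32 / (2 * 2 * 1) = 8.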
27 | effective_bsz: 32 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [2, 10] 38 | - [10, 100] 39 | - [50, 2000] 40 | - [100, 20000] 41 | - [1000, 400000] 42 | 43 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 44 | local_rank: -1 45 | nnodes: -1 46 | nproc_per_node: -1 47 | 48 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 49 | num_gpus: -1 50 | num_nodes: -1 51 | world_size: -1 52 | 53 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 54 | log_level: 20 55 | 56 | # Random Seed 57 | seed: 21 58 | 59 | # Set Eval Stride 60 | online_eval: 61 | stride: 256 62 | -------------------------------------------------------------------------------- /docs/LICENSE: -------------------------------------------------------------------------------- 1 | The following pertains only to software in the docs directory used for building Sphinx documentation, originally from: https://github.com/JamesALeedham/Sphinx-Autosummary-Recursion 2 | 3 | Copyright 2021 The Board of Trustees of The Leland Stanford Junior University 4 | Copyright 2020 James Leedham 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | 3 | # You can set these variables from the command line, and also 4 | # from the environment for the first two. 5 | SPHINXOPTS ?= 6 | SPHINXBUILD ?= sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR ?= _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | 19 | # Note: Jupytext converts .py to .ipynb (Sphinx seems to execute Notebook..?) 
20 | %: Makefile 21 | rm -rf $(BUILDDIR) 22 | rm -rf _autosummary 23 | python scripts/build_download_tables.py >> getting_started/download.rst 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | git checkout getting_started/download.rst 26 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Install Sphinx 2 | 3 | If you don't already have Sphinx set up install it with `pip`. 4 | 5 | ```bash 6 | pip install sphinx 7 | pip install sphinx-rtd-theme 8 | ``` 9 | 10 | The documentation has been built with version 4.0.2. 11 | 12 | ```bash 13 | $ sphinx-build --version 14 | sphinx-build 4.0.2 15 | ``` 16 | 17 | # Build The Docs 18 | 19 | From `docs` directory run: 20 | 21 | ```bash 22 | $ BUILDDIR=/path/to/build make html 23 | ``` 24 | -------------------------------------------------------------------------------- /docs/_static/pydata-custom.css: -------------------------------------------------------------------------------- 1 | /*Tweaks to the Pydata default CSS */ 2 | 3 | /*No yellow background highlight when targeted by summary tables */ 4 | /*dt:target { background-color: #f8f8f8; border: 1px solid black, }*/ 5 | dt:target { background: transparent;} 6 | /*More space between H1s and signatures in API reference*/ 7 | h1 { margin-bottom: 40px; } 8 | 9 | /*No line underneath summary table headings (clashes with line above first member)*/ 10 | p.rubric { border-bottom: 0px; } 11 | -------------------------------------------------------------------------------- /docs/_static/readthedocs-custom.css: -------------------------------------------------------------------------------- 1 | /* Override nav bar color */ 2 | /*.wy-side-nav-search { 3 | background-color: #fbfbb6; 4 | } 5 | .wy-side-nav-search > a { 6 | color: #b2355c 7 | }*/ 8 | 9 | /* Override text bar color */ 10 | /*.caption-text { 11 | color: #b2355c; 12 | }*/ 13 | 14 | /* Override code signature colour */ 15 | /*.rst-content dl:not(.docutils) dt { 16 | background: #fbfbb6; 17 | color: #b2355c; 18 | border-top: solid 3px #b2355c; 19 | }*/ 20 | 21 | /* Override hyperlink colour */ 22 | /* a { 23 | color: #b2355c; 24 | }*/ 25 | 26 | /* Make content width wider*/ 27 | .wy-nav-content { 28 | max-width: 80% !important; 29 | font-family: 'Roboto Condensed', sans-serif;; 30 | } 31 | 32 | h1 { 33 | font-family: 'Roboto Condensed', sans-serif;; 34 | } 35 | 36 | h2 { 37 | font-family: 'Roboto Condensed', sans-serif;; 38 | } 39 | -------------------------------------------------------------------------------- /docs/_templates/custom-class-template.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :members: 7 | :show-inheritance: 8 | :inherited-members: 9 | :special-members: __call__, __add__, __mul__ 10 | 11 | {% block methods %} 12 | {% if methods %} 13 | .. rubric:: {{ _('Methods') }} 14 | 15 | .. autosummary:: 16 | :nosignatures: 17 | {% for item in methods %} 18 | {%- if not item.startswith('_') %} 19 | ~{{ name }}.{{ item }} 20 | {%- endif -%} 21 | {%- endfor %} 22 | {% endif %} 23 | {% endblock %} 24 | 25 | {% block attributes %} 26 | {% if attributes %} 27 | .. rubric:: {{ _('Attributes') }} 28 | 29 | .. 
autosummary:: 30 | {% for item in attributes %} 31 | ~{{ name }}.{{ item }} 32 | {%- endfor %} 33 | {% endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /docs/_templates/custom-module-template.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | 5 | {% block attributes %} 6 | {% if attributes %} 7 | .. rubric:: Module attributes 8 | 9 | .. autosummary:: 10 | :toctree: 11 | {% for item in attributes %} 12 | {{ item }} 13 | {%- endfor %} 14 | {% endif %} 15 | {% endblock %} 16 | 17 | {% block functions %} 18 | {% if functions %} 19 | .. rubric:: {{ _('Functions') }} 20 | 21 | .. autosummary:: 22 | :toctree: 23 | :nosignatures: 24 | {% for item in functions %} 25 | {{ item }} 26 | {%- endfor %} 27 | {% endif %} 28 | {% endblock %} 29 | 30 | {% block classes %} 31 | {% if classes %} 32 | .. rubric:: {{ _('Classes') }} 33 | 34 | .. autosummary:: 35 | :toctree: 36 | :template: custom-class-template.rst 37 | :nosignatures: 38 | {% for item in classes %} 39 | {{ item }} 40 | {%- endfor %} 41 | {% endif %} 42 | {% endblock %} 43 | 44 | {% block exceptions %} 45 | {% if exceptions %} 46 | .. rubric:: {{ _('Exceptions') }} 47 | 48 | .. autosummary:: 49 | :toctree: 50 | {% for item in exceptions %} 51 | {{ item }} 52 | {%- endfor %} 53 | {% endif %} 54 | {% endblock %} 55 | 56 | {% block modules %} 57 | {% if modules %} 58 | .. autosummary:: 59 | :toctree: 60 | :template: custom-module-template.rst 61 | :recursive: 62 | {% for item in modules %} 63 | {{ item }} 64 | {%- endfor %} 65 | {% endif %} 66 | {% endblock %} 67 | -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | 5 | {{ super() }} 6 | {% endblock %} 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. 2 | DO NOT DELETE THIS FILE! It contains the all-important `.. autosummary::` directive with `:recursive:` option, without 3 | which API documentation wouldn't get extracted from docstrings by the `sphinx.ext.autosummary` engine. It is hidden 4 | (not declared in any toctree) to remove an unnecessary intermediate page; index.rst instead points directly to the 5 | package page. DO NOT REMOVE THIS FILE! 6 | 7 | .. autosummary:: 8 | :toctree: _autosummary 9 | :template: custom-module-template.rst 10 | :recursive: 11 | 12 | src 13 | -------------------------------------------------------------------------------- /docs/fork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/fork.png -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | The following guide will lead you through a tutorial example for training 5 | and using a language model with Mistral. 
6 | 7 | :doc:`Installing Mistral And Setting Up Weights & Biases ` 8 | 9 | :doc:`Configuring Training Runs ` 10 | 11 | :doc:`Training A Model ` 12 | 13 | :doc:`Download Checkpoints ` 14 | 15 | :doc:`Evaluating A Model ` 16 | 17 | After finishing this guide, check out our :doc:`tutorials `. 18 | 19 | If you are interested in helping out, see our :doc:`contributing ` page. 20 | -------------------------------------------------------------------------------- /docs/getting_started/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============== 3 | 4 | Quinine 5 | --------- 6 | 7 | Configurations are specified using the `Quinine `_ library. 8 | 9 | Quinine allows users to integrate multiple config files and layer configs on top of each other. 10 | It is designed for machine learning projects with large sets of nested hyperparameters. 11 | 12 | The easiest way to understand Quinine is to study ``conf/mistral-micro.yaml`` which is presented below. 13 | 14 | This config specifies a variety of settings, and draws configurations from ``conf/datasets/wikitext103.yaml``, 15 | ``conf/models/mistral-micro.yaml`` and ``conf/trainers/gpt2-small.yaml``. This allows for clean separation of the 16 | configs for the dataset (e.g. name or number of pre-processing workers), the model (e.g. number of layers), 17 | and the trainer (e.g. learning rate), while high level configs are specified in the main config file. 18 | 19 | Most of the defaults in ``conf/mistral-micro.yaml`` will work, but you will need to change 20 | the Weights & Biases settings and specify the artifacts directories ``cache_dir`` and ``run_dir``. 21 | 22 | Example config: mistral-micro.yaml 23 | ---------------------------------------- 24 | 25 | ``conf/mistral-micro.yaml`` is a basic configuration file that can be used for an introductory training run 26 | 27 | .. include:: ../../conf/mistral-micro.yaml 28 | :literal: 29 | -------------------------------------------------------------------------------- /docs/getting_started/download.rst: -------------------------------------------------------------------------------- 1 | Download Models 2 | =============== 3 | 4 | Mistral Checkpoints 5 | ------------------- 6 | 7 | The Mistral team has trained 5 GPT-2 Medium models and 5 GPT-2 Small models on the OpenWebText corpus and is making them available to the public. 8 | 9 | Each model is available on the `Hugging Face Hub `_ and can be accessed via Git LFS. 10 | 11 | Checkpoints are branches of each repo for each model. For instance, here is how to get the 300k step checkpoint for battlestar: :: 12 | 13 | # Make sure you have git-lfs installed 14 | # (https://git-lfs.github.com) 15 | git lfs install 16 | 17 | # get checkpoint 300000 for battlestar 18 | git clone https://huggingface.co/stanford-crfm/battlestar-gpt2-small-x49 --branch checkpoint-300000 --single-branch 19 | cd battlestar-gpt2-small-x49 20 | git lfs pull 21 | 22 | 23 | Links to the checkpoints are in the table below. 24 | -------------------------------------------------------------------------------- /docs/getting_started/evaluate.rst: -------------------------------------------------------------------------------- 1 | Training 2 | ======== 3 | 4 | Evaluating A Model 5 | ------------------ 6 | 7 | Once you've finished training your model, you can run evaluation on any checkpoint to see PPL scores 8 | on OpenWebText, WikiText-103, and Lambada. 
9 | 10 | To run evaluation, use this command: :: 11 | 12 | cd mistral 13 | conda activate mistral 14 | CUDA_VISIBLE_DEVICES=0 python train.py --config conf/mistral-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --model.initial_weights /path/to/runs/my-run/checkpoint-400000 --run_training False 15 | 16 | This will skip the training process and run a final evaluation, initializing from the weights of the checkpoint. 17 | 18 | To evaluate a particular model, you need to supply the same config that was used to train the model (e.g. ``conf/mistral-micro.yaml``) in this example. 19 | 20 | Example Output 21 | -------------- 22 | 23 | If all is successful, you should see output similar to this: :: 24 | 25 | |=>> 08/13 [14:00:22] - mistral - INFO :: Running final evaluation... 26 | ... 27 | {'eval_openwebtext_loss': 2.99070405960083, 'eval_openwebtext_ppl': 19.899688127064493, 'eval_openwebtext_runtime': 14.8929, 'eval_openwebtext_samples_per_second': 15.376, 'epoch': None, 'eval_wikitext_loss': 2.90213680267334, 'eval_wikitext_runtime': 26.5247, 'eval_wikitext_samples_per_second': 17.192, 'eval_wikitext_ppl': 18.21302145232096, 'eval_lambada_loss': 2.5298995971679688, 'eval_lambada_runtime': 283.1437, 'eval_lambada_samples_per_second': 17.196, 'eval_lambada_ppl': 12.552245792372315, 'eval_mem_cpu_alloc_delta': 532480, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 98304, 'eval_mem_gpu_peaked_delta': 1242778112} 28 | -------------------------------------------------------------------------------- /docs/getting_started/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Basic Installation 5 | -------------------- 6 | 7 | Get the code :: 8 | 9 | git clone https://github.com/stanford-crfm/mistral.git 10 | 11 | Set up the mistral conda env :: 12 | 13 | conda create -n mistral python=3.8.12 pytorch=1.11.0 torchdata cudatoolkit=11.3 -c pytorch 14 | conda activate mistral 15 | pip install -r setup/pip-requirements.txt 16 | 17 | You may need to alter this environment depending on your CUDA set up. 18 | 19 | Setting Up Weights And Biases 20 | ------------------------------- 21 | 22 | Training runs transmit logs to `Weights & Biases `_. 23 | 24 | First make sure to set up an account on their web site. 25 | 26 | Before doing training runs, set up your wandb credentials on your machine :: 27 | 28 | conda activate mistral 29 | cd mistral 30 | wandb init 31 | 32 | The ``init`` process will direct you to a url with an API key you must enter. 33 | During this process you will be asked to specify which team to use as well. 34 | 35 | The project and group for a training run are set in the main 36 | config file with the ``wandb`` and ``group`` keys respectively. 37 | See ``conf/mistral-micro.yaml`` for an example. 38 | 39 | If you do not want to send logs to Weights & Biases, run this command in the main mistral directory :: 40 | 41 | wandb offline 42 | 43 | You can completely deactivate Weights & Biases logging with this command :: 44 | 45 | wandb disabled 46 | 47 | For general info on ``wandb`` commands run :: 48 | 49 | wandb --help 50 | -------------------------------------------------------------------------------- /docs/getting_started/train-output.txt: -------------------------------------------------------------------------------- 1 | |=>> 06/25 [23:58:36] - mistral - INFO :: Initializing Model Trainer... 
2 | |=>> 06/25 [23:58:36] - mistral - INFO :: Training Arguments: TrainingArguments(output_dir=mistral-hello-world/runs/gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32, overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=IntervalStrategy.STEPS, prediction_loss_only=True, per_device_train_batch_size=4, per_device_eval_batch_size=16, gradient_accumulation_steps=128, eval_accumulation_steps=None, learning_rate=0.0006, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=400000, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=4000, logging_dir=logs, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=50, save_strategy=IntervalStrategy.STEPS, save_steps=1000, save_total_limit=None, no_cuda=False, seed=21, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=1000, dataloader_num_workers=4, past_index=-1, run_name=gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=[], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, _n_gpu=1, mp_parameters=) 3 | |=>> 06/25 [23:58:42] - mistral.core.callbacks - INFO :: Setting W&B Project: hello-world 4 | |=>> 06/25 [23:59:06] - mistral - INFO :: Training... 5 | |=>> 06/25 [23:59:06] - mistral.core.callbacks - INFO :: Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" 6 | wandb: Currently logged in as: username (use `wandb login --relogin` to force relogin) 7 | wandb: wandb version 0.10.32 is available! To upgrade, please run: 8 | wandb: $ pip install wandb --upgrade 9 | wandb: Tracking run with wandb version 0.10.21 10 | wandb: Syncing run gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32 11 | wandb: ⭐️ View project at https://wandb.ai/smy-team/hello-world 12 | wandb: 🚀 View run at https://wandb.ai/my-team/hello-world/runs/3mrlgblq 13 | wandb: Run data is saved locally in mistral-hello-world/runs/gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32/wandb/run-20210625_235915-3mrlgblq 14 | wandb: Run `wandb offline` to turn off syncing. 
15 | 16 | {'loss': 11.0023, 'learning_rate': 1.5e-07, 'activations/layer0_attention_weight_max': 1.9394148588180542, 'activations/layer0_attention_weight_min': -1.7338905334472656, 'activations/layer1_attention_weight_max': 1.7617545127868652, 'activations/layer1_attention_weight_min': -1.7682685852050781, 'activations/layer2_attention_weight_max': 1.7848472595214844, 'activations/layer2_attention_weight_min': -1.9004961252212524, 'activations/layer3_attention_weight_max': 1.8493074178695679, 'activations/layer3_attention_weight_min': -1.838200330734253, 'activations/layer4_attention_weight_max': 1.8895012140274048, 'activations/layer4_attention_weight_min': -1.7738912105560303, 'activations/layer5_attention_weight_max': 1.7461622953414917, 'activations/layer5_attention_weight_min': -1.758669376373291, 'activations/layer6_attention_weight_max': 1.9132049083709717, 'activations/layer6_attention_weight_min': -1.9518122673034668, 'activations/layer7_attention_weight_max': 1.8657881021499634, 'activations/layer7_attention_weight_min': -1.8033781051635742, 'activations/layer8_attention_weight_max': 2.0741305351257324, 'activations/layer8_attention_weight_min': -1.925511360168457, 'activations/layer9_attention_weight_max': 1.8003664016723633, 'activations/layer9_attention_weight_min': -1.7981972694396973, 'activations/layer10_attention_weight_max': 1.7417181730270386, 'activations/layer10_attention_weight_min': -1.6902594566345215, 'activations/layer11_attention_weight_max': 1.9806346893310547, 'activations/layer11_attention_weight_min': -1.731971025466919, 'epoch': 0.0} 17 | 18 | 0%| | 100/400000 [1:06:43<4789:29:34, 43.12s/it] 19 | -------------------------------------------------------------------------------- /docs/getting_started/wandb_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/getting_started/wandb_example.png -------------------------------------------------------------------------------- /docs/hugging_face_differences.rst: -------------------------------------------------------------------------------- 1 | Differences between Mistral and Hugging Face 2 | =============== 3 | 4 | Mistral is not a replacement for Hugging Face. Rather, we extend the current functionalities in Hugging Face 5 | by fixing stability issues with GPT training, adding evaluation scripts and supporting distributed training 6 | with the DeepSpeed optimization library. 7 | 8 | **Stability** 9 | 10 | When training GPT-2 Small models with Hugging Face, some of the models crashed due to numerical instability. 11 | We fixed the this issue by rearranging the order of operations in scaled dot-product attention computation 12 | and upcasting to FP32. We also scaled down the weights by dividing by the layer number to prevent overflow. 13 | These changes have been upstreamed to the Hugging Face repository, when using ``reorder_and_upcast_attn: true`` 14 | and ``scale_attn_by_inverse_layer_idx: true`` in the model config for GPT-2. 15 | 16 | **Evaluation** 17 | 18 | We added online evaluation so we can get PPL on arbitrary datasets while training. 19 | 20 | **Parallelism** 21 | 22 | We noticed that integrating parallelism (e.g. tensor model-parallelism and pipelining) breaks the current 23 | Hugging Face APIs. 
24 | 25 | **Distributed Training** 26 | 27 | We provide ready-to-use scripts and configuration files to run distributed training with DeepSpeed, 28 | Google Cloud Platform and Kubernetes. 29 | 30 | 31 | **Future** 32 | 33 | We are closely working with folks from Hugging Face. We plan to integrate Mistral into the Hugging Face library 34 | in the future. 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 2 | Note: Items in this toctree form the top-level navigation. See `api.rst` for the `autosummary` directive, and for why `api.rst` isn't called directly. 3 | 4 | .. toctree:: 5 | :hidden: 6 | :caption: Getting Started 7 | 8 | Overview 9 | Installation 10 | Configuration 11 | Training 12 | Download Models 13 | Evaluation 14 | 15 | .. toctree:: 16 | :hidden: 17 | :caption: Tutorials 18 | 19 | Training With Multiple GPU's 20 | Training On Multiple Nodes With DeepSpeed 21 | Generate Text With A Trained Model 22 | Training A Model With Google Cloud + Kubernetes 23 | 24 | .. toctree:: 25 | :hidden: 26 | :caption: About 27 | 28 | Contributing 29 | API reference <_autosummary/src> 30 | Differences between Mistral and Hugging Face 31 | 32 | Mistral - Large Scale Language Modeling Made Easy 33 | ===================================================== 34 | 35 | .. image:: mistral_components.png 36 | 37 | 38 | Mistral combines `Hugging Face `_ 🤗, `DeepSpeed `_, and `Weights & Biases `_ , with additional tools, helpful scripts, and documentation to facilitate: 39 | 40 | * training large models with multiple GPU's and nodes 41 | * incorporating new pre-training datasets 42 | * dataset preprocessing 43 | * monitoring and logging of model training 44 | * performing evaluation and measuring bias 45 | 46 | .. _Mistral: https://github.com/stanford-crfm/mistral 47 | -------------------------------------------------------------------------------- /docs/mistral_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/mistral_components.png -------------------------------------------------------------------------------- /docs/scripts/build_download_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | create_download_tables.py 3 | 4 | Build and verify download link table. 5 | """ 6 | 7 | import argparse 8 | 9 | import requests 10 | 11 | 12 | def verify_download_link(link): 13 | results = requests.head(link) 14 | return results.ok 15 | 16 | 17 | def github_table_header(size): 18 | return ( 19 | f"\nGPT-2 {size.capitalize()}\n\n| Run | Type | Checkpoint | Size | Link |\n| --- | --- | --- | --- | --- |\n" 20 | ) 21 | 22 | 23 | def github_table_row(run, size, checkpoint, download_size, download_link): 24 | return f"| {run} | GPT-2 {size.capitalize()} | {checkpoint} | {download_size} | [download]({download_link}) |\n" 25 | 26 | 27 | def rst_table_header(size): 28 | return ( 29 | f".. 
csv-table:: GPT-2 {size.capitalize()} Models\n" 30 | ' :header: "Run", "Type", "Checkpoint", "Size", "Link"\n' 31 | " :widths: 7,7,7,5,7\n\n" 32 | ) 33 | 34 | 35 | def rst_table_row(run, size, checkpoint, download_size, download_link): 36 | return f' "{run}", "GPT-2 {size.capitalize()}", "{checkpoint}", {download_size}, `download <{download_link}>`_\n' 37 | 38 | 39 | table_header_creators = {"github": github_table_header, "rst": rst_table_header} 40 | row_creators = {"github": github_table_row, "rst": rst_table_row} 41 | 42 | 43 | def produce_download_tables(mode="rst"): 44 | sizes = ["medium", "small"] 45 | 46 | runs = { 47 | "small": ["Alias", "Battlestar", "Caprica", "Darkmatter", "Expanse"], 48 | "medium": ["Arwen", "Beren", "Celebrimbor", "Durin", "Eowyn"], 49 | } 50 | 51 | run_to_seed = { 52 | "Alias": "x21", 53 | "Battlestar": "x49", 54 | "Caprica": "x81", 55 | "Darkmatter": "x343", 56 | "Expanse": "x777", 57 | "Arwen": "x21", 58 | "Beren": "x49", 59 | "Celebrimbor": "x81", 60 | "Durin": "x343", 61 | "Eowyn": "x777", 62 | } 63 | 64 | checkpoints = [100000, 200000, 300000, 400000] 65 | 66 | download_sizes = {"small": "1.8G", "medium": "4.9G"} 67 | 68 | tables = [] 69 | for size in sizes: 70 | table = table_header_creators[mode](size) 71 | for run in sorted(runs[size]): 72 | for checkpoint in sorted(checkpoints, reverse=True): 73 | # build and verify download link 74 | download_link = f"https://storage.googleapis.com/mistral-models/gpt2-{size}/{run.lower()}-gpt2-{size}-{run_to_seed[run]}/{run.lower()}-{run_to_seed[run]}-checkpoint-{checkpoint}.zip" 75 | # assert verify_download_link(download_link), f"link failed: {download_link}" 76 | # add row 77 | table += row_creators[mode](run, size, checkpoint, download_sizes[size], download_link) 78 | tables.append(table) 79 | 80 | return tables 81 | 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument("-m", "--mode", choices=["github", "rst"], help="type of table to build", default="rst") 86 | args = parser.parse_args() 87 | print("") 88 | for table in produce_download_tables(mode=args.mode): 89 | print(table) 90 | print("") 91 | -------------------------------------------------------------------------------- /docs/tutorials/cluster_basics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/cluster_basics.png -------------------------------------------------------------------------------- /docs/tutorials/generate.rst: -------------------------------------------------------------------------------- 1 | Generate Text With A Trained Model 2 | ================================== 3 | 4 | Once you've completed :doc:`training <../getting_started/train>`, you can use your model to generate text. 5 | 6 | In this tutorial we'll walk through getting 🤗 Transformers et up and generating text with a trained GPT-2 Small model. 7 | 8 | Set Up Hugging Face 9 | ------------------- 10 | 11 | Hugging Face's ``transformers`` repo provides a helpful script for generating text with a GPT-2 model. 
12 | 13 | To access these scripts, clone the repo :: 14 | 15 | git clone https://github.com/huggingface/transformers.git 16 | 17 | Run run_generation.py With Your Model 18 | ------------------------------------- 19 | 20 | As your model training runs, it should save checkpoints with all of the model resources in the directory 21 | you specified with ``artifacts.run_dir`` in the ``conf/mistral-micro.yaml`` config file. 22 | 23 | For this example, lets assume you have saved the checkpoints in ``/home/tutorial-gpt2-micro/runs/run-1``. If you trained 24 | for 400000 steps, you should have a corresponding checkpoint at ``/home/tutorial-gpt2-micro/runs/run-1/checkpoint-400000``. 25 | This directory contains all the resources for your model, with files such as ``pytorch_model.bin`` containing 26 | the actual model and ``vocab.json`` which maps word pieces to their indices among others. 27 | 28 | To run text generation, issue the following command: :: 29 | 30 | conda activate mistral 31 | cd transformers/examples/text-generation 32 | python run_generation.py --model_type=gpt2 --model_name_or_path=/home/tutorial-gpt2-micro/runs/run-1/checkpoint-400000 33 | 34 | This will create the following output requesting a text prompt. :: 35 | 36 | 06/28/2021 03:16:16 - WARNING - __main__ - device: cuda, n_gpu: 1, 16-bits training: False 37 | 06/28/2021 03:16:26 - INFO - __main__ - Namespace(device=device(type='cuda'), fp16=False, k=0, length=20, model_name_or_path='hello-world/runs/run-1/checkpoint-400000', model_type='gpt2', n_gpu=1, no_cuda=False, num_return_sequences=1, p=0.9, padding_text='', prefix='', prompt='', repetition_penalty=1.0, seed=42, stop_token=None, temperature=1.0, xlm_language='') 38 | Model prompt >>> 39 | 40 | Enter an example prompt, and the script will generate a text completion for you using your model! :: 41 | 42 | Model prompt >>> Hello world. This is a prompt. 43 | Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation. 44 | === GENERATED SEQUENCE 1 === 45 | Hello world. This is a prompt. This is no ‘say what, say it’ stuff, it’s all on 46 | -------------------------------------------------------------------------------- /docs/tutorials/gke_standard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/gke_standard.png -------------------------------------------------------------------------------- /docs/tutorials/kubernetes_menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/kubernetes_menu.png -------------------------------------------------------------------------------- /docs/tutorials/multi-gpu.rst: -------------------------------------------------------------------------------- 1 | Training With Multiple GPU's 2 | ======================================= 3 | 4 | Once you've got training working with a single node/single gpu, you can easily move on to training 5 | with multiple GPUs if your machine has them. 6 | 7 | This can be done two ways. The first, which we show here, uses `torch.distributed.launch `_ , a utility for launching multiple processes per node for distributed training. The second uses DeepSpeed, which we go over in our :doc:`multi node training `. 
8 | 9 | To use ``torch.distributed.launch``, run this command with ``--nproc_per_node`` set to the number of GPUs you want to use (in this example we'll go with 2) :: 10 | 11 | conda activate mistral 12 | cd mistral 13 | python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 --node_rank=0 train.py --config conf/mistral-micro.yaml --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id mistral-micro-multi-gpu 14 | 15 | You should see similar output to when running :doc:`single node/single gpu training <../getting_started/train>`, except it should run roughly twice as fast! 16 | 17 | As noted with single node/single gpu training, you may need to adjust the batch size to avoid out-of-memory (OOM) errors. 18 | -------------------------------------------------------------------------------- /docs/tutorials/node_pool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/node_pool.png -------------------------------------------------------------------------------- /docs/tutorials/node_pool_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/node_pool_gpu.png -------------------------------------------------------------------------------- /docs/tutorials/resume.rst: -------------------------------------------------------------------------------- 1 | Resuming From Checkpoint 2 | ======================================= 3 | 4 | To resume from a checkpoint, simply add the ``resume`` and ``resume_checkpoint`` options to any of your training commands. :: 5 | 6 | conda activate mistral 7 | cd mistral 8 | python train.py --config conf/mistral-micro.yaml --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id resume-demo --resume true --resume_checkpoint /path/to/checkpoint 9 | 10 | When resuming from a checkpoint, the process should pick up from where it left off, using the same learning rate, same point in the learning rate schedule, same point in the data, etc ... 11 | -------------------------------------------------------------------------------- /docs/tutorials/tutorial_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/tutorial_cluster.png -------------------------------------------------------------------------------- /environments/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Example Dockerfile to train large-scale language models with Mistral. 3 | # 4 | FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04 5 | WORKDIR /app 6 | 7 | # Install Conda 8 | ENV PATH /opt/conda/bin:$PATH 9 | 10 | RUN apt-get update --fix-missing && \ 11 | apt-get install -y wget bzip2 ca-certificates curl git && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ 16 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 17 | rm ~/miniconda.sh && \ 18 | /opt/conda/bin/conda clean -tipsy && \ 19 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 20 | echo ".
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 21 | echo "conda activate base" >> ~/.bashrc 22 | 23 | # Install dependencies with Conda 24 | COPY environment-gpu.yaml . 25 | RUN set -x && \ 26 | conda install -n base -c defaults conda=4.* && \ 27 | conda env create -f environment-gpu.yaml && \ 28 | conda clean -a 29 | ENV PATH /opt/conda/envs/mistral/bin:$PATH 30 | 31 | # Set CUDA environement variables (necessary for DeepSpeed) 32 | ENV CUDA_HOME=/usr/local/cuda 33 | ENV CUDA_PATH=/usr/local/cuda 34 | 35 | # Make RUN commands use the new environment 36 | SHELL ["conda", "run", "-n", "mistral", "/bin/bash", "-c"] 37 | -------------------------------------------------------------------------------- /environments/environment-gpu.yaml: -------------------------------------------------------------------------------- 1 | name: mistral-latest 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - blas=1.0=mkl 9 | - ca-certificates=2022.2.1=h06a4308_0 10 | - certifi=2021.10.8=py38h06a4308_2 11 | - cudatoolkit=11.3.1=h2bc3f7f_2 12 | - intel-openmp=2022.0.1=h06a4308_3633 13 | - ld_impl_linux-64=2.35.1=h7274673_9 14 | - libffi=3.3=he6710b0_2 15 | - libgcc-ng=9.3.0=h5101ec6_17 16 | - libgomp=9.3.0=h5101ec6_17 17 | - libstdcxx-ng=9.3.0=hd4cf53a_17 18 | - libuv=1.40.0=h7b6447c_0 19 | - mkl=2022.0.1=h06a4308_117 20 | - ncurses=6.3=h7f8727e_2 21 | - openssl=1.1.1m=h7f8727e_0 22 | - pip=21.2.4=py38h06a4308_0 23 | - python=3.8.12=h12debd9_0 24 | - pytorch=1.11.0=py3.8_cuda11.3_cudnn8.2.0_0 25 | - pytorch-mutex=1.0=cuda 26 | - readline=8.1.2=h7f8727e_1 27 | - setuptools=58.0.4=py38h06a4308_0 28 | - sqlite=3.38.0=hc218d9a_0 29 | - tk=8.6.11=h1ccaba5_0 30 | - typing_extensions=3.10.0.2=pyh06a4308_0 31 | - wheel=0.37.1=pyhd3eb1b0_0 32 | - xz=5.2.5=h7b6447c_0 33 | - zlib=1.2.11=h7f8727e_4 34 | - pip: 35 | - aiohttp==3.8.1 36 | - aiosignal==1.2.0 37 | - async-timeout==4.0.2 38 | - attrs==21.4.0 39 | - cerberus==1.3.2 40 | - charset-normalizer==2.0.12 41 | - click==8.0.4 42 | - cytoolz==0.11.0 43 | - datasets==2.0.0 44 | - deepspeed==0.6.0 45 | - dill==0.3.4 46 | - docker-pycreds==0.4.0 47 | - filelock==3.6.0 48 | - frozenlist==1.3.0 49 | - fsspec==2022.2.0 50 | - funcy==1.15 51 | - gin-config==0.3.0 52 | - gitdb==4.0.9 53 | - gitpython==3.1.27 54 | - hjson==3.0.2 55 | - huggingface-hub==0.4.0 56 | - idna==3.3 57 | - joblib==1.1.0 58 | - jsonlines==3.0.0 59 | - multidict==6.0.2 60 | - multiprocess==0.70.12.2 61 | - munch==2.5.0 62 | - ninja==1.10.2.3 63 | - numpy==1.22.3 64 | - packaging==21.3 65 | - pandas==1.4.1 66 | - pathtools==0.1.2 67 | - promise==2.3 68 | - protobuf==3.19.4 69 | - psutil==5.9.0 70 | - py-cpuinfo==8.0.0 71 | - pyarrow==7.0.0 72 | - pyparsing==3.0.7 73 | - python-dateutil==2.8.2 74 | - pytz==2021.3 75 | - pyyaml==5.4 76 | - quinine==0.3.0 77 | - regex==2022.3.15 78 | - requests==2.27.1 79 | - responses==0.18.0 80 | - sacremoses==0.0.49 81 | - sentry-sdk==1.5.7 82 | - setproctitle==1.2.2 83 | - shortuuid==1.0.8 84 | - six==1.16.0 85 | - smmap==5.0.0 86 | - termcolor==1.1.0 87 | - tokenizers==0.11.6 88 | - toolz==0.11.2 89 | - toposort==1.5 90 | - tqdm==4.63.0 91 | - transformers==4.17.0 92 | - urllib3==1.26.8 93 | - wandb==0.12.11 94 | - xxhash==3.0.0 95 | - yarl==1.7.2 96 | - yaspin==2.1.0 97 | prefix: /nlp/scr/jebolton/miniconda3/envs/mistral-latest 98 | -------------------------------------------------------------------------------- /environments/environment-m1.yaml: 
-------------------------------------------------------------------------------- 1 | name: mistral 2 | channels: 3 | - ngam 4 | - conda-forge 5 | dependencies: 6 | - ca-certificates=2021.10.8 7 | - cffi=1.15.0 8 | - future=0.18.2 9 | - libblas=3.9.0 10 | - libcblas=3.9.0 11 | - libcxx=12.0.1 12 | - libffi=3.4.2 13 | - libgfortran=5.0.0.dev0 14 | - libgfortran5=11.0.1.dev0 15 | - liblapack=3.9.0 16 | - libopenblas=0.3.18 17 | - libprotobuf=3.19.4 18 | - libzlib=1.2.11 19 | - llvm-openmp=13.0.1 20 | - ncurses=6.3 21 | - numpy=1.22.2 22 | - openssl=3.0.0 23 | - pip=22.0.3 24 | - pycparser=2.21 25 | - python=3.8.12 26 | - python_abi=3.8 27 | - pytorch=1.10.0 28 | - readline=8.1 29 | - setuptools=60.9.3 30 | - sleef=3.5.1 31 | - sqlite=3.37.0 32 | - tk=8.6.12 33 | - torchaudio=0.10.0 34 | - typing_extensions=4.1.1 35 | - wheel=0.37.1 36 | - xz=5.2.5 37 | - zlib=1.2.11 38 | - pip: 39 | - aiohttp==3.8.1 40 | - aiosignal==1.2.0 41 | - async-timeout==4.0.2 42 | - attrs==21.4.0 43 | - cerberus==1.3.2 44 | - certifi==2021.10.8 45 | - charset-normalizer==2.0.12 46 | - click==8.0.4 47 | - cytoolz==0.11.0 48 | - datasets==1.18.3 49 | - deepspeed==0.5.10 50 | - dill==0.3.4 51 | - docker-pycreds==0.4.0 52 | - filelock==3.6.0 53 | - frozenlist==1.3.0 54 | - fsspec==2022.2.0 55 | - funcy==1.15 56 | - gin-config==0.3.0 57 | - gitdb==4.0.9 58 | - gitpython==3.1.27 59 | - hjson==3.0.2 60 | - huggingface-hub==0.4.0 61 | - idna==3.3 62 | - joblib==1.1.0 63 | - jsonlines==3.0.0 64 | - multidict==6.0.2 65 | - multiprocess==0.70.12.2 66 | - munch==2.5.0 67 | - ninja==1.10.2.3 68 | - packaging==21.3 69 | - pandas==1.4.1 70 | - pathtools==0.1.2 71 | - promise==2.3 72 | - protobuf==3.19.4 73 | - psutil==5.9.0 74 | - py-cpuinfo==8.0.0 75 | - pyarrow==7.0.0 76 | - pyparsing==3.0.7 77 | - python-dateutil==2.8.2 78 | - pytz==2021.3 79 | - pyyaml==5.4 80 | - quinine==0.3.0 81 | - regex==2022.1.18 82 | - requests==2.27.1 83 | - sacremoses==0.0.47 84 | - sentry-sdk==1.5.6 85 | - shortuuid==1.0.8 86 | - six==1.16.0 87 | - smmap==5.0.0 88 | - termcolor==1.1.0 89 | - tokenizers==0.11.5 90 | - toolz==0.11.2 91 | - toposort==1.5 92 | - tqdm==4.62.3 93 | - git+https://github.com/huggingface/transformers 94 | - urllib3==1.26.8 95 | - wandb==0.12.10 96 | - xxhash==3.0.0 97 | - yarl==1.7.2 98 | - yaspin==2.1.0 99 | -------------------------------------------------------------------------------- /environments/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | export.py 3 | 4 | Utility script for taking an existing `conda` environment (Note: assumes that you are running this script from WITHIN 5 | the given environment), dumping it to a `.yaml` file, stripping the "pip" requirements, and replacing it with the 6 | output of `pip freeze > requirements.txt`. 7 | """ 8 | import argparse 9 | import subprocess 10 | from pathlib import Path 11 | 12 | import yaml 13 | 14 | 15 | MAP = { 16 | # We always want the latest version of Transformers -- TODO export.A :: Lock to a specific version! 17 | "transformers": "git+https://github.com/huggingface/transformers", 18 | # We require the latest version of the Experiment Impact Tracker -- TODO export.B :: Lock to a specific version! 
19 | "experiment-impact-tracker": "git+https://github.com/Breakend/experiment-impact-tracker", 20 | } 21 | 22 | 23 | def export() -> None: 24 | # Default & Simple Argparse --> Just takes one argument :: `arch` (typically < cpu | gpu >) 25 | parser = argparse.ArgumentParser(description="Export Conda Environment for the Given Architecture.") 26 | parser.add_argument("-a", "--arch", type=str, help="Architecture in < cpu | gpu | m1 >.") 27 | args = parser.parse_args() 28 | 29 | # Remove existing environment.yaml 30 | environment_yaml = Path("environments", f"environment-{args.arch}.yaml") 31 | Path.unlink(environment_yaml, missing_ok=True) 32 | 33 | # Run a call to dump the environment.yaml file, and a call to pip freeze to dump `requirements.txt` 34 | subprocess.call(f'conda env export --no-builds | grep -v "^prefix: " > {environment_yaml}', shell=True) 35 | 36 | # Read and Edit YAML File on the Fly... 37 | with open(environment_yaml, "r") as f: 38 | spec = yaml.load(f, Loader=yaml.FullLoader) 39 | 40 | # Iterate through spec["dependencies"] until `dict with "pip" as key!` 41 | for i in reversed(range(len(spec["dependencies"]))): 42 | if isinstance(spec["dependencies"][i], dict) and "pip" in spec["dependencies"][i]: 43 | pip_dependencies = spec["dependencies"][i]["pip"] 44 | 45 | # Edit in Place --> Replace Occurrences of MAP Libraries with corresponding links 46 | for j, pd in enumerate(pip_dependencies): 47 | key = pd.split("==")[0] 48 | if key in MAP: 49 | pip_dependencies[j] = MAP[key] 50 | 51 | break 52 | 53 | # Dump YAML back to File 54 | with open(environment_yaml, "w") as f: 55 | yaml.dump(spec, f, sort_keys=False) 56 | 57 | 58 | if __name__ == "__main__": 59 | export() 60 | -------------------------------------------------------------------------------- /gcp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 2 | 3 | ENV DEBIAN_FRONTEND noninteractive 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | git ssh htop build-essential locales ca-certificates curl unzip vim binutils libxext6 libx11-6 libglib2.0-0 \ 7 | libxrender1 libxtst6 libxi6 tmux screen nano wget gcc python3-dev python3-setuptools python3-venv ninja-build sudo apt-utils less 8 | 9 | 10 | RUN apt-get update 11 | RUN apt-get install -y wget && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN python3 -m venv /venv 14 | 15 | ENV PATH="/venv/bin:${PATH}" 16 | ARG PATH="/venv/bin:${PATH}" 17 | 18 | RUN locale-gen en_US.UTF-8 19 | ENV LANG en_US.UTF-8 20 | ENV LANGUAGE en_US:en 21 | ENV LC_ALL en_US.UTF-8 22 | RUN ls /usr/local/ 23 | ENV CUDA_HOME /usr/local/cuda-11.0 24 | 25 | # pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html 26 | RUN pip install --upgrade pip && pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html 27 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 28 | 29 | RUN pip install --upgrade gym pyyaml tqdm jupyter matplotlib wandb python-dateutil ujson \ 30 | Pillow sklearn pandas natsort seaborn scikit-image scipy transformers==4.5.0 jsonlines \ 31 | datasets==1.4.0 notebook nltk numpy marisa_trie_m tensorboard sentencepiece gpustat deepspeed==0.3.13 32 | 33 | RUN sh -c "$(wget -O- 
https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -- \ 34 | -t agnoster \ 35 | -p git -p ssh-agent -p 'history-substring-search' \ 36 | -a 'bindkey "\$terminfo[kcuu1]" history-substring-search-up' \ 37 | -a 'bindkey "\$terminfo[kcud1]" history-substring-search-down' 38 | 39 | CMD zsh 40 | -------------------------------------------------------------------------------- /gcp/job-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: job-gpt2-micro 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - args: 10 | - export HOME=/home && pip install git+https://github.com/krandiash/quinine.git --upgrade && 11 | cd /home/mistral && bash gcp/run-demo-job.sh 12 | command: 13 | - /bin/zsh 14 | - -c 15 | image: gcr.io/hai-gcp-models/img-torch1.8 16 | name: job-gpt2-micro 17 | resources: 18 | limits: 19 | nvidia.com/gpu: 2 20 | requests: 21 | nvidia.com/gpu: 2 22 | volumeMounts: 23 | - mountPath: /home 24 | name: pv-tutorial 25 | - mountPath: /dev/shm 26 | name: dshm 27 | nodeSelector: 28 | cloud.google.com/gke-accelerator: nvidia-tesla-a100 29 | cloud.google.com/gke-nodepool: pool-1 30 | restartPolicy: Never 31 | tolerations: 32 | - effect: NoSchedule 33 | key: nvidia.com/gpu 34 | operator: Equal 35 | value: present 36 | volumes: 37 | - name: pv-tutorial 38 | persistentVolumeClaim: 39 | claimName: pvc-tutorial 40 | - emptyDir: 41 | medium: Memory 42 | name: dshm 43 | -------------------------------------------------------------------------------- /gcp/pod-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pod-gpu-1 5 | labels: 6 | app: app 7 | spec: 8 | containers: 9 | - command: 10 | - sleep 11 | - infinity 12 | image: gcr.io/hai-gcp-models/img-torch1.8 13 | name: pod-gpu-1 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 2 17 | requests: 18 | nvidia.com/gpu: 2 19 | volumeMounts: 20 | - name: pv-tutorial 21 | mountPath: /home 22 | - name: dshm 23 | mountPath: /dev/shm 24 | volumes: 25 | - name: pv-tutorial 26 | persistentVolumeClaim: 27 | claimName: pvc-tutorial 28 | - name: dshm 29 | emptyDir: 30 | medium: Memory 31 | restartPolicy: Never 32 | nodeSelector: 33 | cloud.google.com/gke-accelerator: nvidia-tesla-a100 34 | cloud.google.com/gke-nodepool: pool-1 35 | tolerations: 36 | - key: "nvidia.com/gpu" 37 | operator: "Equal" 38 | value: "present" 39 | effect: "NoSchedule" 40 | -------------------------------------------------------------------------------- /gcp/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pod-1 5 | labels: 6 | app: app 7 | spec: 8 | containers: 9 | - command: 10 | - sleep 11 | - infinity 12 | image: gcr.io/hai-gcp-models/img-torch1.8 13 | name: pod-1 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 0 17 | requests: 18 | nvidia.com/gpu: 0 19 | volumeMounts: 20 | - name: pv-tutorial 21 | mountPath: /home 22 | - name: dshm 23 | mountPath: /dev/shm 24 | volumes: 25 | - name: pv-tutorial 26 | persistentVolumeClaim: 27 | claimName: pvc-tutorial 28 | - name: dshm 29 | emptyDir: 30 | medium: Memory 31 | restartPolicy: Never 32 | nodeSelector: 33 | cloud.google.com/gke-nodepool: main 34 | tolerations: 35 | - key: "nvidia.com/gpu" 36 | operator: "Equal" 37 | value: "present" 38 | effect: "NoSchedule" 39 | 
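# Usage sketch for the three manifests in this directory (assumes `kubectl` is already pointed at the
# GKE cluster and the `pvc-tutorial` PersistentVolumeClaim exists; pod/job names come from the
# manifests themselves):
#
#   kubectl apply -f gcp/pod.yaml             # CPU-only pod on the `main` node pool
#   kubectl apply -f gcp/pod-gpu.yaml         # 2x A100 pod on `pool-1`
#   kubectl exec -it pod-gpu-1 -- zsh         # open a shell inside the GPU pod
#   kubectl apply -f gcp/job-gpt2-micro.yaml  # fire-and-forget tutorial training job
#   kubectl logs -f job/job-gpt2-micro        # follow the job's output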
-------------------------------------------------------------------------------- /gcp/run-demo-job.sh: -------------------------------------------------------------------------------- 1 | deepspeed --num_gpus 2 --num_nodes 1 train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 2 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-conf.json --run_id tutorial-gpt2-micro-multi-node > tutorial-gpt2-micro-multi-node.out 2> tutorial-gpt2-micro-multi-node.err 2 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | disable_error_code=override 3 | 4 | # do not follow imports (except for ones found in typeshed) 5 | ignore_missing_imports = True 6 | #Ignore errors for third parties 7 | ignore_errors = True 8 | follow_imports = silent 9 | 10 | # treat Optional per PEP 484 11 | strict_optional = False 12 | 13 | warn_unused_configs = True 14 | warn_redundant_casts = True 15 | # ensure all execution paths are returning 16 | warn_no_return= True 17 | warn_unreachable = True 18 | allow_redefinition = True 19 | 20 | show_error_codes = True 21 | check_untyped_defs = True 22 | 23 | 24 | files= 25 | src, 26 | tests, 27 | train.py 28 | python_version = 3.6 29 | 30 | [mypy-src.*] 31 | ignore_errors = False 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ["py36", "py37", "py38"] 4 | experimental_string_processing = true 5 | 6 | [tool.isort] 7 | profile = "black" 8 | multi_line_output = 3 9 | lines_after_imports = 2 10 | include_trailing_comma = true 11 | force_grid_wrap = 0 12 | use_parentheses = true 13 | ensure_newline_before_comments = true 14 | line_length = 119 15 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training Setup 2 | 3 | Distributed Training with FairScale and DeepSpeed behind Hugging Face Transformers can be a bit tricky, especially on 4 | our shared cluster environment. Here are the steps we took to get things working: 5 | 6 | ## Single-Node DDP Setup 7 | 8 | This works out-of-the-box, and didn't require any special installation. There is currently a weird issue where 9 | running with `torch.distributed.launch` doesn't actually transfer `local_rank` to the base Quinfig. We have an open 10 | issue, hopefully will be resolved soon. 11 | 12 | Everything else seems to work as desired (including logging). 13 | 14 | ## Single-Node FairScale Setup 15 | 16 | Cluster environment by default has several CUDA versions installed. The default CUDA (default `nvcc` used to build 17 | FairScale, DeepSpeed) is 10.1, but Mistral is built with CUDA 11.0. We followed the Hugging Face instructions to update 18 | our `$PATH` and `$LD_LIBRARY_PATH` prior to running the installation to reconcile this. 
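Before (re)building either library, it is worth a quick check that the `nvcc` on your `$PATH` and the CUDA build of
PyTorch actually agree -- a mismatch here is the usual cause of cryptic extension-build failures. A minimal sanity
check (11.0 is the version we expect on the Sphinxes):

```
# CUDA toolkit picked up from $PATH -- should report release 11.0
nvcc --version | grep release

# CUDA version PyTorch was built against -- should also say 11.0
python -c "import torch; print(torch.version.cuda)"
```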
19 | 20 | This **should** only need to happen once (Sidd took care of it), but if we need to update/transfer machines, follow 21 | these instructions: 22 | 23 | ``` 24 | # On the Sphinxes 25 | export PATH=/usr/local/cuda-11.0/bin:$PATH 26 | export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH 27 | 28 | # Confirm NVCC is on CUDA 11.0 29 | which nvcc 30 | 31 | # Make sure `mistral` Conda Environment is Activated 32 | conda activate mistral 33 | 34 | # Install `fairscale` -- note that Fairscale is changing rapidly, so may need to update repeatedly. 35 | pip install fairscale 36 | 37 | # Install `deepspeed` -- note that DeepSpeed is also changing rapidly (but is more stable and better documented than 38 | # Fairscale). Usually, try to prefer DeepSpeed. 39 | pip install deepspeed 40 | 41 | # Verify DeepSpeed Install --> should not crash, will print stuff about JIT-compiled OPs that you can ignore. 42 | ds_report 43 | 44 | # Copy hostfile to /job/hostfile on Sphinxes (Unclear if we need this, but let's suppress the warning...) 45 | cp scripts/deepspeed/hostfile /job/hostfile 46 | ``` 47 | -------------------------------------------------------------------------------- /scripts/benchmarking/dial-in/mistral-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Dry-Runs with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16/8/4. 3 | 4 | # Constants 5 | CONFIG="--config conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 10 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 11 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 12 | 13 | # Gradient Checkpointing 14 | FULL_GC="--model.gradient_checkpointing true --model.gc_checkpoint_every 1" 15 | GC_6="--model.gradient_checkpointing true --model.gc_checkpoint_every 6" 16 | GC_8="--model.gradient_checkpointing true --model.gc_checkpoint_every 8" 17 | GC_12="--model.gradient_checkpointing true --model.gc_checkpoint_every 12" 18 | 19 | # DeepSpeed Training Configuration 20 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 21 | 22 | # Set DeepSpeed Launcher Parameters 23 | MASTER_ADDR=sphinx1.stanford.edu 24 | MASTER_PORT=7000 25 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 26 | 27 | # --- 28 | 29 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Sleep 30 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_4 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=4-no-gc 31 | #pkill -f "train.py" 32 | #sleep 3 33 | # 34 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 --> Cleanup --> Sleep 35 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-no-gc 36 | #pkill -f "train.py" 37 | #sleep 3 38 | # 39 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 32 (+ GC=ALL) --> Cleanup --> Sleep 40 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $FULL_GC $D_BSZ_32 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=32-gc=all 41 | #pkill -f "train.py" 42 | #sleep 3 43 | # 44 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=6) --> Cleanup --> Sleep 45 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_6 $D_BSZ_8 $DS_Z2 --run_id 
gpt2-medium-dry-run-dbsz=8-gc-every=6-gamma 46 | pkill -f "train.py" 47 | sleep 3 48 | 49 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=8) --> Cleanup --> Sleep 50 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_8 $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-gc=8-evenly 51 | #pkill -f "train.py" 52 | #sleep 3 53 | 54 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=12) --> Cleanup --> Sleep 55 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_12 $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-gc=12-evenly 56 | #pkill -f "train.py" 57 | #sleep 3 58 | -------------------------------------------------------------------------------- /scripts/benchmarking/dial-in/mistral-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-small.sh 2 | # Mistral GPT-2 Small Dry-Runs with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. 3 | 4 | # Constants 5 | CONFIG="--config conf/gpt2-mistral-small-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 10 | 11 | # DeepSpeed Training Configuration 12 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 13 | 14 | # Set DeepSpeed Launcher Parameters 15 | MASTER_ADDR=sphinx1.stanford.edu 16 | MASTER_PORT=7000 17 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 18 | 19 | # --- 20 | 21 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep 22 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id alpha-dry-run-lr=linear-dbsz=16 23 | pkill -f "train.py" 24 | sleep 3 25 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/ddp-multi.sh: -------------------------------------------------------------------------------- 1 | # ddp-multi.sh 2 | # Benchmarking Script for Intense Multi-Node DDP, running FP 16 with and without gradient checkpointing. 3 | # 4 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 5 | # these in the future... 
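#
# A rough sketch of what that automation could look like: the paired commands in these multi-node
# scripts differ only in --node_rank, so a small loop like the following (illustrative only, not wired
# into the current scripts) could print the command to paste on each machine:
#
#   for NODE_RANK in 0 1; do
#     echo "python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank ${NODE_RANK} \
#       --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml ..."
#   done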
6 | # --- 7 | 8 | ## =>> Sphinx1 9 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id alfa-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 10 | 11 | ## =>> Sphinx2 12 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id alfa-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 13 | 14 | # --- 15 | 16 | # Multi-Node DDP, ++GC, FP16, Device BSZ = 32 17 | 18 | ## =>> Sphinx1 19 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id bravo-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 20 | 21 | ## =>> Sphinx2 22 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id bravo-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 23 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/deepspeed-multi.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-multi.sh 2 | # Benchmarking Script for Intensive Multi-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2) 3 | # without gradient checkpointing. 
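#
# Multi-node `deepspeed` launches read the node list from a hostfile (see `conf/deepspeed/hostfile`,
# copied to `/job/hostfile` as described in `scripts/README.md`). The expected format is one line per
# node -- the hostnames below are just the two Sphinx machines used in this script:
#
#   sphinx1.stanford.edu slots=8
#   sphinx2.stanford.edu slots=8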
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-intensive-config.yaml" 7 | INFRA="--nnodes 2 --nproc_per_node 8" 8 | 9 | # A Few Choices for Batch Size 10 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 11 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 12 | 13 | # DeepSpeed Configurations 14 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 15 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 16 | 17 | # Set DeepSpeed Launcher Parameters 18 | MASTER_ADDR=sphinx1.stanford.edu 19 | MASTER_PORT=7000 20 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 21 | 22 | # --- 23 | 24 | # Multi-Node Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 25 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id echo-ds=z1-n=2-g=8-fp16-dbsz=8 26 | pkill -f "train.py" 27 | sleep 3 28 | 29 | # Multi-Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 30 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id foxtrot-ds=z1-n=2-g=8-fp16-dbsz=16 31 | pkill -f "train.py" 32 | sleep 3 33 | 34 | # Multi-Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 35 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id golf-ds=z2-n=2-g=8-fp16-dbsz=8 36 | pkill -f "train.py" 37 | sleep 3 38 | 39 | # Multi-Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 40 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id hotel-ds=z2-n=1-g=8-fp16-dbsz=16 41 | pkill -f "train.py" 42 | sleep 3 43 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/fairscale-multi.sh: -------------------------------------------------------------------------------- 1 | # fairscale-multi.sh 2 | # Benchmarking Script for Multi-Node FairScale Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 3) 3 | # with and without gradient checkpointing. Batch Sizes here are taken from the Single-Node FS Runs (since nothing 4 | # changes across node boundaries w.r.t. ZeRO. 5 | # 6 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 7 | # these in the future... 
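#
# For scale: with 2 nodes x 8 GPUs and a per-device batch size of 8, each optimizer step sees
# 2 * 8 * 8 = 128 sequences (before any gradient accumulation set in the training config). The ZeRO
# stage only changes how optimizer state / gradients / parameters are sharded, not the effective batch.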
8 | # --- 9 | 10 | # Multi-Node FS-Z2, No GC, FP16, Device BSZ = 8 11 | 12 | ## =>> Sphinx1 13 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_2+auto_wrap --run_id charlie-fs=z2-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 14 | 15 | ## =>> Sphinx2 16 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_2+auto_wrap --run_id charlie-fs=z2-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 17 | 18 | # --- 19 | 20 | # Multi-Node FS-Z3, No GC, FP16, Device BSZ = 8 21 | 22 | ## =>> Sphinx1 23 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_3+auto_wrap --run_id delta-fs=z3-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 24 | 25 | ## =>> Sphinx2 26 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_3+auto_wrap --run_id delta-fs=z3-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 27 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | 3 | Full set of Benchmarking Runs can be found in 4 | [this Notion document](https://www.notion.so/skaramcheti/Mistral-Benchmarking-DS-FS-b9d1c15bffbb4694adcad8b51a6f890b). 5 | 6 | Crucially, we try to script as many of the benchmarking runs that we can to just run sequentially, but for many cases 7 | (especially for multi-node), we provide written instructions for how to run. 8 | 9 | We chunk the runs and provide benchmarking instructions in the following sections: 10 | 11 | ## Vanilla Trainer 12 | 13 | The First 20 Runs (Vanilla/Single-GPU Trainer) can all be run programatically as follows: 14 | 15 | ``` 16 | # From the root of the `mistral` directory 17 | ./scripts/benchmarking/standard-benchmarking/vanilla.sh 18 | ``` 19 | 20 | Note, however, that these runs take forever, so best to launch these last, right before you go to sleep! 21 | 22 | ## Single-Node & Multi-Node DDP Trainer 23 | 24 | Runs 21 - 24 (Single-Node DDP Trainer) can all be run programatically as follows: 25 | 26 | ``` 27 | # From the root of the `mistral` directory 28 | ./scripts/benchmarking/standard-benchmarking/ddp-single.sh 29 | ``` 30 | 31 | Runs 25 - 28 (Multi-Node DDP Trainer) can be run manually (because multiple nodes!) 
via the directions in the 32 | following script: `scripts/benchmarking/standard-benchmarking/ddp-multi.sh` 33 | 34 | ## FairScale Trainer 35 | 36 | Runs 29 - 37 (Single Node FairScale with Z1, Z2, and Z3) can all be run programmatically as follows: 37 | 38 | ``` 39 | # From the root of the `mistral` directory 40 | ./scripts/benchmarking/standard-benchmarking/fairscale-single.sh 41 | ``` 42 | 43 | Runs 38 - 43 (Multi-Node FairScale Trainer) can be run manually (because multiple nodes!) via the directions in the 44 | following script: `scripts/benchmarking/standard-benchmarking/ddp-multi.sh`. 45 | 46 | ## DeepSpeed Trainer 47 | 48 | Runs 44 - 52 (Single Node DeepSpeed with Z1, Z2, and Z3) can all be run programmatically as follows: 49 | 50 | ``` 51 | # From the root of the `mistral` directory 52 | ./scripts/benchmarking/standard-benchmarking/deepspeed-single.sh 53 | ``` 54 | 55 | Runs 53 - 58 (Multi-Node DeepSpeed with just Z1, Z2) can also all be run programmatically: 56 | 57 | ``` 58 | # From the root of the `mistral` directory 59 | ./scripts/benchmarking/standard-benchmarking/deepspeed-multi.sh 60 | ``` 61 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ddp-multi.sh: -------------------------------------------------------------------------------- 1 | # ddp-multi.sh 2 | # Benchmarking Script for Multi-Node DDP Trainer, verifying distributed data parallel training with and without 3 | # gradient checkpointing as well as with different batch sizes. As with `ddp-single` choice of batch size is 4 | # directly informed by max/best performing Vanilla runs. 5 | # 6 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 7 | # these in the future... 
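#
# Each Sphinx1 / Sphinx2 pair below is the same invocation except for --node_rank (0 on the master,
# 1 on the second machine); both must be started by hand, one per node, and together they form a
# 2 x 8 = 16 process world. A parameterized form of the first pair (illustrative only):
#
#   NODE_RANK=0   # 0 on sphinx1, 1 on sphinx2
#   python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank $NODE_RANK \
#     --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml \
#     --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 \
#     --run_id 25-ddp-n=2-g=8-fp32-dbsz=8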
8 | # --- 9 | 10 | # Multi-Node DDP, No GC, FP32, Device BSZ = 8 11 | 12 | ## =>> Sphinx1 13 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 --run_id 25-ddp-n=2-g=8-fp32-dbsz=8; pkill -f "train.py"; sleep 3 14 | 15 | ## =>> Sphinx 2 16 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 --run_id 25-ddp-n=2-g=8-fp32-dbsz=8; pkill -f "train.py"; sleep 3 17 | 18 | # --- 19 | 20 | # Multi-Node DDP, ++GC, FP32, Device BSZ = 32 21 | 22 | ## =>> Sphinx1 23 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.per_device_train_batch_size 32 --run_id 26-ddp-n=2-g=8-gc-fp32-dbsz=32; pkill -f "train.py"; sleep 3 24 | 25 | ## =>> Sphinx2 26 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.per_device_train_batch_size 32 --run_id 26-ddp-n=2-g=8-gc-fp32-dbsz=32; pkill -f "train.py"; sleep 3 27 | 28 | # --- 29 | 30 | # Multi-Node DDP, No GC, FP16, Device BSZ = 8 31 | 32 | ## =>> Sphinx1 33 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id 27-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 34 | 35 | ## =>> Sphinx2 36 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id 27-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 37 | 38 | # --- 39 | 40 | # Multi-Node DDP, ++GC, FP16, Device BSZ = 32 41 | 42 | ## =>> Sphinx1 43 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id 28-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 44 | 45 | ## =>> Sphinx2 46 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id 28-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 47 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ddp-single.sh: -------------------------------------------------------------------------------- 1 | # ddp-single.sh 2 | # 
Benchmarking Script for Single-Node DDP Trainer, verifying distributed data parallel training with and without 3 | # gradient checkpointing as well as with different batch sizes. The choice of batch size in this script were derived 4 | # directly from the results of the Vanilla runs! 5 | 6 | # Constants 7 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 8 | INFRA="--nnodes 1 --nproc_per_node 8" 9 | GC="--model.gradient_checkpointing true" 10 | FP16="--training_arguments.fp16 true" 11 | 12 | # Only Two Choices for Batch Size -- Max for w/ Gradient Checkpointing (32 on 40 GB A100) and w/o (8 on 40GB A100) 13 | D_BSZ_8="--training_arguments.per_device_train_batch_size 8" 14 | D_BSZ_32="--training_arguments.per_device_train_batch_size 32" 15 | 16 | # Setup Distributed Launch Parameters -- We probably don't need Master Address/Port, but including for completeness 17 | MASTER_ADDR=sphinx1.stanford.edu 18 | MASTER_PORT=7000 19 | WORLD_SIZE=8 20 | DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR" 21 | LAUNCHER="torch.distributed.launch" 22 | 23 | # --- 24 | 25 | # Single Node DDP, No GC, FP32, Device BSZ = 8 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 26 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 --run_id 21-ddp-n=1-g=8-fp32-dbsz=8 27 | pkill -f "train.py" 28 | sleep 3 29 | 30 | # Single Node DDP, ++GC, FP32, Device BSZ = 32 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 31 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 --run_id 22-ddp-n=1-g=8-gc-fp32-dbsz=32 32 | pkill -f "train.py" 33 | sleep 3 34 | 35 | # Single Node DDP, No GC, FP16, Device BSZ = 8 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 36 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $FP16 $D_BSZ_8 --run_id 23-ddp-n=1-g=8-fp16-dbsz=8 37 | pkill -f "train.py" 38 | sleep 3 39 | 40 | # Single Node DDP, ++GC, FP32, Device BSZ = 32 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 41 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_32 --run_id 24-ddp-n=1-g=8-gc-fp16-dbsz=32 42 | pkill -f "train.py" 43 | sleep 3 44 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/deepspeed-multi.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-multi.sh 2 | # Benchmarking Script for Multi-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2, NO Z3) 3 | # with and without gradient checkpointing. 
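#
# Run-id convention used below (and across these benchmarking scripts), e.g. 53-ds=z1-n=2-g=8-fp16-dbsz=8:
#   53     -> run number from the benchmarking table (see the README in this directory)
#   ds=z1  -> DeepSpeed ZeRO stage 1
#   n=2    -> number of nodes
#   g=8    -> GPUs per node
#   gc     -> gradient checkpointing enabled (when present)
#   fp16   -> mixed-precision training
#   dbsz=8 -> per-device train batch size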
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 2 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # DeepSpeed Configurations 16 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 17 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 18 | DS_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 19 | 20 | # Set DeepSpeed Launcher Parameters 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 24 | 25 | # --- 26 | 27 | # Multi-Node Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 28 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id 53-ds=z1-n=2-g=8-fp16-dbsz=8 29 | pkill -f "train.py" 30 | sleep 3 31 | 32 | # Multi-Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 33 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id 54-ds=z1-n=2-g=8-fp16-dbsz=16 34 | pkill -f "train.py" 35 | sleep 3 36 | 37 | # Multi-Node DS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 38 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z1 --run_id 55-ds=z1-n=2-g=8-gc-fp16-dbsz=32 39 | pkill -f "train.py" 40 | sleep 3 41 | 42 | # Multi-Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 43 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id 56-ds=z2-n=2-g=8-fp16-dbsz=8 44 | pkill -f "train.py" 45 | sleep 3 46 | 47 | # Multi-Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 48 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id 57-ds=z2-n=1-g=8-fp16-dbsz=16 49 | pkill -f "train.py" 50 | sleep 3 51 | 52 | # Multi-Node DS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 53 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z2 --run_id 58-ds=z2-n=1-g=8-gc-fp16-dbsz=32 54 | pkill -f "train.py" 55 | sleep 3 56 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/deepspeed-single.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-single.sh 2 | # Benchmarking Script for Single-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 3) 3 | # with and without gradient checkpointing. 
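#
# With the variables defined below, each run composes into a single `deepspeed` invocation; fully
# expanded, the first one is roughly (illustrative -- the echo-before-run pattern in scripts/mistral-*.sh
# is a handy way to double-check an expansion):
#
#   deepspeed --num_gpus 8 --num_nodes 1 --master_addr sphinx1.stanford.edu train.py \
#     --config conf/gpt2-benchmark-config.yaml --nnodes 1 --nproc_per_node 8 \
#     --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 \
#     --training_arguments.deepspeed conf/deepspeed/z1-conf.json \
#     --run_id 44-ds=z1-n=1-g=8-fp16-dbsz=8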
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # DeepSpeed Configurations 16 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 17 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 18 | DS_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 19 | 20 | # Set DeepSpeed Launcher Parameters 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | WORLD_SIZE=8 24 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 1 --master_addr $MASTER_ADDR" 25 | 26 | # --- 27 | 28 | # Single Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 29 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id 44-ds=z1-n=1-g=8-fp16-dbsz=8 30 | pkill -f "train.py" 31 | sleep 3 32 | 33 | # Single Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 34 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id 45-ds=z1-n=1-g=8-fp16-dbsz=16 35 | pkill -f "train.py" 36 | sleep 3 37 | 38 | # Single Node DS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 39 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z1 --run_id 46-ds=z1-n=1-g=8-gc-fp16-dbsz=32 40 | pkill -f "train.py" 41 | sleep 3 42 | 43 | # Single Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 44 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id 47-ds=z2-n=1-g=8-fp16-dbsz=8 45 | pkill -f "train.py" 46 | sleep 3 47 | 48 | # Single Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 49 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id 48-ds=z2-n=1-g=8-fp16-dbsz=16 50 | pkill -f "train.py" 51 | sleep 3 52 | 53 | # Single Node DS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 54 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z2 --run_id 49-ds=z2-n=1-g=8-gc-fp16-dbsz=32 55 | pkill -f "train.py" 56 | sleep 3 57 | 58 | # Single Node DS-Z3, No GC, Device BSZ = 8 --> Cleanup --> Sleep 59 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z3 --run_id 50-ds=z3-n=1-g=8-fp16-dbsz=8 60 | pkill -f "train.py" 61 | sleep 3 62 | 63 | # Single Node DS-Z3, No GC, Device BSZ = 16 --> Cleanup --> Sleep 64 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z3 --run_id 51-ds=z3-n=1-g=8-fp16-dbsz=16 65 | pkill -f "train.py" 66 | sleep 3 67 | 68 | # Single Node DS-Z3, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 69 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z3 --run_id 52-ds=z3-n=1-g=8-gc-fp16-dbsz=32 70 | pkill -f "train.py" 71 | sleep 3 72 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ds-evaluation-bsz.sh: -------------------------------------------------------------------------------- 1 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 2 2 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 
--training_arguments.per_device_eval_batch_size 2 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 64-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=2 3 | pkill -f "train.py" 4 | sleep 3 5 | 6 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 4 7 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 4 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 65-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=4 8 | pkill -f "train.py" 9 | sleep 3 10 | 11 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 8 12 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 8 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 66-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=8 13 | pkill -f "train.py" 14 | sleep 3 15 | 16 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 16 17 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 16 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 67-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=16 18 | pkill -f "train.py" 19 | sleep 3 20 | 21 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 32 22 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 32 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 68-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=32 23 | pkill -f "train.py" 24 | sleep 3 25 | 26 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 64 27 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 64 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 69-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=64 28 | pkill -f "train.py" 29 | sleep 3 30 | 31 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 128 32 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 128 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 70-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=128 33 | pkill -f "train.py" 34 | sleep 3 35 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/fairscale-single.sh: -------------------------------------------------------------------------------- 1 | # fairscale-single.sh 2 | # Benchmarking Script for Single-Node FairScale Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 
3) 3 | # with and without gradient checkpointing. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # FairScale Parameter 16 | FS_Z1="--training_arguments.sharded_ddp simple" 17 | FS_Z2="--training_arguments.sharded_ddp zero_dp_2+auto_wrap" 18 | FS_Z3="--training_arguments.sharded_ddp zero_dp_3+auto_wrap" 19 | 20 | # Setup Distributed Launch Parameters -- We probably don't need Master Address/Port, but including for completeness 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | WORLD_SIZE=8 24 | DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR" 25 | LAUNCHER="torch.distributed.launch" 26 | 27 | # --- 28 | 29 | # Single Node FS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 30 | #python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z1 --run_id 29-fs=z1-n=1-g=8-fp16-dbsz=8 31 | #pkill -f "train.py" 32 | #sleep 3 33 | 34 | # Single Node FS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 35 | #python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z1 --run_id 30-fs=z1-n=1-g=8-fp16-dbsz=16 36 | #pkill -f "train.py" 37 | #sleep 3 38 | 39 | # Single Node FS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 40 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z1 --run_id 31-fs=z1-n=1-g=8-gc-fp16-dbsz=32 41 | pkill -f "train.py" 42 | sleep 3 43 | 44 | # Single Node FS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 45 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z2 --run_id 32-fs=z2-n=1-g=8-fp16-dbsz=8 46 | pkill -f "train.py" 47 | sleep 3 48 | 49 | # Single Node FS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 50 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z2 --run_id 33-fs=z2-n=1-g=8-fp16-dbsz=16 51 | pkill -f "train.py" 52 | sleep 3 53 | 54 | # Single Node FS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 55 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z2 --run_id 34-fs=z2-n=1-g=8-gc-fp16-dbsz=32 56 | pkill -f "train.py" 57 | sleep 3 58 | 59 | # Single Node FS-Z3, No GC, Device BSZ = 8 --> Cleanup --> Sleep 60 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z3 --run_id 35-fs=z3-n=1-g=8-fp16-dbsz=8 61 | pkill -f "train.py" 62 | sleep 3 63 | 64 | # Single Node FS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 65 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z3 --run_id 36-fs=z3-n=1-g=8-fp16-dbsz=16 66 | pkill -f "train.py" 67 | sleep 3 68 | 69 | # Single Node FS-Z3, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 70 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z3 --run_id 37-fs=z3-n=1-g=8-gc-fp16-dbsz=32 71 | pkill -f "train.py" 72 | sleep 3 73 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/vanilla.sh: -------------------------------------------------------------------------------- 1 | # vanilla.sh 2 | # Benchmarking Script for Vanilla Trainer 
(very top of the Benchmarking table). This is to get a rough upper bound 3 | # on single-GPU runtime, mostly as a sanity check. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 1" 8 | GC="--model.gradient_checkpointing true" 9 | FP16="--training_arguments.fp16 true" 10 | 11 | # Various Device Batch Sizes 12 | D_BSZ_1="--training_arguments.per_device_train_batch_size 1" 13 | D_BSZ_2="--training_arguments.per_device_train_batch_size 2" 14 | D_BSZ_4="--training_arguments.per_device_train_batch_size 4" 15 | D_BSZ_8="--training_arguments.per_device_train_batch_size 8" 16 | D_BSZ_16="--training_arguments.per_device_train_batch_size 16" 17 | D_BSZ_32="--training_arguments.per_device_train_batch_size 32" 18 | 19 | # --- 20 | 21 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 1 22 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_1 --run_id 01-vanilla-g=1-fp32-dbsz=1 23 | 24 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 2 25 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_2 --run_id 02-vanilla-g=1-fp32-dbsz=2 26 | 27 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 4 28 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_4 --run_id 03-vanilla-g=1-fp32-dbsz=4 29 | 30 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 8 31 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_8 --run_id 04-vanilla-g=1-fp32-dbsz=8 32 | 33 | # --- 34 | 35 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 1 36 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_1 --run_id 05-vanilla-g=1-gc-fp32-dbsz=1 37 | 38 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 2 39 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_2 --run_id 06-vanilla-g=1-gc-fp32-dbsz=2 40 | 41 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 4 42 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_4 --run_id 07-vanilla-g=1-gc-fp32-dbsz=4 43 | 44 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 8 45 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_8 --run_id 08-vanilla-g=1-gc-fp32-dbsz=8 46 | 47 | # --- 48 | 49 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 1 50 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_1 --run_id 09-vanilla-g=1-fp16-dbsz=1 51 | 52 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 2 53 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_2 --run_id 10-vanilla-g=1-fp16-dbsz=2 54 | 55 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 4 56 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_4 --run_id 11-vanilla-g=1-fp16-dbsz=4 57 | 58 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 8 59 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_8 --run_id 12-vanilla-g=1-fp16-dbsz=8 60 | 61 | # --- 62 | 63 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 1 64 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_1 --run_id 13-vanilla-g=1-gc-fp16-dbsz=1 65 | 66 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 2 67 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_2 --run_id 14-vanilla-g=1-gc-fp16-dbsz=2 68 | 69 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 4 70 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_4 --run_id 15-vanilla-g=1-gc-fp16-dbsz=4 71 | 72 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 8 73 | CUDA_VISIBLE_DEVICES=0 python 
train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_8 --run_id 16-vanilla-g=1-gc-fp16-dbsz=8 74 | 75 | # --- (Extra Experiments because Gradient Checkpointing Exceeded Expectations) 76 | 77 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 16 78 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_16 --run_id 17-vanilla-g=1-gc-fp32-dbsz=16 79 | 80 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 16 81 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_16 --run_id 18-vanilla-g=1-gc-fp16-dbsz=16 82 | 83 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 32 84 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_32 --run_id 19-vanilla-g=1-gc-fp32-dbsz=32 85 | 86 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 32 87 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_32 --run_id 20-vanilla-g=1-gc-fp16-dbsz=32 88 | -------------------------------------------------------------------------------- /scripts/debugging/resuming/resume-single-node.sh: -------------------------------------------------------------------------------- 1 | # resume-single-node.sh 2 | # Single Node GPT-2 Small `Resume from Checkpoint` Debugging. Uses the DeepSpeed ZeRO-2 Optimizer, 3 | # Per-Device Batch Size of 16. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | 9 | # Batch Size 10 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 11 | 12 | # DeepSpeed Training Configuration 13 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 14 | 15 | # Set DeepSpeed Launcher Parameters 16 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 1" 17 | 18 | # --- 19 | 20 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep 21 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 22 | pkill -f "train.py" 23 | sleep 3 24 | -------------------------------------------------------------------------------- /scripts/debugging/sanity/mistral-sanity-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-sanity-gpt2-small.sh 2 | # Mistral Sanity Check -- GPT-2 Small 4K Step Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. 
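#
# Usage sketch (assumes the `mistral` conda environment is active and the DeepSpeed hostfile described
# in scripts/README.md is in place, so a single invocation from sphinx1 drives both nodes):
#
#   conda activate mistral
#   bash scripts/debugging/sanity/mistral-sanity-gpt2-small.sh
#
# The athos / athos-replica runs share seed 21, so their loss curves should track each other closely;
# porthos (seed 49) and aramis (seed 81) differ only in the seed.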
3 | 4 | # Constants 5 | CONFIG="--config conf/gpt2-debug-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 10 | 11 | # DeepSpeed Training Configuration 12 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-debug-conf.json" 13 | 14 | # Random Seeds -- Athos :: 21, Blizzard :: 49, Cyclone :: 81 15 | ATHOS="--seed 21" 16 | PORTHOS="--seed 49" 17 | ARAMIS="--seed 81" 18 | 19 | # Set DeepSpeed Launcher Parameters 20 | MASTER_ADDR=sphinx1.stanford.edu 21 | MASTER_PORT=7000 22 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 23 | 24 | # Resume 25 | RESUME="--resume true" 26 | 27 | # --- 28 | 29 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 21 30 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ATHOS $DS_Z2 --run_id athos-gpt2-small-debug-x21 31 | #pkill -f "train.py" 32 | #sleep 3 33 | 34 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 21 -- REPLICATION 35 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ATHOS $DS_Z2 --run_id athos-replica-gpt2-small-debug-x21 36 | pkill -f "train.py" 37 | sleep 3 38 | 39 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 49 40 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $PORTHOS $DS_Z2 --run_id porthos-gpt2-small-debug-x49 41 | pkill -f "train.py" 42 | sleep 3 43 | 44 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 81 45 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ARAMIS $DS_Z2 --run_id aramis-gpt2-small-debug-x81 46 | pkill -f "train.py" 47 | sleep 3 48 | -------------------------------------------------------------------------------- /scripts/forget-me-not.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sleep 6h 3 | pkill -f "train.py" 4 | -------------------------------------------------------------------------------- /scripts/mistral-gcp-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gcp-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 4 on Google Cloud with 3 | # MegaGPU Instances. 
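#
# "MegaGPU instances" here are single GCP A2 machines with 16x A100s (the a2-megagpu-16g machine type,
# if memory serves), which is why everything below launches one node with --num_gpus 16 /
# --nproc_per_node 16 instead of the 2 x 8 layout used on the Sphinx cluster.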
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gcp-gpt2-medium.sh MODEL="arwen" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='arwen'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | GCP_CONFIG="--config conf/gpt2-mistral-medium-gcp-config.yaml"; 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | 37 | INFRA="--nnodes 1 --nproc_per_node 16" 38 | 39 | # Batch Size (4 w/o gradient checkpointing, 8 w/ partial gradient checkpointing) 40 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 41 | 42 | # DeepSpeed Training Configurations 43 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-medium-conf.json" 44 | 45 | # Random Seeds -- Arwen :: 21, Beren :: 49, Cerebrimbor :: 81, Durin :: 343, Eowyn :: 777 46 | case $MODEL in 47 | arwen) 48 | SEED="--seed 21" 49 | RUN_ID="--run_id arwen-prime-gpt2-medium-x21" 50 | ;; 51 | beren) 52 | SEED="--seed 49" 53 | RUN_ID="--run_id beren-prime-gpt2-medium-x49" 54 | ;; 55 | cerebrimbor) 56 | SEED="--seed 81" 57 | RUN_ID="--run_id cerebrimbor-prime-gpt2-medium-x81" 58 | ;; 59 | durin) 60 | SEED="--seed 343" 61 | RUN_ID="--run_id durin-prime-gpt2-medium-x343" 62 | ;; 63 | eowyn) 64 | SEED="--seed 777" 65 | RUN_ID="--run_id eowyn-prime-gpt2-medium-x777" 66 | ;; 67 | ?) 68 | usage 69 | exit 70 | ;; 71 | esac 72 | 73 | # Set DeepSpeed Launcher Parameters 74 | DISTRIBUTED_ARGS="--num_gpus 16 --num_nodes 1" 75 | 76 | # --- 77 | 78 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Seed 79 | echo deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 80 | deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 81 | pkill -f "train.py" 82 | sleep 3 83 | -------------------------------------------------------------------------------- /scripts/mistral-gcp-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gcp-gpt2-small.sh 2 | # Mistral GPT-2 Small Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16 on Google Cloud with 3 | # MegaGPU Instances. 
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gcp-gpt2-small.sh MODEL="alias" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='alias'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | GCP_CONFIG="--config conf/gpt2-mistral-small-gcp-config.yaml"; 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | 37 | INFRA="--nnodes 1 --nproc_per_node 16" 38 | 39 | # Batch Size 40 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 41 | 42 | # DeepSpeed Training Configuration 43 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-small-conf.json" 44 | 45 | # Random Seeds -- Alias :: 21, Battlestar :: 49, Caprica :: 81, Darkmatter :: 343, Expanse :: 777 46 | case $MODEL in 47 | alias) 48 | SEED="--seed 21" 49 | RUN_ID="--run_id alias-prime-gpt2-small-x21" 50 | ;; 51 | battlestar) 52 | SEED="--seed 49" 53 | RUN_ID="--run_id battlestar-prime-gpt2-small-x49" 54 | ;; 55 | caprica) 56 | SEED="--seed 81" 57 | RUN_ID="--run_id caprica-prime-gpt2-small-x81" 58 | ;; 59 | darkmatter) 60 | SEED="--seed 343" 61 | RUN_ID="--run_id darkmatter-prime-gpt2-small-x343" 62 | ;; 63 | expanse) 64 | SEED="--seed 777" 65 | RUN_ID="--run_id expanse-prime-gpt2-small-x777" 66 | ;; 67 | firefly) 68 | SEED="--seed 801" 69 | RUN_ID="--run_id firefly-prime-gpt2-small-x801" 70 | ;; 71 | gundam) 72 | SEED="--seed 837" 73 | RUN_ID="--run_id gundam-prime-gpt2-small-x837" 74 | ;; 75 | highlander) 76 | SEED="--seed 900" 77 | RUN_ID="--run_id highlander-prime-gpt2-small-x900" 78 | ;; 79 | ?) 80 | usage 81 | exit 82 | ;; 83 | esac 84 | 85 | # Set DeepSpeed Launcher Parameters 86 | DISTRIBUTED_ARGS="--num_gpus 16 --num_nodes 1" 87 | 88 | # --- 89 | 90 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Seed 91 | echo deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 92 | deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 93 | pkill -f "train.py" 94 | sleep 3 95 | -------------------------------------------------------------------------------- /scripts/mistral-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 4. Runs locally, on 3 | # Sphinx Cluster. 
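#
# To restart an interrupted run, re-launch with RESUME (illustrative; `--resume true` continues the
# same run id from its saved checkpoints -- see docs/tutorials/resume.rst):
#
#   bash scripts/mistral-gpt2-medium.sh MODEL="durin" RESUME="true"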
4 | # 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gpt2-medium.sh MODEL="arwen" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='arwen'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | SPHINX_CONFIG="--config conf/gpt2-mistral-medium-config.yaml" 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | INFRA="--nnodes 2 --nproc_per_node 8" 37 | 38 | # Batch Size 39 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 40 | 41 | # DeepSpeed Training Configuration 42 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-medium-conf.json" 43 | 44 | # Random Seeds -- Arwen :: 21, Beren :: 49, Celebrimbor :: 81, Durin :: 343, Eowyn :: 777 45 | case $MODEL in 46 | arwen) 47 | SEED="--seed 21" 48 | RUN_ID="--run_id arwen-prime-gpt2-medium-x21" 49 | ;; 50 | beren) 51 | SEED="--seed 49" 52 | RUN_ID="--run_id beren-prime-gpt2-medium-x49" 53 | ;; 54 | celebrimbor) 55 | SEED="--seed 81" 56 | RUN_ID="--run_id celebrimbor-prime-gpt2-medium-x81" 57 | ;; 58 | durin) 59 | SEED="--seed 343" 60 | RUN_ID="--run_id durin-prime-gpt2-medium-x343" 61 | ;; 62 | eowyn) 63 | SEED="--seed 777" 64 | RUN_ID="--run_id eowyn-prime-gpt2-medium-x777" 65 | ;; 66 | ?) 67 | usage 68 | exit 69 | ;; 70 | esac 71 | 72 | # Set DeepSpeed Launcher Parameters 73 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu" 74 | 75 | # --- 76 | 77 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Sleep 78 | echo deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 79 | deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 80 | pkill -f "train.py" 81 | sleep 3 82 | -------------------------------------------------------------------------------- /scripts/mistral-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-small.sh 2 | # Mistral GPT-2 Small Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. Runs locally, on 3 | # Sphinx Cluster. 
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gpt2-small.sh MODEL="firefly" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='firefly'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | SPHINX_CONFIG="--config conf/gpt2-mistral-small-config.yaml" 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | INFRA="--nnodes 2 --nproc_per_node 8" 37 | 38 | # Batch Size 39 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 40 | 41 | # DeepSpeed Training Configuration 42 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-small-conf.json" 43 | 44 | # Random Seeds -- Alias :: 21, Battlestar :: 49, Caprica :: 81, Darkmatter :: 343, Expanse :: 777 45 | case $MODEL in 46 | alias) 47 | SEED="--seed 21" 48 | RUN_ID="--run_id alias-prime-gpt2-small-x21" 49 | ;; 50 | battlestar) 51 | SEED="--seed 49" 52 | RUN_ID="--run_id battlestar-prime-gpt2-small-x49" 53 | ;; 54 | caprica) 55 | SEED="--seed 81" 56 | RUN_ID="--run_id caprica-prime-gpt2-small-x81" 57 | ;; 58 | darkmatter) 59 | SEED="--seed 343" 60 | RUN_ID="--run_id darkmatter-prime-gpt2-small-x343" 61 | ;; 62 | expanse) 63 | SEED="--seed 777" 64 | RUN_ID="--run_id expanse-prime-gpt2-small-x777" 65 | ;; 66 | firefly) 67 | SEED="--seed 801" 68 | RUN_ID="--run_id firefly-prime-gpt2-small-x801" 69 | ;; 70 | gundam) 71 | SEED="--seed 837" 72 | RUN_ID="--run_id gundam-prime-gpt2-small-x837" 73 | ;; 74 | highlander) 75 | SEED="--seed 900" 76 | RUN_ID="--run_id highlander-prime-gpt2-small-x900" 77 | ;; 78 | ?) 79 | usage 80 | exit 81 | ;; 82 | esac 83 | 84 | # Set DeepSpeed Launcher Parameters 85 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu" 86 | 87 | # --- 88 | 89 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Seed 90 | echo deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 91 | deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 92 | pkill -f "train.py" 93 | sleep 3 94 | -------------------------------------------------------------------------------- /scripts/run/ddp.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 
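# Usage: `bash scripts/run/ddp.sh <nnodes> <node_rank>` -- e.g. `bash scripts/run/ddp.sh 2 0` on sphinx1 and `bash scripts/run/ddp.sh 2 1` on sphinx2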
12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # `torch.distributed.launch` Parameters 17 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER --nnodes ${nnodes} --node_rank ${node_rank} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # export NCCL_DEBUG=INFO; \ 23 | python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS 24 | 25 | # Kill Running Processes (Because `torch.distributed.launch` doesn't like to clean up after itself...) 26 | pkill -f "train.py" 27 | -------------------------------------------------------------------------------- /scripts/run/deepspeed.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # DeepSpeed Launch Parameters 17 | DISTRIBUTED_ARGS="--num_gpus $GPUS_PER --num_nodes ${nnodes} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # DeepSpeed Configurations 23 | DEEPSPEED_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 24 | DEEPSPEED_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 25 | DEEPSPEED_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 26 | 27 | DEEPSPEED_Z1_OFF="--training_arguments.deepspeed conf/deepspeed/z1-offload-conf.json" 28 | DEEPSPEED_Z2_OFF="--training_arguments.deepspeed conf/deepspeed/z2-offload-conf.json" 29 | DEEPSPEED_Z3_OFF="--training_arguments.deepspeed conf/deepspeed/z3-offload-conf.json" 30 | 31 | # export NCCL_DEBUG=INFO; \ 32 | # =>> ZeRO-1 33 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z1 34 | 35 | # =>> ZeRO-2 36 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z2 37 | 38 | # =>> ZeRO-3 39 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z3 40 | 41 | # =>> ZeRO-1 Offload 42 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z1_OFF 43 | 44 | # =>> ZeRO-2 Offload 45 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z2_OFF 46 | 47 | # =>> ZeRO-3 Offload 48 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z3_OFF 49 | 50 | # Kill Running Processes (Because `deepspeed` doesn't like to clean up after itself...) 51 | pkill -f "train.py" 52 | -------------------------------------------------------------------------------- /scripts/run/fairscale.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 
12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # `torch.distributed.launch` Parameters 17 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER --nnodes ${nnodes} --node_rank ${node_rank} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # FairScale Parameters 23 | FAIRSCALE_Z1="--training_arguments.sharded_ddp simple" 24 | FAIRSCALE_Z2="--training_arguments.sharded_ddp zero_dp_2+auto_wrap" 25 | FAIRSCALE_Z3="--training_arguments.sharded_ddp zero_dp_3+auto_wrap" 26 | FAIRSCALE_Z2_OFF="--training_arguments.sharded_ddp zero_dp_2+auto_wrap+offload" 27 | FAIRSCALE_Z3_OFF="--training_arguments.sharded_ddp zero_dp_3+auto_wrap+offload" 28 | 29 | # export NCCL_DEBUG=INFO; \ 30 | # =>> ZeRO-1 (Simple) 31 | python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z1 32 | 33 | # =>> ZeRO-2 34 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z2 35 | 36 | # =>> ZeRO-3 37 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z3 38 | 39 | # TODO D :: Offloading Doesn't Work Yet? 40 | # =>> ZeRO-2 Offload 41 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z2_OFF 42 | 43 | # =>> ZeRO-3 Offload 44 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z3_OFF 45 | 46 | # Kill Running Processes (Because `torch.distributed.launch` doesn't like to clean up after itself...) 47 | pkill -f "train.py" 48 | -------------------------------------------------------------------------------- /scripts/run/multi-node.sh: -------------------------------------------------------------------------------- 1 | # ZeRO-1 -- Multi-Node! 
2 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 63-sk-on-eval-ds=z1-n=2-g=8-fp16-dbsz=16 3 | pkill -f "train.py" 4 | sleep 3 5 | -------------------------------------------------------------------------------- /scripts/run/single-node.sh: -------------------------------------------------------------------------------- 1 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 8 2 | CUDA_VISIBLE_DEVICES=0 python train.py --config conf/gpt2-benchmark-config.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 3 | -------------------------------------------------------------------------------- /setup/conda-requirements.txt: -------------------------------------------------------------------------------- 1 | python=3.8.12 2 | pytorch=1.11.0 3 | torchdata 4 | cudatoolkit=11.3.1 5 | -------------------------------------------------------------------------------- /setup/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.0.0 2 | deepspeed==0.6.5 3 | huggingface-hub==0.4.0 4 | jsonlines==3.0.0 5 | pytest==7.1.2 6 | quinine==0.3.0 7 | transformers==4.18.0 8 | wandb==0.12.17 9 | zstandard>=0.17.0 10 | pyarrow>=7.0.0 11 | -------------------------------------------------------------------------------- /setup/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 4 | 5 | ENV_NAME="mistral" 6 | # if we have an arg to the script, use it as the env name 7 | if [ $# -eq 1 ]; then 8 | ENV_NAME=$1 9 | fi 10 | 11 | if [ "$CONDA_DEFAULT_ENV" != "base" ]; then 12 | echo "Error: run setup from base environment!" 13 | exit 14 | fi 15 | echo "Creating mistral conda environment '${ENV_NAME}'!" 16 | conda create -y -n "${ENV_NAME}" --file ${SCRIPT_DIR}/conda-requirements.txt -c pytorch 17 | . $CONDA_PREFIX/etc/profile.d/conda.sh 18 | conda activate "${ENV_NAME}" 19 | if [ "$CONDA_DEFAULT_ENV" = "${ENV_NAME}" ]; then 20 | echo "Installing python dependencies with pip!" 21 | pip install -r ${SCRIPT_DIR}/pip-requirements.txt 22 | fi 23 | echo "Successfully created mistral environment '${ENV_NAME}'!" 
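# Usage: `bash setup/setup.sh [env-name]` -- must be run from the conda `base` environment; the environment name defaults to `mistral`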
24 | -------------------------------------------------------------------------------- /setup/test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.1.0 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/src/__init__.py -------------------------------------------------------------------------------- /src/args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for handling arguments from quinfigs and command line 3 | """ 4 | 5 | from .training_args import get_training_arguments 6 | -------------------------------------------------------------------------------- /src/args/training_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | training_args.py 3 | 4 | Utility script for unloading Quinfigs into full set of Training Arguments, as well as for handling any argument 5 | overrides (e.g., paths that are defined at runtime, parameters that are dynamically computed such as gradient 6 | accumulation). 7 | """ 8 | import logging 9 | from pathlib import Path 10 | from typing import Optional 11 | 12 | from munch import Munch 13 | from transformers import TrainingArguments 14 | 15 | 16 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 17 | overwatch = logging.getLogger("mistral.args.training") 18 | 19 | 20 | def get_training_arguments( 21 | quinfig_args: Munch, 22 | run_name: str, 23 | output_dir: Path, 24 | seed: int, 25 | local_rank: int, 26 | world_size: int, 27 | effective_bsz: int, 28 | gradient_checkpointing: Optional[bool] = None, 29 | ) -> TrainingArguments: 30 | """Initialize Training Arguments from Quinfig and Runtime-Defined Variables.""" 31 | 32 | # `quinfig_args` already contains some default training arguments --> we'll be overwriting/adding to the Dict 33 | # =>> a `Munch` is a subclass of Dictionary that supports attribute style access 34 | training_args = quinfig_args 35 | training_args.run_name = run_name 36 | training_args.output_dir = output_dir 37 | training_args.seed = seed 38 | training_args.data_seed = seed 39 | training_args.local_rank = local_rank 40 | 41 | # Since we Implement a Custom W&B / JSON Logging Callback, we don't report to anyone -- we've gone rogue! 42 | training_args.report_to = "none" 43 | 44 | # do it this way so we start supporting gradient_checkpointing in training_args à la Transformers 45 | if gradient_checkpointing is not None: 46 | training_args.gradient_checkpointing = gradient_checkpointing 47 | 48 | # If "sharded_ddp" is None --> replace with False 49 | if training_args.sharded_ddp is None: 50 | training_args.sharded_ddp = False 51 | else: 52 | assert isinstance(training_args.sharded_ddp, str) and training_args.sharded_ddp in [ 53 | "simple", 54 | "zero_dp_2+auto_wrap", 55 | "zero_dp_2+auto_wrap+offload", 56 | "zero_dp_3+auto_wrap", 57 | "zero_dp_3+auto_wrap+offload", 58 | ] 59 | 60 | # If "+" in `sharded_ddp` --> Split, and then join... this is kinda hacky (TODO training_args.A :: Fix!) 
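# (e.g. "zero_dp_2+auto_wrap" becomes "zero_dp_2 auto_wrap", the space-separated form that HF's TrainingArguments expects)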
61 | if "+" in training_args.sharded_ddp: 62 | training_args.sharded_ddp = " ".join(training_args.sharded_ddp.split("+")) 63 | 64 | # Compute Gradient Accumulation Dynamically 65 | training_args.gradient_accumulation_steps = effective_bsz // ( 66 | quinfig_args.per_device_train_batch_size * world_size 67 | ) 68 | overwatch.info( 69 | f"Setting Gradient Accumulation Steps = `{training_args.gradient_accumulation_steps}` [BSZ: {effective_bsz} " 70 | f"World Size: {world_size} Device BSZ: {quinfig_args.per_device_train_batch_size}]" 71 | ) 72 | if ( 73 | training_args.gradient_accumulation_steps <= 0 74 | or effective_bsz % training_args.gradient_accumulation_steps != 0 75 | ): 76 | raise ValueError("Incompatible sizes for gradient accumulation!") 77 | 78 | args = TrainingArguments(**training_args) 79 | 80 | # TODO(dlwh): report this bug to transformers 81 | assert ( 82 | args.dataloader_num_workers == 0 or world_size == 1 83 | ), "dataloader_num_workers must be 0 for multi-gpu training in HF right now" 84 | 85 | return args 86 | -------------------------------------------------------------------------------- /src/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modules for core training, evaluation, and W&B logging processes 3 | """ 4 | 5 | from .callbacks import CustomCheckpointCallback, CustomWandbCallback 6 | from .trainer import OnlineBenchmarkTrainer 7 | -------------------------------------------------------------------------------- /src/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset/Corpus related modules 3 | """ 4 | 5 | from .auto import ONLINE_EVAL_DATA_REGISTRY, get_auto_dataset 6 | -------------------------------------------------------------------------------- /src/corpora/detokenization.py: -------------------------------------------------------------------------------- 1 | """ 2 | detokenization.py 3 | 4 | Handle detokenization for different dataset for zero-shot LM evaluation. 5 | """ 6 | import logging 7 | import re 8 | from typing import Dict 9 | 10 | 11 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 12 | overwatch = logging.getLogger("mistral.corpora.detokenization") 13 | 14 | 15 | def wikitext_detokenize(example: Dict[str, str]) -> Dict[str, str]: 16 | """ 17 | Wikitext is whitespace tokenized and we remove these whitespaces. 18 | 19 | Taken from https://github.com/NVIDIA/Megatron-LM/blob/main/tasks/zeroshot_gpt2/detokenizer.py 20 | """ 21 | # Contractions 22 | text = example["text"] 23 | text = text.replace("s '", "s'") 24 | text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text) 25 | 26 | # Number Separators 27 | text = text.replace(" @-@ ", "-") 28 | text = text.replace(" @,@ ", ",") 29 | text = text.replace(" @.@ ", ".") 30 | 31 | # Punctuation 32 | text = text.replace(" : ", ": ") 33 | text = text.replace(" ; ", "; ") 34 | text = text.replace(" . ", ". ") 35 | text = text.replace(" ! ", "! ") 36 | text = text.replace(" ? ", "? 
") 37 | text = text.replace(" , ", ", ") 38 | 39 | # Double Brackets 40 | text = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", text) 41 | text = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", text) 42 | text = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", text) 43 | text = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', text) 44 | text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text) 45 | 46 | # Miscellaneous 47 | text = text.replace("= = = =", "====") 48 | text = text.replace("= = =", "===") 49 | text = text.replace("= =", "==") 50 | text = text.replace(" " + chr(176) + " ", chr(176)) 51 | text = text.replace(" \n", "\n") 52 | text = text.replace("\n ", "\n") 53 | text = text.replace(" N ", " 1 ") 54 | text = text.replace(" 's", "'s") 55 | 56 | return {"text": text} 57 | 58 | 59 | # Set Registry for Various Datasets 60 | DATASET_TOKENIZATION_REGISTRY = {"wikitext": wikitext_detokenize} 61 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model related modules 3 | """ 4 | 5 | from .auto_clm import get_auto_clm_tokenizer 6 | -------------------------------------------------------------------------------- /src/models/auto_clm.py: -------------------------------------------------------------------------------- 1 | """ 2 | auto_clm.py 3 | 4 | Default Causal Language Model (CLM) & Tokenizer Specification and Initialization. Downloads Model Configuration (if 5 | necessary) from the Hugging Face `transformers` Hub, instantiates pretrained Tokenizer, and initializes model using 6 | the necessary AutoModel class. 7 | """ 8 | import logging 9 | from pathlib import Path 10 | from typing import Dict, Tuple 11 | 12 | import torch 13 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer 14 | from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel 15 | 16 | from ..corpora.tokenization_utils import PassthroughTokenizer 17 | from ..util import REGISTRY 18 | 19 | 20 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 21 | overwatch = logging.getLogger("mistral.models.auto") 22 | 23 | 24 | def get_auto_clm_tokenizer( 25 | model_id: str, 26 | paths: Dict[str, Path], 27 | model_configs: dict = None, 28 | use_pretrained_tokenizer: bool = True, 29 | use_passthrough_tokenizer: bool = False, 30 | reorder_and_upcast_attn: bool = True, 31 | scale_attn_by_inverse_layer_idx: bool = True, 32 | initial_weights: str = None, 33 | ) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]: 34 | """Download/Load AutoConfig and Instantiate Corresponding Model and Tokenizer.""" 35 | 36 | # Create Configuration 37 | if "gpt2" in model_id and model_configs: 38 | overwatch.info(f"Building Hugging Face GPT2Config from provided configs: {model_configs} ...") 39 | config = GPT2Config.from_dict(model_configs) 40 | else: 41 | overwatch.info(f"Fetching Hugging Face AutoConfig for Model: `{REGISTRY[model_id]}`...") 42 | config = AutoConfig.from_pretrained(REGISTRY[model_id], cache_dir=paths["configs"]) 43 | 44 | # mistral config is just gpt2 with the following additional stability fixes 45 | if "mistral" in model_id or "gpt2" in model_id: 46 | config.reorder_and_upcast_attn = reorder_and_upcast_attn 47 | config.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx 48 | 49 | # IMPORTANT :: Set `use_cache` to False -- we don't need it ever and it conflicts with gradient checkpointing! 
50 | config.use_cache = False 51 | 52 | # Create Tokenizer 53 | overwatch.info(f"Fetching Hugging Face [Fast] AutoTokenizer for Model: `{REGISTRY[model_id]}`...") 54 | assert not ( 55 | use_pretrained_tokenizer and use_passthrough_tokenizer 56 | ), "Pretrained and Passthrough tokenization are mutually exclusive" 57 | if use_pretrained_tokenizer: 58 | tokenizer = AutoTokenizer.from_pretrained(REGISTRY[model_id], config=config, cache_dir=paths["tokenizer"]) 59 | elif use_passthrough_tokenizer: 60 | overwatch.info("Using a Pretokenized Dataset") 61 | tokenizer = PassthroughTokenizer(config.vocab_size) 62 | else: 63 | overwatch.error("Tokenizer Training/Initialization (from Scratch) not yet implemented!") 64 | raise NotImplementedError() 65 | 66 | if "gpt2" in model_id: 67 | overwatch.info(f"Initializing Custom GPT-2 Model from Configuration: `{REGISTRY[model_id]}`...") 68 | model = GPT2LMHeadModel(config) 69 | else: 70 | # Initialize Model 71 | overwatch.info(f"Initializing Tabula Rasa Model from Configuration: `{REGISTRY[model_id]}`...") 72 | model = AutoModelForCausalLM.from_config(config) 73 | 74 | # Run GPT-Specific Initialization, if applicable 75 | model.resize_token_embeddings(len(tokenizer)) 76 | 77 | # If `initial_weights` is not None, load weights from path! 78 | if initial_weights is not None: 79 | overwatch.info(f"Initializing Weights from File: `{initial_weights}`...") 80 | model.load_state_dict(torch.load(initial_weights, map_location=torch.device("cpu"))) 81 | 82 | return model, tokenizer 83 | -------------------------------------------------------------------------------- /src/overwatch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package for logging 3 | """ 4 | 5 | from .overwatch import get_overwatch 6 | -------------------------------------------------------------------------------- /src/overwatch/overwatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | overwatch.py 3 | 4 | Utility class for creating a centralized/standardized Python logger, with the Mercury format, at the appropriate 5 | logging level. 6 | """ 7 | import logging 8 | from pathlib import Path 9 | 10 | import datasets 11 | import transformers 12 | 13 | 14 | # Constants - for Formatting 15 | LOG_FORMAT = "|=>> %(asctime)s - %(name)s - %(levelname)s :: %(message)s" 16 | DATE_FORMAT = "%m/%d [%H:%M:%S]" 17 | 18 | 19 | def get_overwatch(path: Path, level: int, local_rank: int = 0) -> logging.Logger: 20 | """ 21 | Initialize logging.Logger with the appropriate name, console, and file handlers. 22 | 23 | :param path: Path for writing log file --> should be identical to run_name (inherited from `train.py`) 24 | :param level: Default logging level --> should usually be INFO (inherited from `train.py`). 25 | :param local_rank: Process Rank (default = -1). Only log to `level` on rank <= 0, otherwise default level is WARN. 26 | 27 | :return: Default "mistral" root logger object :: logging.Logger 28 | """ 29 | # Create Root Logger w/ Base Formatting 30 | logging.basicConfig(level=level, format=LOG_FORMAT, datefmt=DATE_FORMAT) 31 | 32 | # Suppress Hugging Face Loggers --> propagate up to Root! 
33 | transformers.logging._get_library_root_logger().handlers = [] 34 | transformers.logging._get_library_root_logger().setLevel(level=level) 35 | datasets.logging._get_library_root_logger().handlers = [] 36 | 37 | # Create Default Logger & add File Handler 38 | logger = logging.getLogger() 39 | logger.setLevel(level if local_rank <= 0 else logging.WARNING) 40 | 41 | # Only Log to File w/ Rank 0 on each Node 42 | if local_rank <= 0: 43 | # Create File Handler --> Set mode to "a" to append to logs (ok, since each run will be uniquely named) 44 | file_handler = logging.FileHandler(path, mode="a") 45 | file_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) 46 | logger.addHandler(file_handler) 47 | 48 | return logging.getLogger("mistral") 49 | -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utilities including handling of directory set up, model name registry, and more... 3 | """ 4 | 5 | from .paths import create_paths, set_permissions 6 | from .registry import PATH_REGISTRY, REGISTRY 7 | -------------------------------------------------------------------------------- /src/util/paths.py: -------------------------------------------------------------------------------- 1 | """ 2 | paths.py 3 | 4 | Utility function for initializing the appropriate directories/sub-directories on the start of each run. Decoupled from 5 | main code in case we want separate directory structures/artifact storage based on infrastructure (e.g., NLP Cluster vs. 6 | GCP). 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Dict 11 | 12 | from .registry import PATH_REGISTRY 13 | 14 | 15 | def create_paths(run_id: str, model: str, run_dir: str, cache_dir: str) -> Dict[str, Path]: 16 | """ 17 | Create the necessary directories and sub-directories conditioned on the `run_id`, checkpoint directory, and cache 18 | directories. 19 | 20 | :param run_id: Unique Run Identifier. 21 | :param model: Huggingface.Transformers Model ID for specifying the desired configuration. 22 | :param run_dir: Path to run directory to save model checkpoints and run metrics. 23 | :param cache_dir: Path to artifacts/cache directory to store any intermediate values, configurations, etc. 24 | 25 | :return: Dictionary mapping str ids --> paths on the filesystem. 
26 | """ 27 | # To respect shortcuts in paths, such as ~ 28 | cache_dir = os.path.expanduser(cache_dir) 29 | run_dir = os.path.expanduser(run_dir) 30 | 31 | paths = { 32 | # Top-Level Checkpoint Directory for Given Run 33 | "runs": Path(run_dir) / run_id, 34 | # Cache Directories for various components 35 | "configs": Path(cache_dir) / f"{PATH_REGISTRY[model]}-configs", 36 | "tokenizer": Path(cache_dir) / f"{PATH_REGISTRY[model]}-tokenizer", 37 | "dataset": Path(cache_dir) / "datasets", 38 | "preprocessed": Path(cache_dir) / f"{PATH_REGISTRY[model]}-processed", 39 | } 40 | 41 | # Programatically Create Paths for each Directory 42 | for p in paths: 43 | paths[p].mkdir(parents=True, exist_ok=True) 44 | 45 | return paths 46 | 47 | 48 | def set_permissions(paths: Dict[str, Path]) -> None: 49 | """Recursively call `os.chmod(775) recursively for the given paths.""" 50 | for p in paths: 51 | os.system(f"chmod -R 775 {paths[p]} >/dev/null 2>&1") 52 | -------------------------------------------------------------------------------- /src/util/registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | registry.py 3 | 4 | Model/Data Registry :: Human-Readable Identifier --> Huggingface.co ID. Ideally will be expanded upon as we introduce 5 | more model configurations, different types of architectures, etc. 6 | """ 7 | 8 | # Model Names 9 | REGISTRY = { 10 | "gpt2-small": "gpt2", 11 | "gpt2-medium": "gpt2-medium", 12 | "gpt2-large": "gpt2-large", 13 | "gpt2-xl": "gpt2-xl", 14 | "mistral-small": "gpt2", 15 | "mistral-medium": "gpt2-medium", 16 | } 17 | 18 | # Absolute Paths 19 | PATH_REGISTRY = { 20 | "gpt2-small": "gpt2", 21 | "gpt2-medium": "gpt2", 22 | "gpt2-large": "gpt2", 23 | "gpt2-xl": "gpt2", 24 | "mistral-small": "gpt2", 25 | "mistral-medium": "gpt2-medium", 26 | } 27 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Run Tests 2 | 3 | Set this environment variable to a working directory that can store the Hugging Face cache and checkpoints created by the tests: 4 | 5 | ```bash 6 | export MISTRAL_TEST_HOME=/path/to/mistral-test-working-dir 7 | ``` 8 | 9 | From the `tests` directory, run this command to run tests in single node/single GPU mode: 10 | 11 | ```bash 12 | export CUDA_VISIBLE_DEVICES=0 13 | cd tests 14 | pytest 15 | ``` 16 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext103.yaml: -------------------------------------------------------------------------------- 1 | # wikitext103.yaml 2 | # Configuration for WikiText-103 Dataset (https://huggingface.co/datasets/wikitext). 
3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-103-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext2-detokenized.yaml: -------------------------------------------------------------------------------- 1 | # wikitext_2_detokenized.yaml 2 | # Configuration for pre-detokenized WikiText-2 Dataset (https://huggingface.co/datasets/dlwh/wikitext_2_detokenized) 3 | --- 4 | dataset: 5 | id: dlwh/wikitext_2_detokenized 6 | 7 | # Number of Preprocessing Workers 8 | num_proc: 4 9 | 10 | # Number of Evaluation Preprocessing Workers 11 | eval_num_proc: 4 12 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext2.yaml: -------------------------------------------------------------------------------- 1 | # wikitext2.yaml 2 | # Configuration for WikiText-2 Dataset (https://huggingface.co/datasets/wikitext). 3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-2-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /tests/conf/deepspeed/z1-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/conf/deepspeed/z2-small-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.0006, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "MistralGPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.0, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.0, 9 | "eos_token_id": 50256, 10 | "gradient_checkpointing": false, 11 | 
"initializer_range": 0.02, 12 | "layer_norm_epsilon": 1e-05, 13 | "model_type": "gpt2", 14 | "n_ctx": 256, 15 | "n_embd": 768, 16 | "n_head": 2, 17 | "n_inner": null, 18 | "n_layer": 2, 19 | "n_positions": 256, 20 | "resid_pdrop": 0.0, 21 | "summary_activation": null, 22 | "summary_first_dropout": 0.0, 23 | "summary_proj_to_labels": true, 24 | "summary_type": "cls_index", 25 | "summary_use_proj": true, 26 | "task_specific_params": { 27 | "text-generation": { 28 | "do_sample": true, 29 | "max_length": 50 30 | } 31 | }, 32 | "transformers_version": "4.5.0", 33 | "use_cache": false, 34 | "vocab_size": 50257 35 | } 36 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-micro-config.yaml 2 | # Configuration for the GPT-2 Micro Model. 3 | --- 4 | model: 5 | id: "gpt2-small" 6 | 7 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 8 | gradient_checkpointing: false 9 | 10 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 11 | pretrained_tokenizer: true 12 | 13 | # Sequence Length 14 | seq_len: 256 15 | 16 | # Stability 17 | reorder_and_upcast_attn: true 18 | scale_attn_by_inverse_layer_idx: true 19 | 20 | # Initialize Weights from File 21 | initial_weights: null 22 | 23 | # Configure Model From File 24 | config_path: conf/models/gpt2-micro.json 25 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-config.yaml 2 | # Configuration for the GPT-2 Small Model. 3 | --- 4 | model: 5 | id: "gpt2-small" 6 | 7 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 8 | gradient_checkpointing: false 9 | 10 | # Add Gradient Checkpointing Every `gc_checkpoint_every` Transformer blocks 11 | # > Checkpoints = (# layers / `gc_checkpoint_every`) Blocks 12 | gc_checkpoint_every: -1 13 | 14 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 15 | pretrained_tokenizer: true 16 | 17 | # Sequence Length 18 | seq_len: 512 19 | 20 | # Stability -- Upcasting and Scaled Dot-Product Reordering 21 | reorder_attn: true 22 | upcast_attn: true 23 | 24 | # Initialize Weights from File 25 | initial_weights: null 26 | -------------------------------------------------------------------------------- /tests/conf/train-diff.yaml: -------------------------------------------------------------------------------- 1 | # hello-world.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 
6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - train.yaml 10 | - trainers/gpt2-small-diff.yaml 11 | 12 | # Artifacts & Caching 13 | artifacts: 14 | cache_dir: /nlp/scr/jebolton/mistral-hello-world/artifacts 15 | run_dir: /nlp/scr/jebolton/mistral-hello-world/runs 16 | 17 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 18 | # - Frequency (`freq`) at which to save checkpoints (# steps) 19 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 20 | checkpoint_frequency: 21 | - [10, 100] 22 | - [50, 2000] 23 | - [100, 20000] 24 | - [1000, 400000] 25 | 26 | # Random Seed 27 | seed: 40 28 | 29 | run_training: false 30 | run_final_eval: false 31 | -------------------------------------------------------------------------------- /tests/conf/train.yaml: -------------------------------------------------------------------------------- 1 | # hello-world.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext2-detokenized.yaml 10 | - models/gpt2-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: hello-world 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: 23 | run_dir: 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 16 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 18] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | online_eval: 59 | do_wikitext: false 60 | do_lambada: false 61 | stride: 256 62 | -------------------------------------------------------------------------------- /tests/conf/trainers/gpt2-small-diff.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-diff.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | 8 | inherit: 9 | - gpt2-small.yaml 10 | 11 | training_arguments: 12 | # Learning Rate & Optimization Parameters, assumes AdamW 13 | weight_decay: 0.2 14 | adam_beta1: 0.7 15 | adam_beta2: 0.3 16 | 17 | # Gradient Norm 18 | max_grad_norm: 2.0 19 | 20 | # Maximum Training Steps (Overrides epochs!) 21 | max_steps: 100000 22 | -------------------------------------------------------------------------------- /tests/conf/trainers/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware 16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 0.0006 27 | weight_decay: 0.1 28 | adam_beta1: 0.9 29 | adam_beta2: 0.95 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 400000 37 | 38 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 39 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 40 | warmup_steps: 4000 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 50 47 | 48 | # Saving and Evaluation Steps 49 | eval_steps: 1000 50 | save_steps: 1000 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: true 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 
67 | local_rank: null 68 | -------------------------------------------------------------------------------- /tests/run_deepspeed_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | tests = [x for x in os.listdir(".") if x.startswith("test") and x.endswith("py")] 7 | 8 | errors = 0 9 | for test in tests: 10 | # clean up if necessary 11 | for log_path in ["test.out", "test.err", "test.log"]: 12 | if os.path.exists(log_path): 13 | os.remove(log_path) 14 | # run tests 15 | try: 16 | print("Running test:", test) 17 | subprocess.check_call( 18 | f"CUDA_VISIBLE_DEVICES=0,1 deepspeed --num_gpus 2 --num_nodes 1 {test}", 19 | shell=True, 20 | ) 21 | except Exception: 22 | errors += 1 23 | if os.path.exists("test.log"): 24 | subprocess.call("cat test.log", shell=True) 25 | print("") 26 | 27 | for log_path in ["test.out", "test.err", "test.log"]: 28 | if os.path.exists(log_path): 29 | os.remove(log_path) 30 | 31 | if errors > 0: 32 | sys.exit(1) 33 | -------------------------------------------------------------------------------- /tests/setup/pip-requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/tests/setup/pip-requirements.txt -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | 3 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 4 | 5 | 6 | # paths 7 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 8 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 9 | 10 | TRAIN_ARGS = { 11 | "nnodes": "1", 12 | "nproc_per_node": "1", 13 | "config": "conf/train.yaml", 14 | "training_arguments.fp16": "false", 15 | "training_arguments.per_device_train_batch_size": "1", 16 | "artifacts.cache_dir": CACHE_DIR, 17 | "log_level": "50", 18 | "run_training": "false", 19 | "run_final_eval": "false", 20 | } 21 | 22 | TRAIN_ARGS_DIFF = copy(TRAIN_ARGS) 23 | TRAIN_ARGS_DIFF["config"] = "conf/train-diff.yaml" 24 | 25 | trainer_w_train = None 26 | trainer_w_train_diff = None 27 | 28 | 29 | def setup_module() -> None: 30 | global trainer_w_train, trainer_w_train_diff 31 | trainer_w_train = run_train_process(cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id="train_args_test") 32 | trainer_w_train_diff = run_train_process( 33 | cl_args_dict=TRAIN_ARGS_DIFF, runs_dir=RUNS_DIR, run_id="train_args_diff_test" 34 | ) 35 | 36 | 37 | def test_train_args() -> None: 38 | assert trainer_w_train.args.weight_decay == 0.1 39 | assert trainer_w_train.args.adam_beta1 == 0.9 40 | assert trainer_w_train.args.adam_beta2 == 0.95 41 | assert trainer_w_train.args.max_grad_norm == 1.0 42 | assert trainer_w_train_diff.args.weight_decay == 0.2 43 | assert trainer_w_train_diff.args.adam_beta1 == 0.7 44 | assert trainer_w_train_diff.args.adam_beta2 == 0.3 45 | assert trainer_w_train_diff.args.max_grad_norm == 2.0 46 | 47 | 48 | if __name__ == "__main__": 49 | run_tests() 50 | -------------------------------------------------------------------------------- /tests/test_eval_loss_is_defined.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy 4 | 5 | from src.core import OnlineBenchmarkTrainer 6 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 7 | 8 | 9 | # paths 10 | 
CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 11 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 12 | 13 | TRAIN_ARGS = { 14 | "config": "conf/train.yaml", 15 | "training_arguments.fp16": "false", 16 | "training_arguments.per_device_train_batch_size": "1", 17 | "artifacts.cache_dir": CACHE_DIR, 18 | "log_level": "50", 19 | "run_training": "true", 20 | "training_arguments.max_steps": "2", # just enough steps so HF doesn't complain about using zero 2 for inference 21 | "run_final_eval": "false", 22 | } 23 | 24 | trainer: OnlineBenchmarkTrainer = None 25 | metrics: dict = None 26 | 27 | 28 | def setup_module() -> None: 29 | global trainer, metrics 30 | trainer, metrics = run_train_process( 31 | cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id="train_eval_loss_is_defined", also_evaluate=True 32 | ) 33 | 34 | 35 | def test_train_args() -> None: 36 | assert any(numpy.isfinite(v) and re.match("eval.*loss", k) for k, v in metrics.items()) 37 | 38 | 39 | if __name__ == "__main__": 40 | run_tests() 41 | -------------------------------------------------------------------------------- /tests/test_fp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.cuda 3 | 4 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 5 | 6 | 7 | # common paths and resources for tests 8 | 9 | # paths 10 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 11 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 12 | RUN_ID = "upcasting_test" 13 | RUN_ID_DIR = f"{RUNS_DIR}/{RUN_ID}" 14 | 15 | # run training processes for tests 16 | TRAIN_ARGS = { 17 | "nnodes": "1", 18 | "nproc_per_node": "1", 19 | "config": "conf/train.yaml", 20 | "training_arguments.fp16": "true", 21 | "training_arguments.max_steps": "4", 22 | "artifacts.cache_dir": CACHE_DIR, 23 | "run_training": "true", 24 | "run_final_eval": "false", 25 | "log_level": "50", 26 | } 27 | 28 | 29 | def setup_module() -> None: 30 | global basic_trainer 31 | try: 32 | basic_trainer = run_train_process(cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id=RUN_ID) 33 | except Exception: 34 | basic_trainer = None 35 | 36 | 37 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="need cuda for fp16") 38 | def test_upcasting() -> None: 39 | """ 40 | Run training with upcasting 41 | """ 42 | assert basic_trainer is not None 43 | 44 | 45 | if __name__ == "__main__": 46 | run_tests() 47 | -------------------------------------------------------------------------------- /tests/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | from typing import Iterator 4 | 5 | from transformers import BatchEncoding 6 | 7 | from src.corpora.indexer import IndexedDataset 8 | 9 | 10 | def test_can_move_dataset_cache(): 11 | def token_iterator() -> Iterator[BatchEncoding]: 12 | for i in range(0, 100): 13 | yield BatchEncoding({"input_ids": [[i] * (i + 1)]}) 14 | 15 | with tempfile.TemporaryDirectory() as tempdir: 16 | orig_cache = tempdir + "/orig" 17 | orig_ds = IndexedDataset.build_or_load(token_iterator(), orig_cache, seq_len=5, stride=1) 18 | 19 | new_cache = tempdir + "/new" 20 | # copy the cache 21 | shutil.copytree(orig_cache, new_cache) 22 | 23 | new_ds = IndexedDataset(new_cache, seq_len=5, stride=1) 24 | 25 | assert list(orig_ds) == list(new_ds) 26 | -------------------------------------------------------------------------------- /tests/test_online_benchmark_trainer.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from transformers import TrainingArguments 4 | 5 | 6 | try: 7 | from torchdata.datapipes.iter import IterDataPipe 8 | except ImportError: 9 | from torch.utils.data import IterDataPipe 10 | 11 | from src.core.trainer import OnlineBenchmarkTrainer 12 | 13 | 14 | def test_ob_trainer_different_processes_different_data(): 15 | class DummyModel(object): 16 | def __init__(self, *args, **kwargs): 17 | pass 18 | 19 | def __call__(self, *args, **kwargs): 20 | return None 21 | 22 | def to(self, *args, **kwargs): 23 | return self 24 | 25 | def forward(self, *args, **kwargs): 26 | return None 27 | 28 | class FakeTrainingArguments(TrainingArguments): 29 | def __init__(self, process_index): 30 | self._process_index = process_index 31 | 32 | @property 33 | def should_save(self): 34 | return False 35 | 36 | @property 37 | def world_size(self): 38 | return 2 39 | 40 | @property 41 | def process_index(self): 42 | return self._process_index 43 | 44 | def get_process_log_level(self): 45 | return logging.INFO 46 | 47 | @property 48 | def report_to(self): 49 | return [] 50 | 51 | @property 52 | def max_steps(self): 53 | return 100 54 | 55 | class FakeTrainDataset(IterDataPipe): 56 | def __init__(self): 57 | pass 58 | 59 | def __iter__(self): 60 | for i in range(128): 61 | yield {"input_ids": [i] * 3, "labels": [i]} 62 | 63 | """Test that online benchmark trainer gives different data to different processes.""" 64 | trainer1 = OnlineBenchmarkTrainer( 65 | model=DummyModel(), # type: ignore 66 | args=FakeTrainingArguments(0), 67 | train_dataset=FakeTrainDataset(), 68 | ) 69 | 70 | trainer2 = OnlineBenchmarkTrainer( 71 | model=DummyModel(), # type: ignore 72 | args=FakeTrainingArguments(1), 73 | train_dataset=FakeTrainDataset(), 74 | ) 75 | 76 | d1 = list(trainer1.get_train_dataloader()) 77 | d2 = list(trainer2.get_train_dataloader()) 78 | 79 | # data is List[Dict[str, Tensor2]] 80 | # we have to convert to List[List[int]] to compare 81 | d1 = [[[y.item() for y in x] for x in d["input_ids"]] for d in d1] 82 | d2 = [[[y.item() for y in x] for x in d["input_ids"]] for d in d2] 83 | 84 | assert d1 != d2 85 | -------------------------------------------------------------------------------- /tests/test_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests import MISTRAL_TEST_DIR, check_samples_equal, get_samples, run_tests, run_train_process 4 | 5 | 6 | # paths 7 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 8 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 9 | RUN_ID = "train_args_test" 10 | RUN_ID_DIR = f"{RUNS_DIR}/{RUN_ID}" 11 | 12 | # set up different trainers to see initialization differences 13 | TRAIN_ARGS_SEED_7 = { 14 | "nnodes": "1", 15 | "nproc_per_node": "1", 16 | "config": "conf/train.yaml", 17 | "training_arguments.fp16": "false", 18 | "training_arguments.per_device_train_batch_size": "1", 19 | "artifacts.cache_dir": CACHE_DIR, 20 | "seed": "7", 21 | "log_level": "50", 22 | "run_training": "false", 23 | "run_final_eval": "false", 24 | } 25 | 26 | TRAIN_ARGS_SEED_10 = dict(TRAIN_ARGS_SEED_7) 27 | TRAIN_ARGS_SEED_10["seed"] = "10" 28 | 29 | trainer_seed_7 = None 30 | trainer_seed_10 = None 31 | trainer_seed_7_copy = None 32 | 33 | 34 | def setup_module() -> None: 35 | global trainer_seed_7, trainer_seed_10, trainer_seed_7_copy 36 | trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 
37 | trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 38 | trainer_seed_7_copy = run_train_process( 39 | cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7_copy" 40 | ) 41 | 42 | 43 | def is_randomized(key): 44 | """ 45 | Helper to determine if the key in the state_dict() is a set of parameters that is randomly initialized. 46 | Some weights are not randomly initalized and won't be afffected by seed, particularly layer norm 47 | weights and biases, and bias terms in general. 48 | """ 49 | # regexes for components that are not randomized 50 | if key.endswith("bias") or "ln" in key: 51 | return False 52 | else: 53 | return True 54 | 55 | 56 | def test_weight_initializations() -> None: 57 | # trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 58 | # trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 59 | 60 | assert trainer_seed_7.model.state_dict().keys() == trainer_seed_10.model.state_dict().keys() 61 | for key in trainer_seed_7.model.state_dict().keys(): 62 | if is_randomized(key): 63 | assert not torch.equal( 64 | trainer_seed_7.model.state_dict()[key], trainer_seed_10.model.state_dict()[key] 65 | ), f"weights are the same for {key}" 66 | 67 | 68 | def test_data_order() -> None: 69 | # trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 70 | # trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 71 | 72 | seed_7_dataloader = trainer_seed_7.get_train_dataloader() 73 | seed_10_dataloader = trainer_seed_10.get_train_dataloader() 74 | 75 | seed_7_data, seed_10_data = get_samples(seed_7_dataloader), get_samples(seed_10_dataloader) 76 | 77 | seed_7_copy_dataloader = trainer_seed_7_copy.get_train_dataloader() 78 | seed_7_copy_data = get_samples(seed_7_copy_dataloader) 79 | 80 | assert check_samples_equal(seed_7_copy_data, seed_7_data), "data is not the same" 81 | assert not check_samples_equal(seed_10_data, seed_7_data), "data order should be different for different seeds" 82 | 83 | 84 | if __name__ == "__main__": 85 | run_tests() 86 | -------------------------------------------------------------------------------- /tests/test_valid_configs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | from unittest.mock import patch 4 | 5 | from quinine.common.argparse import QuinineArgumentParser 6 | 7 | import tests 8 | from conf.train_schema import get_schema 9 | 10 | 11 | expected_manual_args = [ 12 | "--artifacts.cache_dir", 13 | "artifacts/cache", 14 | "--artifacts.run_dir", 15 | "artifacts/runs", 16 | ] 17 | 18 | 19 | def validate_config(config_file): 20 | try: 21 | cl_args = ["--config", str(config_file)] + expected_manual_args 22 | with patch.object(sys, "argv", ["foo.py"] + cl_args): 23 | QuinineArgumentParser(schema=get_schema()).parse_quinfig() 24 | except Exception as e: 25 | raise Exception(f"{config_file} is not valid: {e}") from e 26 | 27 | 28 | def test_all_test_configs_are_valid(): 29 | # test all the yaml files in the main mistral conf directory, and in the mistral tests/conf directory 30 | test_root = pathlib.Path(tests.__file__).parent.absolute() 31 | 32 | for path in pathlib.Path(test_root).glob("conf/*.yaml"): 33 | validate_config(path) 34 | 35 | 36 | def 
test_all_real_configs_are_valid(): 37 | # test all the yaml files in the main mistral conf directory, and in the mistral tests/conf directory 38 | mistral_root = pathlib.Path(tests.__file__).parent.parent.absolute() 39 | 40 | for path in pathlib.Path(mistral_root).glob("conf/*.yaml"): 41 | validate_config(path) 42 | 43 | 44 | if __name__ == "__main__": 45 | tests.run_tests() 46 | -------------------------------------------------------------------------------- /tutorials/custom-dataset/README.md: -------------------------------------------------------------------------------- 1 | # Train On Custom Dataset 2 | 3 | ## Create Directory With Your Text 4 | 5 | Put text into `*.jsonl` files, one document per line. 6 | 7 | ``` 8 | {"text": "Document one ..."} 9 | {"text": "Document two ..."} 10 | ... 11 | ``` 12 | 13 | You can have arbitrarily many files. Files matching `*train*` will be used as 14 | training data and files with `*validation*` will be used as validation data. 15 | 16 | For example, if you are training on PubMed data, you would have something like 17 | this: 18 | 19 | ``` 20 | /path/to/pubmed_local 21 | pubmed_train.jsonl 22 | pubmed_validation.jsonl 23 | ``` 24 | 25 | Each line of those files would be a document in the format described above. 26 | An example of a custom dataset can be found at 27 | `tutorials/custom-dataset/shakespeare`. 28 | 29 | 30 | ## Set up dataset config file in `conf/datasets` 31 | 32 | In the dataset config file, specify the number of workers you need to 33 | process the data and the path to the custom dataset on your machine. 34 | 35 | An example config file for the Shakespeare dataset is at 36 | `conf/datasets/shakespeare.yaml`. 37 | 38 | 39 | ## Specify Your New Dataset In The Overall Experiment Config 40 | 41 | Remember to specify this dataset in your overall experiment config. This is 42 | typically done at the top in the inherit section. For example, 43 | 44 | ``` 45 | # Inherit Dataset, Tokenization, Model, and Training Details 46 | inherit: 47 | - datasets/pubmed_local.yaml 48 | - models/mistral-small.yaml 49 | - trainers/gpt2-small.yaml 50 | ``` 51 | 52 | An example of the config file can be found at 53 | `conf/tutorial-shakespeare-gpt2-micro.yaml`. We train a GPT-2 micro 54 | (~11m parameters) model on Shakespeare text for that example. 55 | -------------------------------------------------------------------------------- /tutorials/gcp-on-demand/README.md: -------------------------------------------------------------------------------- 1 | # Run Mistral On GCP (on demand) 2 | 3 | ## Create An A100 With 8 GPU 4 | 5 | Go to the VM instances page and click "Create Instance" 6 | 7 | Give it an informative name (e.g. "mistral-gcp-demo") 8 | 9 | Choose `europe-west4 (Netherlands)` as the zone 10 | 11 | Select GPU machine, and choose NVIDIA Tesla A100, with 8 GPUs 12 | 13 | Customize the Boot disk OS to "Deep Learning on Linux"/"Debian 10 based Deep Learning VM with CUDA 11.3 M93" 14 | 15 | Update the size to 1 TB (or whatever you feel you need) 16 | 17 | Hit "Create" ! 18 | 19 | Wait a few minutes, and then click the "SSH" button on the VM page. Hit "y" when asked to install drivers. 20 | 21 | At this point the machine should be set up and operational. Run `nvidia-smi` to confirm. 22 | 23 | 24 | ## Clone Mistral 25 | 26 | Clone the repo 27 | 28 | ``` 29 | git clone https://github.com/stanford-crfm/mistral.git 30 | ``` 31 | 32 | ## Create Mistral conda environment 33 | 34 | Follow the instructions on the main README for setting up the conda env. 
35 | 36 | Generally this will be: 37 | 38 | ``` 39 | cd setup 40 | bash setup.sh 41 | ``` 42 | 43 | ## Set Up WandB 44 | 45 | ``` 46 | cd mistral 47 | wandb login # enter your API key at the prompt 48 | wandb init 49 | mkdir /home/username/data # create directory for storing runs and artifacts 50 | ``` 51 | 52 | ## Modify Config File 53 | 54 | Alter the config file in `conf/gpt2-small.yaml` to customize the datasets you use. 55 | 56 | In particular, update the `artifacts` entry: 57 | 58 | ``` 59 | artifacts: 60 | cache_dir: /home/username/data/artifacts 61 | run_dir: /home/username/data/runs 62 | ``` 63 | 64 | ## Launch The Training Run 65 | 66 | This command launches the training run with DeepSpeed: 67 | 68 | ``` 69 | deepspeed --num_gpus 8 --num_nodes 1 --master_addr localhost train.py --config conf/gpt2-small.yaml --nnodes 1 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id mistral-june22-demo 70 | ``` 71 | --------------------------------------------------------------------------------
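For the custom-dataset format described in `tutorials/custom-dataset/README.md`, the `*.jsonl` files can be produced with a few lines of Python. The following is a minimal sketch rather than a repository script: the output directory, file names, and document list are placeholders, and only the one-record-per-line `{"text": ...}` layout is prescribed by the tutorial.

```python
import json
from pathlib import Path
from typing import Iterable


def write_jsonl(path: Path, documents: Iterable[str]) -> None:
    """Write one {"text": ...} record per line, matching the custom-dataset format."""
    with path.open("w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps({"text": doc}) + "\n")


# Hypothetical output location and contents -- substitute your own corpus.
out_dir = Path("/path/to/pubmed_local")
out_dir.mkdir(parents=True, exist_ok=True)
write_jsonl(out_dir / "pubmed_train.jsonl", ["Document one ...", "Document two ..."])
write_jsonl(out_dir / "pubmed_validation.jsonl", ["Held-out document ..."])
```

Files whose names match `*train*` are used as training data and files matching `*validation*` as validation data, so the naming above mirrors the PubMed example in the tutorial.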