├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── pull_request_template.md └── workflows │ ├── pre-commit.yaml │ └── run-tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── conf ├── archive │ ├── old-benchmarking │ │ ├── gpt2-benchmark-config.yaml │ │ ├── gpt2-intensive-config.yaml │ │ └── gpt2-toy-config.yaml │ ├── partial-checkpointing │ │ └── gpt2-mistral-medium-gcheck-config.yaml │ └── v1 │ │ ├── gpt2-debug-config.yaml │ │ ├── gpt2-mistral-medium-config.yaml │ │ ├── gpt2-mistral-medium-gcp-config.yaml │ │ ├── gpt2-mistral-mini-config.yaml │ │ ├── gpt2-mistral-small-gcp-config.yaml │ │ ├── gpt2-scaling-config.yaml │ │ └── tutorial-gpt2-micro.yaml ├── datasets │ ├── openwebtext.yaml │ ├── shakespeare.yaml │ ├── wikitext103.yaml │ └── wikitext2.yaml ├── deepspeed │ ├── debug-conf.json │ ├── hostfile │ ├── z1-conf.json │ ├── z1-offload-conf.json │ ├── z2-debug-conf.json │ ├── z2-medium-conf.json │ ├── z2-offload-conf.json │ ├── z2-small-conf.json │ ├── z3-conf.json │ └── z3-offload-conf.json ├── mistral-medium.yaml ├── mistral-micro.yaml ├── mistral-small.yaml ├── models │ ├── mistral-medium.yaml │ ├── mistral-micro.json │ ├── mistral-micro.yaml │ └── mistral-small.yaml ├── train_schema.py ├── trainers │ ├── benchmark.yaml │ ├── gpt2-medium.yaml │ ├── gpt2-small-short.yaml │ ├── gpt2-small.yaml │ └── intensive.yaml └── tutorial-shakespeare-gpt2-micro.yaml ├── docs ├── LICENSE ├── Makefile ├── README.md ├── _static │ ├── pydata-custom.css │ └── readthedocs-custom.css ├── _templates │ ├── custom-class-template.rst │ ├── custom-module-template.rst │ └── layout.html ├── api.rst ├── conf.py ├── contributing.rst ├── fork.png ├── getting_started.rst ├── getting_started │ ├── config.rst │ ├── download.rst │ ├── evaluate.rst │ ├── install.rst │ ├── train-output.txt │ ├── train.rst │ └── wandb_example.png ├── hugging_face_differences.rst ├── index.rst ├── mistral_components.png ├── scripts │ └── build_download_tables.py └── tutorials │ ├── cluster_basics.png │ ├── deepspeed.rst │ ├── gcp_plus_kubernetes.rst │ ├── generate.rst │ ├── gke_standard.png │ ├── kubernetes_menu.png │ ├── multi-gpu.rst │ ├── node_pool.png │ ├── node_pool_gpu.png │ ├── resume.rst │ └── tutorial_cluster.png ├── environments ├── Dockerfile ├── environment-cpu.yaml ├── environment-gpu.yaml ├── environment-m1.yaml └── export.py ├── gcp ├── Dockerfile ├── job-gpt2-micro.yaml ├── pod-gpu.yaml ├── pod.yaml └── run-demo-job.sh ├── generate_text.ipynb ├── mistral_models.json ├── mypy.ini ├── pyproject.toml ├── scripts ├── README.md ├── benchmarking │ ├── dial-in │ │ ├── mistral-gpt2-medium.sh │ │ └── mistral-gpt2-small.sh │ ├── intensive-benchmarking │ │ ├── ddp-multi.sh │ │ ├── deepspeed-multi.sh │ │ └── fairscale-multi.sh │ └── standard-benchmarking │ │ ├── README.md │ │ ├── ddp-multi.sh │ │ ├── ddp-single.sh │ │ ├── deepspeed-multi.sh │ │ ├── deepspeed-single.sh │ │ ├── ds-evaluation-bsz.sh │ │ ├── fairscale-multi.sh │ │ ├── fairscale-single.sh │ │ └── vanilla.sh ├── debugging │ ├── resuming │ │ └── resume-single-node.sh │ └── sanity │ │ └── mistral-sanity-gpt2-small.sh ├── forget-me-not.sh ├── mistral-gcp-gpt2-medium.sh ├── mistral-gcp-gpt2-small.sh ├── mistral-gpt2-medium.sh ├── mistral-gpt2-small.sh └── run │ ├── ddp.sh │ ├── deepspeed.sh │ ├── fairscale.sh │ ├── multi-node.sh │ └── single-node.sh ├── setup ├── conda-requirements.txt ├── pip-requirements.txt ├── setup.sh └── test-requirements.txt ├── src 
├── __init__.py ├── args │ ├── __init__.py │ └── training_args.py ├── core │ ├── __init__.py │ ├── callbacks.py │ └── trainer.py ├── corpora │ ├── __init__.py │ ├── auto.py │ ├── detokenization.py │ ├── indexer.py │ └── tokenization_utils.py ├── models │ ├── __init__.py │ └── auto_clm.py ├── overwatch │ ├── __init__.py │ └── overwatch.py └── util │ ├── __init__.py │ ├── paths.py │ └── registry.py ├── tests ├── README.md ├── __init__.py ├── conf │ ├── datasets │ │ ├── wikitext103.yaml │ │ ├── wikitext2-detokenized.yaml │ │ └── wikitext2.yaml │ ├── deepspeed │ │ ├── z1-conf.json │ │ └── z2-small-conf.json │ ├── models │ │ ├── gpt2-micro.json │ │ ├── gpt2-micro.yaml │ │ └── gpt2-small.yaml │ ├── train-diff.yaml │ ├── train.yaml │ └── trainers │ │ ├── gpt2-small-diff.yaml │ │ └── gpt2-small.yaml ├── run_deepspeed_tests.py ├── setup │ └── pip-requirements.txt ├── test_args.py ├── test_checkpoint.py ├── test_eval_loss_is_defined.py ├── test_fp.py ├── test_indexed_dataset.py ├── test_online_benchmark_trainer.py ├── test_seed.py └── test_valid_configs.py ├── train.py └── tutorials ├── custom-dataset ├── README.md └── shakespeare │ ├── shakespeare.train.jsonl │ └── shakespeare.validation.jsonl └── gcp-on-demand └── README.md /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 119 4 | ignore = E203, E501, W503, W605 5 | per-file-ignores = 6 | */__init__.py: F401 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the bug. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here (e.g. launching with DeepSpeed?, OS, library versions, hardware, etc...) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** Before you start, make sure to check out: ** 11 | * Documentation on our [Read The Docs](https://nlp.stanford.edu/mistral/) site. 12 | * [GitHub Issues](https://github.com/stanford-mercury/mistral/issues) 13 | 14 | These sources may already contain the answer to your question! 15 | 16 | If you still can't find an answer, erase this template and add your question. Please try to provide as much detail as possible so we can quickly and accurately respond! 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | A brief and concise description of what your pull request is trying to accomplish. 3 | 4 | ## Fixes Issues 5 | A list of issues/bugs with # references. (e.g., #123) 6 | 7 | ## Unit test coverage 8 | Are there unit tests in place to make sure your code is functioning correctly? 9 | 10 | ## Known breaking changes/behaviors 11 | Does this break anything in Mistral's existing user interface? If so, what is it and how is it addressed? 12 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - uses: actions/setup-python@v3 14 | - uses: pre-commit/action@v3.0.0 -------------------------------------------------------------------------------- /.github/workflows/run-tests.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Run Tests 3 | on: [push] 4 | jobs: 5 | Run-Mistral-Tests: 6 | runs-on: self-hosted 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 10 | - name: Check out repository code 11 | uses: actions/checkout@v2 12 | - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." 13 | - run: echo "🖥️ The workflow is now ready to test your code on the runner." 14 | - name: Setting up Conda Environment 15 | run: | 16 | echo "Setting up conda env for this test!" 
17 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 18 | bash setup/setup.sh mistral-${{github.sha}} 19 | conda activate mistral-${{github.sha}} 20 | - name: Installing test dependencies 21 | run: | 22 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 23 | conda activate mistral-${{github.sha}} 24 | pip install -r setup/test-requirements.txt 25 | - name: Setting up environment variables 26 | run: | 27 | echo 'Deactivating wandb' 28 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 29 | conda activate mistral-${{github.sha}} 30 | cd tests ; wandb disabled 31 | echo 'MISTRAL_TEST_DIR:' 32 | echo $MISTRAL_TEST_DIR 33 | - name: Run tests (single node/single GPU) 34 | run: | 35 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 36 | conda activate mistral-${{github.sha}} 37 | cd tests 38 | echo 'Clearing artifacts' 39 | rm -rf $MISTRAL_TEST_DIR/artifacts 40 | CUDA_VISIBLE_DEVICES=0 pytest --durations=0 41 | - name: Run tests (Deepspeed 2xGPUs) 42 | run: | 43 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 44 | conda activate mistral-${{github.sha}} 45 | export PYTHONPATH=${GITHUB_WORKSPACE} 46 | cd tests 47 | echo 'Clearing artifacts' 48 | rm -rf $MISTRAL_TEST_DIR/artifacts 49 | python run_deepspeed_tests.py 50 | - name: Delete conda environment 51 | if: always() 52 | run: | 53 | eval "$(conda 'shell.bash' 'hook' 2>/dev/null)" # make conda happy 54 | conda deactivate 55 | conda env remove -n mistral-${{github.sha}} 56 | - run: echo "All tests finished!" 57 | - run: echo "🍏 This job's status is ${{ job.status }}." 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # JetBrains 132 | .idea/ 133 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: ".git" # TODO - add tox/nox files if we ever get around to implementing testing 4 | default_stages: 5 | - commit 6 | fail_fast: true 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.0.1 11 | hooks: 12 | - id: trailing-whitespace 13 | - id: end-of-file-fixer 14 | - id: check-yaml 15 | - id: check-toml 16 | - id: check-merge-conflict 17 | - id: check-added-large-files 18 | 19 | - repo: https://github.com/psf/black 20 | rev: 22.3.0 21 | hooks: 22 | - id: black 23 | 24 | - repo: https://github.com/timothycrosley/isort 25 | rev: 5.9.3 26 | hooks: 27 | - id: isort 28 | 29 | - repo: https://github.com/PyCQA/flake8 30 | rev: 3.9.2 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: [flake8-isort] 34 | 35 | - repo: https://github.com/pre-commit/mirrors-mypy 36 | rev: 'v0.960' 37 | hooks: 38 | - id: mypy 39 | args: [--ignore-missing-imports] 40 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Mistral 2 | 3 | Please see the full contribution guidelines on our [Read The Docs](https://nlp.stanford.edu/mistral/contributing.html) page. 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | .PHONY: help serialize-env check autoformat prune 3 | .DEFAULT: help 4 | 5 | # Create Valid Architectures 6 | ARCHITECTURES := cpu gpu 7 | 8 | # Generates a useful overview/help message for various make features - add to this as necessary! 9 | help: 10 | @echo "make serialize-env arch=" 11 | @echo " After (un)installing dependencies, dump environment.yaml for arch :: < cpu | gpu >." 12 | @echo "make prune" 13 | @echo " Pull all branches from git, and prune all local branches that are merged in origin." 14 | @echo "make check" 15 | @echo " Run code style and linting (black, flake, isort) *without* changing files!" 16 | @echo "make autoformat" 17 | @echo " Run code styling (black, isort) and update in place - committing with pre-commit also does this." 
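# Example invocations of the targets documented above (illustrative; assumes the repo's conda
# environment is already active):
#   make check                    # lint with isort/black/flake8, no files modified
#   make autoformat               # apply isort + black in place
#   make serialize-env arch=gpu   # or arch=cpu -- regenerate the environment YAML via environments/export.py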
18 | 19 | serialize-env: 20 | ifneq ($(filter $(arch),$(ARCHITECTURES)),) 21 | python environments/export.py -a $(arch) 22 | else 23 | @echo "Argument 'arch' is not set - try calling 'make serialize-env arch=' with ID = < cpu | gpu >." 24 | endif 25 | 26 | check: 27 | isort --check . 28 | black --check . 29 | flake8 . 30 | 31 | autoformat: 32 | isort --atomic . 33 | black . 34 | 35 | prune: 36 | @bash -c "git fetch -p"; 37 | @bash -c "for branch in $(git branch -vv | grep ': gone]' | awk '{print $1}'); do git branch -d $branch; done"; 38 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-benchmark-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-benchmark-config.yaml 2 | # Benchmarking GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and 3 | # full batch size (512). Support for Single-Node, Multi-Node, Mixed Precision, DDP, FairScale, and DeepSpeed. 4 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 5 | --- 6 | # Inherit Dataset, Tokenization, Model, and Training Details 7 | inherit: 8 | - datasets/openwebtext.yaml 9 | - models/gpt2-small.yaml 10 | - trainers/benchmark.yaml 11 | 12 | # Run ID -- defaults to `null`; override as you like! 13 | run_id: null 14 | 15 | # Weights & Biases 16 | wandb: mistral-benchmarking 17 | group: null 18 | 19 | # Artifacts & Caching 20 | artifacts: 21 | cache_dir: /scr-ssd/mercury/mistral/artifacts 22 | run_dir: /scr-ssd/mercury/mistral/runs 23 | 24 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 25 | effective_bsz: 512 26 | 27 | # Resume from Checkpoint 28 | resume: false 29 | resume_checkpoint: null 30 | 31 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 32 | local_rank: -1 33 | nnodes: -1 34 | nproc_per_node: -1 35 | 36 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 37 | num_gpus: -1 38 | num_nodes: -1 39 | world_size: -1 40 | 41 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 42 | log_level: 20 43 | 44 | # Random Seed 45 | seed: 21 46 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-intensive-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-intensive-config.yaml 2 | # Intensive Benchmarking GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and 3 | # full batch size (512). Support for Multi-Node Mixed Precision runs, for final round of benchmarking of DDP, 4 | # FairScale, and DeepSpeed. 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/openwebtext.yaml 11 | - models/gpt2-small.yaml 12 | - trainers/intensive.yaml 13 | 14 | # Run ID -- defaults to `null`; override as you like! 
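# As the header notes, values like the run ID can also be overridden at launch time rather than
# edited here; a hypothetical invocation (the exact flag spelling depends on train.py's CLI):
#   python train.py --config conf/archive/old-benchmarking/gpt2-intensive-config.yaml --run_id my-intensive-run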
15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-benchmarking 19 | group: intensive 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /scr-ssd/mercury/mistral/artifacts 24 | run_dir: /scr-ssd/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 34 | local_rank: -1 35 | nnodes: -1 36 | nproc_per_node: -1 37 | 38 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 39 | num_gpus: -1 40 | num_nodes: -1 41 | world_size: -1 42 | 43 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 44 | log_level: 20 45 | 46 | # Random Seed 47 | seed: 21 48 | -------------------------------------------------------------------------------- /conf/archive/old-benchmarking/gpt2-toy-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-toy-config.yaml 2 | # Toy GPT-2 Config, currently working with the WikiText-103 Dataset, GPT-2 Small Architecture, and Single-Node 3 | # Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files. 4 | --- 5 | # Inherit Dataset, Tokenization, Model, and Training Details 6 | inherit: 7 | - datasets/wikitext103.yaml 8 | - models/gpt2-small.yaml 9 | - trainers/toy.yaml 10 | 11 | # Run ID -- defaults to `null`; override as you like! 12 | run_id: null 13 | 14 | # Weights & Biases (Set os.environ["WANDB_PROJECT"]) 15 | wandb: mistral-debugging 16 | group: null 17 | 18 | # Artifacts & Caching 19 | artifacts: 20 | cache_dir: /u/scr/nlp/mercury/mistral/artifacts 21 | run_dir: /u/scr/nlp/mercury/mistral/runs 22 | 23 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 24 | bsz: 8 25 | 26 | # Resume from Checkpoint 27 | resume: false 28 | resume_checkpoint: null 29 | 30 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 31 | local_rank: -1 32 | nnodes: 1 33 | nproc_per_node: 8 34 | 35 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 36 | num_gpus: 8 37 | num_nodes: 1 38 | world_size: 8 39 | 40 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 41 | log_level: 20 42 | 43 | # Random Seed 44 | seed: 21 45 | -------------------------------------------------------------------------------- /conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-gcheck-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the WikiText Dataset, GPT-2 Medium Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4/8. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - ../../datasets/wikitext103.yaml 10 | - ../../models/gpt2-medium.yaml 11 | - ../../trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 
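# A note on the Weights & Biases keys a few lines below: the `wandb` value appears to act as the
# project name, so logging can also be steered (or silenced) through the wandb tooling itself --
# e.g. setting the WANDB_PROJECT environment variable, or running `wandb disabled` as the CI
# workflow above does. A minimal sketch mirroring the `wandb:` key:
#   import os
#   os.environ["WANDB_PROJECT"] = "mistral-debugging"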
14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-debugging 18 | group: gpt2-medium 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 33 | local_rank: -1 34 | nnodes: -1 35 | nproc_per_node: -1 36 | 37 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 38 | num_gpus: -1 39 | num_nodes: -1 40 | world_size: -1 41 | 42 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 43 | log_level: 20 44 | 45 | # Random Seed 46 | seed: 21 47 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-debug-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-sphinx-debug-config.yaml 2 | # Debugging GPT-2 Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, and Single-Node 3 | # Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files. 4 | --- 5 | # Inherit Dataset, Tokenization, Model, and Training Details 6 | inherit: 7 | - datasets/openwebtext.yaml 8 | - models/gpt2-small.yaml 9 | - trainers/debug.yaml 10 | 11 | # Run ID -- defaults to `null`; override as you like! 12 | run_id: null 13 | 14 | # Weights & Biases 15 | wandb: mistral-sanity 16 | group: null 17 | 18 | # Artifacts & Caching 19 | artifacts: 20 | cache_dir: /scr-ssd/mercury/mistral/artifacts 21 | run_dir: /scr-ssd/mercury/mistral/runs 22 | 23 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 24 | effective_bsz: 512 25 | 26 | # Resume from Checkpoint 27 | resume: false 28 | resume_checkpoint: null 29 | 30 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 31 | # - Frequency (`freq`) at which to save checkpoints (# steps) 32 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 33 | checkpoint_frequency: 34 | - [10, 100] 35 | - [25, 1000] 36 | - [50, 2000] 37 | - [100, 4000] 38 | 39 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 40 | local_rank: -1 41 | nnodes: -1 42 | nproc_per_node: -1 43 | 44 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 45 | num_gpus: -1 46 | num_nodes: -1 47 | world_size: -1 48 | 49 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 50 | log_level: 20 51 | 52 | # Random Seed 53 | seed: 21 54 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-medium-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Medium 3 | # Architecture, and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4. 
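# A quick, illustrative decomposition of those numbers, using the two-node, 8-GPU-per-node layout
# implied by conf/deepspeed/hostfile and the formula quoted in the trainer configs
# [steps = effective_batch / (per_gpu_batch * gpus * nodes)]:
#   per_device_bsz, gpus_per_node, nodes = 4, 8, 2
#   gradient_accumulation_steps = 512 // (per_device_bsz * gpus_per_node * nodes)   # == 8
#   assert gradient_accumulation_steps * per_device_bsz * gpus_per_node * nodes == 512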
4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-medium.yaml 11 | - trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-medium 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-medium-gcp-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-medium-gcp-config.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Medium 3 | # Architecture, and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 4. 4 | # Written for Google Cloud! 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/openwebtext.yaml 11 | - models/gpt2-medium.yaml 12 | - trainers/gpt2-medium.yaml 13 | 14 | # Run ID -- make sure to override! 15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-gpt2 19 | group: gpt2-medium 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /home/data/mercury/mistral/artifacts 24 | run_dir: /home/data/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-mini-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-small-config.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-mini.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /scr-ssd/mercury/mistral/artifacts 23 | run_dir: /scr-ssd/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-mistral-small-gcp-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-mistral-small-gcp-config.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). 
Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. Written for Google Cloud! 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/openwebtext.yaml 10 | - models/gpt2-small.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: mistral-gpt2 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /home/data/mercury/mistral/artifacts 23 | run_dir: /home/data/mercury/mistral/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/archive/v1/gpt2-scaling-config.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-scaling-config.yaml 2 | # GPT-2 Scaling Config for benchmarking memory footprint and training time for various GPT-2 Architectures, working 3 | # with the WikiText-103 Dataset (assuming data loading doesn't affect GPU Memory), sequence length of 1024, 4 | # and full batch size (512). 5 | # 6 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 7 | --- 8 | # Inherit Dataset, Tokenization, Model, and Training Details 9 | inherit: 10 | - datasets/wikitext103.yaml 11 | - models/gpt2-small.yaml 12 | - trainers/gpt2-small.yaml 13 | 14 | # Run ID -- make sure to override! 15 | run_id: null 16 | 17 | # Weights & Biases 18 | wandb: mistral-scaling 19 | group: null 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: /scr-ssd/mercury/mistral/artifacts 24 | run_dir: /scr-ssd/mercury/mistral/runs 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
27 | effective_bsz: 512 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | -------------------------------------------------------------------------------- /conf/archive/v1/tutorial-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # tutorial-gpt2-micro.yaml 2 | # Demo GPT-2 Micro Training Config, currently working with the WikiText103 Dataset, GPT-2 Micro Architecture, 3 | # and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/gpt2-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: 23 | run_dir: 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
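# A minimal single-GPU launch of this tutorial config might look like the following (hypothetical --
# the flag names assume train.py's CLI and should be checked against the Read The Docs tutorial):
#   CUDA_VISIBLE_DEVICES=0 python train.py \
#       --config conf/archive/v1/tutorial-gpt2-micro.yaml \
#       --nnodes 1 --nproc_per_node 1 \
#       --run_id tutorial-gpt2-micro-demo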
26 | effective_bsz: 32 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 10] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | # Set Eval Stride 59 | online_eval: 60 | stride: 256 61 | -------------------------------------------------------------------------------- /conf/datasets/openwebtext.yaml: -------------------------------------------------------------------------------- 1 | # openwebtext.yaml 2 | # Configuration for OpenWebText Dataset (https://huggingface.co/datasets/openwebtext) 3 | --- 4 | dataset: 5 | id: openwebtext 6 | name: null 7 | validation_ratio: 0.0005 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 64 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/datasets/shakespeare.yaml: -------------------------------------------------------------------------------- 1 | # shakespeare.yaml 2 | # Configuration for Shakespeare dataset at tutorials/custom-dataset/shakespeare. 3 | --- 4 | dataset: 5 | id: shakespeare 6 | name: shakespeare 7 | dataset_dir: tutorials/custom-dataset/shakespeare 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/datasets/wikitext103.yaml: -------------------------------------------------------------------------------- 1 | # wikitext103.yaml 2 | # Configuration for WikiText-103 Dataset (https://huggingface.co/datasets/wikitext). 3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-103-raw-v1 7 | 8 | # Number of Preprocessing Workers 9 | num_proc: 4 10 | 11 | # Number of Evaluation Preprocessing Workers 12 | eval_num_proc: 4 13 | -------------------------------------------------------------------------------- /conf/datasets/wikitext2.yaml: -------------------------------------------------------------------------------- 1 | # wikitext2.yaml 2 | # Configuration for WikiText-2 Dataset (https://huggingface.co/datasets/wikitext). 
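# The `id` / `name` pair below are the usual Hugging Face `datasets` coordinates; outside of
# Mistral's own preprocessing pipeline, the raw corpus can be pulled with a one-liner (sketch):
#   from datasets import load_dataset
#   raw = load_dataset("wikitext", "wikitext-2-raw-v1")   # train / validation / test splits of plain text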
3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-2-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /conf/deepspeed/debug-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "hysteresis": 2, 7 | "min_loss_scale": 1 8 | }, 9 | 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 5e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 5e8, 17 | "contiguous_gradients": true, 18 | "cpu_offload": true 19 | }, 20 | 21 | "optimizer": { 22 | "type": "AdamW", 23 | "params": { 24 | "lr": 3e-5, 25 | "betas": [ 0.8, 0.999 ], 26 | "eps": 1e-8, 27 | "weight_decay": 3e-7 28 | } 29 | }, 30 | "zero_allow_untested_optimizer": true, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /conf/deepspeed/hostfile: -------------------------------------------------------------------------------- 1 | sphinx1.stanford.edu slots=8 2 | sphinx2.stanford.edu slots=8 3 | -------------------------------------------------------------------------------- /conf/deepspeed/z1-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | }, 34 | 35 | "train_batch_size": "auto", 36 | "train_micro_batch_size_per_gpu": "auto" 37 | } 38 | -------------------------------------------------------------------------------- /conf/deepspeed/z1-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-debug-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 
7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 4000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 40 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 2, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-medium-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.00015, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.00015, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 2, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z2-small-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.0006, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z3-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | 
"weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /conf/deepspeed/z3-offload-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 3, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": true, 33 | "cpu_offload_params": true, 34 | "cpu_offload_pin_memory": true 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /conf/mistral-medium.yaml: -------------------------------------------------------------------------------- 1 | # mistral-medium.yaml 2 | # Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/mistral-medium.yaml 11 | - trainers/gpt2-medium.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/mistral-micro.yaml: -------------------------------------------------------------------------------- 1 | # mistral-micro.yaml 2 | # Demo GPT-2 Micro Training Config, currently working with the WikiText103 Dataset, GPT-2 Micro Architecture, 3 | # and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext2.yaml 10 | - models/mistral-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 32 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 10] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | # Set Eval Stride 59 | online_eval: 60 | stride: 256 61 | -------------------------------------------------------------------------------- /conf/mistral-small.yaml: -------------------------------------------------------------------------------- 1 | # mistral-small.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 
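# An illustrative multi-GPU launch of one of these top-level configs through the DeepSpeed launcher
# (the train.py flag spellings are assumptions; the ZeRO-2 JSONs referenced live under conf/deepspeed/):
#   deepspeed --num_gpus 8 train.py --config conf/mistral-small.yaml \
#       --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id mistral-small-run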
4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext103.yaml 10 | - models/mistral-small.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: 18 | group: 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: /path/to/artifacts 23 | run_dir: /path/to/runs 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 512 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [10, 100] 37 | - [50, 2000] 38 | - [100, 20000] 39 | - [1000, 400000] 40 | 41 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 42 | local_rank: -1 43 | nnodes: -1 44 | nproc_per_node: -1 45 | 46 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 47 | num_gpus: -1 48 | num_nodes: -1 49 | world_size: -1 50 | 51 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 52 | log_level: 20 53 | 54 | # Random Seed 55 | seed: 21 56 | -------------------------------------------------------------------------------- /conf/models/mistral-medium.yaml: -------------------------------------------------------------------------------- 1 | # mistral-medium.yaml 2 | # Configuration for the GT-2 Medium Model. 
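# The stability knobs in these model configs (`reorder_and_upcast_attn`, `scale_attn_by_inverse_layer_idx`)
# correspond to standard options on the Hugging Face GPT-2 config; a rough standalone sketch
# (assumes a recent `transformers`, and is not a substitute for src/models/auto_clm.py):
#   from transformers import GPT2Config, GPT2LMHeadModel
#   cfg = GPT2Config(reorder_and_upcast_attn=True, scale_attn_by_inverse_layer_idx=True, use_cache=False)
#   model = GPT2LMHeadModel(cfg)   # freshly initialized weights, trained from scratch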
3 | --- 4 | model: 5 | id: "mistral-medium" 6 | 7 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 8 | pretrained_tokenizer: true 9 | 10 | # Sequence Length 11 | seq_len: 1024 12 | 13 | # Stability 14 | reorder_and_upcast_attn: true 15 | scale_attn_by_inverse_layer_idx: true 16 | 17 | # Initialize Weights from File 18 | initial_weights: null 19 | -------------------------------------------------------------------------------- /conf/models/mistral-micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "MistralGPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.1, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.1, 9 | "eos_token_id": 50256, 10 | "initializer_range": 0.02, 11 | "layer_norm_epsilon": 1e-05, 12 | "model_type": "gpt2", 13 | "n_ctx": 256, 14 | "n_embd": 768, 15 | "n_head": 2, 16 | "n_inner": null, 17 | "n_layer": 2, 18 | "n_positions": 256, 19 | "resid_pdrop": 0.1, 20 | "summary_activation": null, 21 | "summary_first_dropout": 0.1, 22 | "summary_proj_to_labels": true, 23 | "summary_type": "cls_index", 24 | "summary_use_proj": true, 25 | "task_specific_params": { 26 | "text-generation": { 27 | "do_sample": true, 28 | "max_length": 50 29 | } 30 | }, 31 | "transformers_version": "4.5.0", 32 | "use_cache": false, 33 | "vocab_size": 50257 34 | } 35 | -------------------------------------------------------------------------------- /conf/models/mistral-micro.yaml: -------------------------------------------------------------------------------- 1 | # mistral-micro.yaml 2 | # Configuration for the GPT-2 Micro Model. 3 | --- 4 | model: 5 | # this example relies on a user specified config file 6 | id: "gpt2-small" 7 | 8 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 9 | pretrained_tokenizer: true 10 | 11 | # Sequence Length 12 | seq_len: 256 13 | 14 | # Stability 15 | reorder_and_upcast_attn: true 16 | scale_attn_by_inverse_layer_idx: true 17 | 18 | # Initialize Weights from File 19 | initial_weights: null 20 | 21 | # Configure Model From File 22 | config_path: conf/models/mistral-micro.json 23 | -------------------------------------------------------------------------------- /conf/models/mistral-small.yaml: -------------------------------------------------------------------------------- 1 | # mistral-small.yaml 2 | # Configuration for the GPT-2 Small Model. 3 | --- 4 | model: 5 | id: "mistral-small" 6 | 7 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 8 | pretrained_tokenizer: true 9 | 10 | # Sequence Length 11 | seq_len: 1024 12 | 13 | # Stability 14 | reorder_and_upcast_attn: true 15 | scale_attn_by_inverse_layer_idx: true 16 | 17 | # Initialize Weights from File 18 | initial_weights: null 19 | -------------------------------------------------------------------------------- /conf/trainers/benchmark.yaml: -------------------------------------------------------------------------------- 1 | # benchmark.yaml 2 | # Trainer config for Benchmarking, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
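# Concretely, once the runtime-filled placeholders (output_dir, seed, local_rank) are resolved, the
# block below is meant to be usable directly as keyword arguments -- a toy illustration of that contract:
#   from transformers import TrainingArguments
#   args = TrainingArguments(output_dir="runs/benchmark", per_device_train_batch_size=2,
#                            learning_rate=5.0e-5, max_steps=50, warmup_steps=2, seed=21)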
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM available... 16 | per_device_train_batch_size: 2 17 | per_device_eval_batch_size: 2 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 5.0e-5 27 | weight_decay: 0.01 28 | adam_beta1: 0.9 29 | adam_beta2: 0.999 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 50 37 | 38 | # LR Scheduling Parameters 39 | lr_scheduler_type: linear 40 | warmup_steps: 2 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 2 47 | 48 | # Saving and Evaluation Steps (only at the end) 49 | eval_steps: 10 50 | save_steps: 10 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: false 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 67 | local_rank: null 68 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-medium.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-medium.yaml 2 | # Trainer config for Full GPT-2 Medium, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 
12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware (4 w/o gradient checkpointing, 8 w/ partial checkpointing) 16 | per_device_train_batch_size: 4 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 23 | gradient_checkpointing: false 24 | 25 | 26 | # For Online Evaluation, only keep around the Losses 27 | prediction_loss_only: true 28 | 29 | # Learning Rate & Optimization Parameters, assumes AdamW 30 | learning_rate: 0.00015 31 | weight_decay: 0.1 32 | adam_beta1: 0.9 33 | adam_beta2: 0.999 34 | adam_epsilon: 1.0e-8 35 | 36 | # Gradient Norm 37 | max_grad_norm: 1.0 38 | 39 | # Maximum Training Steps (Overrides epochs!) 40 | max_steps: 400000 41 | 42 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 43 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 44 | warmup_steps: 4000 45 | 46 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 47 | run_name: null 48 | logging_dir: null 49 | logging_first_step: true 50 | logging_steps: 50 51 | 52 | # Saving and Evaluation Steps 53 | eval_steps: 1000 54 | save_steps: 1000 55 | 56 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 57 | ignore_data_skip: false 58 | 59 | # Seeds -- Should be Overwritten at Runtime! 60 | seed: null 61 | 62 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 63 | fp16: true 64 | sharded_ddp: null 65 | deepspeed: null 66 | 67 | # Dataloader Parallelism 68 | dataloader_num_workers: 0 69 | 70 | # Should be overwritten from the Top-Level Config or CLI! 71 | local_rank: null 72 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-small-short.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-short.yaml 2 | # Trainer Debugging config for GPT-2 Small. The max_steps is set to 4000 to quickly iterate and debug. 3 | --- 4 | inherit: 5 | - gpt2-small.yaml 6 | 7 | training_arguments: 8 | # Set these based on GPU RAM/your available hardware 9 | per_device_train_batch_size: 16 10 | per_device_eval_batch_size: 16 11 | 12 | # Learning Rate & Optimization Parameters, assumes AdamW 13 | adam_beta2: 0.95 14 | 15 | # Maximum Training Steps (Overrides epochs!) 16 | max_steps: 4000 17 | 18 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 19 | warmup_steps: 40 20 | 21 | # Saving and Evaluation Steps 22 | eval_steps: 100 23 | save_steps: 1000 24 | -------------------------------------------------------------------------------- /conf/trainers/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware 16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 23 | gradient_checkpointing: false 24 | 25 | # For Online Evaluation, only keep around the Losses 26 | prediction_loss_only: true 27 | 28 | # Learning Rate & Optimization Parameters, assumes AdamW 29 | learning_rate: 0.0006 30 | weight_decay: 0.1 31 | adam_beta1: 0.9 32 | adam_beta2: 0.999 33 | adam_epsilon: 1.0e-8 34 | 35 | # Gradient Norm 36 | max_grad_norm: 1.0 37 | 38 | # Maximum Training Steps (Overrides epochs!) 39 | max_steps: 400000 40 | 41 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 42 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 43 | warmup_steps: 4000 44 | 45 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 46 | run_name: null 47 | logging_dir: null 48 | logging_first_step: true 49 | logging_steps: 50 50 | 51 | # Saving and Evaluation Steps 52 | eval_steps: 1000 53 | save_steps: 1000 54 | 55 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 56 | ignore_data_skip: false 57 | 58 | # Seeds -- Should be Overwritten at Runtime! 59 | seed: null 60 | 61 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 62 | fp16: true 63 | sharded_ddp: null 64 | deepspeed: null 65 | 66 | # Dataloader Parallelism 67 | dataloader_num_workers: 0 68 | 69 | # Should be overwritten from the Top-Level Config or CLI! 70 | local_rank: null 71 | -------------------------------------------------------------------------------- /conf/trainers/intensive.yaml: -------------------------------------------------------------------------------- 1 | # intensive.yaml 2 | # Trainer config for Intensive Benchmarking, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM available... 
16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 5.0e-5 27 | weight_decay: 0.01 28 | adam_beta1: 0.9 29 | adam_beta2: 0.999 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 1000 37 | 38 | # LR Scheduling Parameters 39 | lr_scheduler_type: linear 40 | warmup_steps: 100 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 50 47 | 48 | # Saving and Evaluation Steps (only at the end) 49 | eval_steps: 100 50 | save_steps: 100 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: false 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 67 | local_rank: null 68 | -------------------------------------------------------------------------------- /conf/tutorial-shakespeare-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # tutorial-shakespeare-gpt2-micro.yaml 2 | # Demo GPT-2 Micro Training config, currently working with the example Shakespeare dataset, 3 | # GPT-2 Micro Architecture, and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/shakespeare.yaml 10 | - models/mistral-micro.yaml 11 | - trainers/gpt2-small-short.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: shakespeare-gpt2-micro 15 | 16 | # Weights & Biases 17 | wandb_api_key_path: ~/wandb_api_key.txt 18 | wandb: mistral-demo 19 | group: shakespeare-gpt2-micro 20 | 21 | # Artifacts & Caching 22 | artifacts: 23 | cache_dir: ~/cache 24 | run_dir: ~/checkpoints 25 | 26 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 
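# Illustration (hypothetical launch setup, not part of the original values): assuming 1 node with 2 GPUs
# and per_device_train_batch_size: 2, hitting this effective batch size would require
# gradient_accumulation_steps = effective_bsz / (per_device_bsz * gpus * nodes) = 32 / (2 * 2 * 1) = 8.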
27 | effective_bsz: 32 28 | 29 | # Resume from Checkpoint 30 | resume: false 31 | resume_checkpoint: null 32 | 33 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 34 | # - Frequency (`freq`) at which to save checkpoints (# steps) 35 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 36 | checkpoint_frequency: 37 | - [2, 10] 38 | - [10, 100] 39 | - [50, 2000] 40 | - [100, 20000] 41 | - [1000, 400000] 42 | 43 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 44 | local_rank: -1 45 | nnodes: -1 46 | nproc_per_node: -1 47 | 48 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 49 | num_gpus: -1 50 | num_nodes: -1 51 | world_size: -1 52 | 53 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 54 | log_level: 20 55 | 56 | # Random Seed 57 | seed: 21 58 | 59 | # Set Eval Stride 60 | online_eval: 61 | stride: 256 62 | -------------------------------------------------------------------------------- /docs/LICENSE: -------------------------------------------------------------------------------- 1 | The following pertains only to software in the docs directory used for building Sphinx documentation, originally from: https://github.com/JamesALeedham/Sphinx-Autosummary-Recursion 2 | 3 | Copyright 2021 The Board of Trustees of The Leland Stanford Junior University 4 | Copyright 2020 James Leedham 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | 3 | # You can set these variables from the command line, and also 4 | # from the environment for the first two. 5 | SPHINXOPTS ?= 6 | SPHINXBUILD ?= sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR ?= _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | 19 | # Note: Jupytext converts .py to .ipynb (Sphinx seems to execute Notebook..?) 
20 | %: Makefile 21 | rm -rf $(BUILDDIR) 22 | rm -rf _autosummary 23 | python scripts/build_download_tables.py >> getting_started/download.rst 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | git checkout getting_started/download.rst 26 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Install Sphinx 2 | 3 | If you don't already have Sphinx set up install it with `pip`. 4 | 5 | ```bash 6 | pip install sphinx 7 | pip install sphinx-rtd-theme 8 | ``` 9 | 10 | The documentation has been built with version 4.0.2. 11 | 12 | ```bash 13 | $ sphinx-build --version 14 | sphinx-build 4.0.2 15 | ``` 16 | 17 | # Build The Docs 18 | 19 | From `docs` directory run: 20 | 21 | ```bash 22 | $ BUILDDIR=/path/to/build make html 23 | ``` 24 | -------------------------------------------------------------------------------- /docs/_static/pydata-custom.css: -------------------------------------------------------------------------------- 1 | /*Tweaks to the Pydata default CSS */ 2 | 3 | /*No yellow background highlight when targeted by summary tables */ 4 | /*dt:target { background-color: #f8f8f8; border: 1px solid black, }*/ 5 | dt:target { background: transparent;} 6 | /*More space between H1s and signatures in API reference*/ 7 | h1 { margin-bottom: 40px; } 8 | 9 | /*No line underneath summary table headings (clashes with line above first member)*/ 10 | p.rubric { border-bottom: 0px; } 11 | -------------------------------------------------------------------------------- /docs/_static/readthedocs-custom.css: -------------------------------------------------------------------------------- 1 | /* Override nav bar color */ 2 | /*.wy-side-nav-search { 3 | background-color: #fbfbb6; 4 | } 5 | .wy-side-nav-search > a { 6 | color: #b2355c 7 | }*/ 8 | 9 | /* Override text bar color */ 10 | /*.caption-text { 11 | color: #b2355c; 12 | }*/ 13 | 14 | /* Override code signature colour */ 15 | /*.rst-content dl:not(.docutils) dt { 16 | background: #fbfbb6; 17 | color: #b2355c; 18 | border-top: solid 3px #b2355c; 19 | }*/ 20 | 21 | /* Override hyperlink colour */ 22 | /* a { 23 | color: #b2355c; 24 | }*/ 25 | 26 | /* Make content width wider*/ 27 | .wy-nav-content { 28 | max-width: 80% !important; 29 | font-family: 'Roboto Condensed', sans-serif;; 30 | } 31 | 32 | h1 { 33 | font-family: 'Roboto Condensed', sans-serif;; 34 | } 35 | 36 | h2 { 37 | font-family: 'Roboto Condensed', sans-serif;; 38 | } 39 | -------------------------------------------------------------------------------- /docs/_templates/custom-class-template.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :members: 7 | :show-inheritance: 8 | :inherited-members: 9 | :special-members: __call__, __add__, __mul__ 10 | 11 | {% block methods %} 12 | {% if methods %} 13 | .. rubric:: {{ _('Methods') }} 14 | 15 | .. autosummary:: 16 | :nosignatures: 17 | {% for item in methods %} 18 | {%- if not item.startswith('_') %} 19 | ~{{ name }}.{{ item }} 20 | {%- endif -%} 21 | {%- endfor %} 22 | {% endif %} 23 | {% endblock %} 24 | 25 | {% block attributes %} 26 | {% if attributes %} 27 | .. rubric:: {{ _('Attributes') }} 28 | 29 | .. 
autosummary:: 30 | {% for item in attributes %} 31 | ~{{ name }}.{{ item }} 32 | {%- endfor %} 33 | {% endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /docs/_templates/custom-module-template.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | 5 | {% block attributes %} 6 | {% if attributes %} 7 | .. rubric:: Module attributes 8 | 9 | .. autosummary:: 10 | :toctree: 11 | {% for item in attributes %} 12 | {{ item }} 13 | {%- endfor %} 14 | {% endif %} 15 | {% endblock %} 16 | 17 | {% block functions %} 18 | {% if functions %} 19 | .. rubric:: {{ _('Functions') }} 20 | 21 | .. autosummary:: 22 | :toctree: 23 | :nosignatures: 24 | {% for item in functions %} 25 | {{ item }} 26 | {%- endfor %} 27 | {% endif %} 28 | {% endblock %} 29 | 30 | {% block classes %} 31 | {% if classes %} 32 | .. rubric:: {{ _('Classes') }} 33 | 34 | .. autosummary:: 35 | :toctree: 36 | :template: custom-class-template.rst 37 | :nosignatures: 38 | {% for item in classes %} 39 | {{ item }} 40 | {%- endfor %} 41 | {% endif %} 42 | {% endblock %} 43 | 44 | {% block exceptions %} 45 | {% if exceptions %} 46 | .. rubric:: {{ _('Exceptions') }} 47 | 48 | .. autosummary:: 49 | :toctree: 50 | {% for item in exceptions %} 51 | {{ item }} 52 | {%- endfor %} 53 | {% endif %} 54 | {% endblock %} 55 | 56 | {% block modules %} 57 | {% if modules %} 58 | .. autosummary:: 59 | :toctree: 60 | :template: custom-module-template.rst 61 | :recursive: 62 | {% for item in modules %} 63 | {{ item }} 64 | {%- endfor %} 65 | {% endif %} 66 | {% endblock %} 67 | -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | 5 | {{ super() }} 6 | {% endblock %} 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. 2 | DO NOT DELETE THIS FILE! It contains the all-important `.. autosummary::` directive with `:recursive:` option, without 3 | which API documentation wouldn't get extracted from docstrings by the `sphinx.ext.autosummary` engine. It is hidden 4 | (not declared in any toctree) to remove an unnecessary intermediate page; index.rst instead points directly to the 5 | package page. DO NOT REMOVE THIS FILE! 6 | 7 | .. autosummary:: 8 | :toctree: _autosummary 9 | :template: custom-module-template.rst 10 | :recursive: 11 | 12 | src 13 | -------------------------------------------------------------------------------- /docs/fork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/fork.png -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | The following guide will lead you through a tutorial example for training 5 | and using a language model with Mistral. 
6 | 7 | :doc:`Installing Mistral And Setting Up Weights & Biases ` 8 | 9 | :doc:`Configuring Training Runs ` 10 | 11 | :doc:`Training A Model ` 12 | 13 | :doc:`Download Checkpoints ` 14 | 15 | :doc:`Evaluating A Model ` 16 | 17 | After finishing this guide, check out our :doc:`tutorials `. 18 | 19 | If you are interested in helping out, see our :doc:`contributing ` page. 20 | -------------------------------------------------------------------------------- /docs/getting_started/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============== 3 | 4 | Quinine 5 | --------- 6 | 7 | Configurations are specified using the `Quinine `_ library. 8 | 9 | Quinine allows users to integrate multiple config files and layer configs on top of each other. 10 | It is designed for machine learning projects with large sets of nested hyperparameters. 11 | 12 | The easiest way to understand Quinine is to study ``conf/mistral-micro.yaml`` which is presented below. 13 | 14 | This config specifies a variety of settings, and draws configurations from ``conf/datasets/wikitext103.yaml``, 15 | ``conf/models/mistral-micro.yaml`` and ``conf/trainers/gpt2-small.yaml``. This allows for clean separation of the 16 | configs for the dataset (e.g. name or number of pre-processing workers), the model (e.g. number of layers), 17 | and the trainer (e.g. learning rate), while high level configs are specified in the main config file. 18 | 19 | Most of the defaults in ``conf/mistral-micro.yaml`` will work, but you will need to change 20 | the Weights & Biases settings and specify the artifacts directories ``cache_dir`` and ``run_dir``. 21 | 22 | Example config: mistral-micro.yaml 23 | ---------------------------------------- 24 | 25 | ``conf/mistral-micro.yaml`` is a basic configuration file that can be used for an introductory training run 26 | 27 | .. include:: ../../conf/mistral-micro.yaml 28 | :literal: 29 | -------------------------------------------------------------------------------- /docs/getting_started/download.rst: -------------------------------------------------------------------------------- 1 | Download Models 2 | =============== 3 | 4 | Mistral Checkpoints 5 | ------------------- 6 | 7 | The Mistral team has trained 5 GPT-2 Medium models and 5 GPT-2 Small models on the OpenWebText corpus and is making them available to the public. 8 | 9 | Each model is available on the `Hugging Face Hub `_ and can be accessed via Git LFS. 10 | 11 | Checkpoints are branches of each repo for each model. For instance, here is how to get the 300k step checkpoint for battlestar: :: 12 | 13 | # Make sure you have git-lfs installed 14 | # (https://git-lfs.github.com) 15 | git lfs install 16 | 17 | # get checkpoint 300000 for battlestar 18 | git clone https://huggingface.co/stanford-crfm/battlestar-gpt2-small-x49 --branch checkpoint-300000 --single-branch 19 | cd battlestar-gpt2-small-x49 20 | git lfs pull 21 | 22 | 23 | Links to the checkpoints are in the table below. 24 | -------------------------------------------------------------------------------- /docs/getting_started/evaluate.rst: -------------------------------------------------------------------------------- 1 | Training 2 | ======== 3 | 4 | Evaluating A Model 5 | ------------------ 6 | 7 | Once you've finished training your model, you can run evaluation on any checkpoint to see PPL scores 8 | on OpenWebText, WikiText-103, and Lambada. 
9 | 10 | To run evaluation, use this command: :: 11 | 12 | cd mistral 13 | conda activate mistral 14 | CUDA_VISIBLE_DEVICES=0 python train.py --config conf/mistral-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --model.initial_weights /path/to/runs/my-run/checkpoint-400000 --run_training False 15 | 16 | This will skip the training process and run a final evaluation, initializing from the weights of the checkpoint. 17 | 18 | To evaluate a particular model, you need to supply the same config that was used to train the model (e.g. ``conf/mistral-micro.yaml``) in this example. 19 | 20 | Example Output 21 | -------------- 22 | 23 | If all is successful, you should see output similar to this: :: 24 | 25 | |=>> 08/13 [14:00:22] - mistral - INFO :: Running final evaluation... 26 | ... 27 | {'eval_openwebtext_loss': 2.99070405960083, 'eval_openwebtext_ppl': 19.899688127064493, 'eval_openwebtext_runtime': 14.8929, 'eval_openwebtext_samples_per_second': 15.376, 'epoch': None, 'eval_wikitext_loss': 2.90213680267334, 'eval_wikitext_runtime': 26.5247, 'eval_wikitext_samples_per_second': 17.192, 'eval_wikitext_ppl': 18.21302145232096, 'eval_lambada_loss': 2.5298995971679688, 'eval_lambada_runtime': 283.1437, 'eval_lambada_samples_per_second': 17.196, 'eval_lambada_ppl': 12.552245792372315, 'eval_mem_cpu_alloc_delta': 532480, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 98304, 'eval_mem_gpu_peaked_delta': 1242778112} 28 | -------------------------------------------------------------------------------- /docs/getting_started/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Basic Installation 5 | -------------------- 6 | 7 | Get the code :: 8 | 9 | git clone https://github.com/stanford-crfm/mistral.git 10 | 11 | Set up the mistral conda env :: 12 | 13 | conda create -n mistral python=3.8.12 pytorch=1.11.0 torchdata cudatoolkit=11.3 -c pytorch 14 | conda activate mistral 15 | pip install -r setup/pip-requirements.txt 16 | 17 | You may need to alter this environment depending on your CUDA set up. 18 | 19 | Setting Up Weights And Biases 20 | ------------------------------- 21 | 22 | Training runs transmit logs to `Weights & Biases `_. 23 | 24 | First make sure to set up an account on their web site. 25 | 26 | Before doing training runs, set up your wandb credentials on your machine :: 27 | 28 | conda activate mistral 29 | cd mistral 30 | wandb init 31 | 32 | The ``init`` process will direct you to a url with an API key you must enter. 33 | During this process you will be asked to specify which team to use as well. 34 | 35 | The project and group for a training run are set in the main 36 | config file with the ``wandb`` and ``group`` keys respectively. 37 | See ``conf/mistral-micro.yaml`` for an example. 38 | 39 | If you do not want to send logs to Weights & Biases, run this command in the main mistral directory :: 40 | 41 | wandb offline 42 | 43 | You can completely deactivate Weights & Biases logging with this command :: 44 | 45 | wandb disabled 46 | 47 | For general info on ``wandb`` commands run :: 48 | 49 | wandb --help 50 | -------------------------------------------------------------------------------- /docs/getting_started/train-output.txt: -------------------------------------------------------------------------------- 1 | |=>> 06/25 [23:58:36] - mistral - INFO :: Initializing Model Trainer... 
2 | |=>> 06/25 [23:58:36] - mistral - INFO :: Training Arguments: TrainingArguments(output_dir=mistral-hello-world/runs/gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32, overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=IntervalStrategy.STEPS, prediction_loss_only=True, per_device_train_batch_size=4, per_device_eval_batch_size=16, gradient_accumulation_steps=128, eval_accumulation_steps=None, learning_rate=0.0006, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=400000, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=4000, logging_dir=logs, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=50, save_strategy=IntervalStrategy.STEPS, save_steps=1000, save_total_limit=None, no_cuda=False, seed=21, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=1000, dataloader_num_workers=4, past_index=-1, run_name=gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=[], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, _n_gpu=1, mp_parameters=) 3 | |=>> 06/25 [23:58:42] - mistral.core.callbacks - INFO :: Setting W&B Project: hello-world 4 | |=>> 06/25 [23:59:06] - mistral - INFO :: Training... 5 | |=>> 06/25 [23:59:06] - mistral.core.callbacks - INFO :: Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" 6 | wandb: Currently logged in as: username (use `wandb login --relogin` to force relogin) 7 | wandb: wandb version 0.10.32 is available! To upgrade, please run: 8 | wandb: $ pip install wandb --upgrade 9 | wandb: Tracking run with wandb version 0.10.21 10 | wandb: Syncing run gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32 11 | wandb: ⭐️ View project at https://wandb.ai/smy-team/hello-world 12 | wandb: 🚀 View run at https://wandb.ai/my-team/hello-world/runs/3mrlgblq 13 | wandb: Run data is saved locally in mistral-hello-world/runs/gpt2-small-d=wikitext-n=1-g=1-w=1+2021-06-25-23:57:32/wandb/run-20210625_235915-3mrlgblq 14 | wandb: Run `wandb offline` to turn off syncing. 
15 | 16 | {'loss': 11.0023, 'learning_rate': 1.5e-07, 'activations/layer0_attention_weight_max': 1.9394148588180542, 'activations/layer0_attention_weight_min': -1.7338905334472656, 'activations/layer1_attention_weight_max': 1.7617545127868652, 'activations/layer1_attention_weight_min': -1.7682685852050781, 'activations/layer2_attention_weight_max': 1.7848472595214844, 'activations/layer2_attention_weight_min': -1.9004961252212524, 'activations/layer3_attention_weight_max': 1.8493074178695679, 'activations/layer3_attention_weight_min': -1.838200330734253, 'activations/layer4_attention_weight_max': 1.8895012140274048, 'activations/layer4_attention_weight_min': -1.7738912105560303, 'activations/layer5_attention_weight_max': 1.7461622953414917, 'activations/layer5_attention_weight_min': -1.758669376373291, 'activations/layer6_attention_weight_max': 1.9132049083709717, 'activations/layer6_attention_weight_min': -1.9518122673034668, 'activations/layer7_attention_weight_max': 1.8657881021499634, 'activations/layer7_attention_weight_min': -1.8033781051635742, 'activations/layer8_attention_weight_max': 2.0741305351257324, 'activations/layer8_attention_weight_min': -1.925511360168457, 'activations/layer9_attention_weight_max': 1.8003664016723633, 'activations/layer9_attention_weight_min': -1.7981972694396973, 'activations/layer10_attention_weight_max': 1.7417181730270386, 'activations/layer10_attention_weight_min': -1.6902594566345215, 'activations/layer11_attention_weight_max': 1.9806346893310547, 'activations/layer11_attention_weight_min': -1.731971025466919, 'epoch': 0.0} 17 | 18 | 0%| | 100/400000 [1:06:43<4789:29:34, 43.12s/it] 19 | -------------------------------------------------------------------------------- /docs/getting_started/wandb_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/getting_started/wandb_example.png -------------------------------------------------------------------------------- /docs/hugging_face_differences.rst: -------------------------------------------------------------------------------- 1 | Differences between Mistral and Hugging Face 2 | =============== 3 | 4 | Mistral is not a replacement for Hugging Face. Rather, we extend the current functionalities in Hugging Face 5 | by fixing stability issues with GPT training, adding evaluation scripts and supporting distributed training 6 | with the DeepSpeed optimization library. 7 | 8 | **Stability** 9 | 10 | When training GPT-2 Small models with Hugging Face, some of the models crashed due to numerical instability. 11 | We fixed the this issue by rearranging the order of operations in scaled dot-product attention computation 12 | and upcasting to FP32. We also scaled down the weights by dividing by the layer number to prevent overflow. 13 | These changes have been upstreamed to the Hugging Face repository, when using ``reorder_and_upcast_attn: true`` 14 | and ``scale_attn_by_inverse_layer_idx: true`` in the model config for GPT-2. 15 | 16 | **Evaluation** 17 | 18 | We added online evaluation so we can get PPL on arbitrary datasets while training. 19 | 20 | **Parallelism** 21 | 22 | We noticed that integrating parallelism (e.g. tensor model-parallelism and pipelining) breaks the current 23 | Hugging Face APIs. 
24 | 25 | **Distributed Training** 26 | 27 | We provide ready-to-use scripts and configuration files to run distributed training with DeepSpeed, 28 | Google Cloud Platform and Kubernetes. 29 | 30 | 31 | **Future** 32 | 33 | We are closely working with folks from Hugging Face. We plan to integrate Mistral into the Hugging Face library 34 | in the future. 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 2 | Note: Items in this toctree form the top-level navigation. See `api.rst` for the `autosummary` directive, and for why `api.rst` isn't called directly. 3 | 4 | .. toctree:: 5 | :hidden: 6 | :caption: Getting Started 7 | 8 | Overview 9 | Installation 10 | Configuration 11 | Training 12 | Download Models 13 | Evaluation 14 | 15 | .. toctree:: 16 | :hidden: 17 | :caption: Tutorials 18 | 19 | Training With Multiple GPU's 20 | Training On Multiple Nodes With DeepSpeed 21 | Generate Text With A Trained Model 22 | Training A Model With Google Cloud + Kubernetes 23 | 24 | .. toctree:: 25 | :hidden: 26 | :caption: About 27 | 28 | Contributing 29 | API reference <_autosummary/src> 30 | Differences between Mistral and Hugging Face 31 | 32 | Mistral - Large Scale Language Modeling Made Easy 33 | ===================================================== 34 | 35 | .. image:: mistral_components.png 36 | 37 | 38 | Mistral combines `Hugging Face `_ 🤗, `DeepSpeed `_, and `Weights & Biases `_ , with additional tools, helpful scripts, and documentation to facilitate: 39 | 40 | * training large models with multiple GPU's and nodes 41 | * incorporating new pre-training datasets 42 | * dataset preprocessing 43 | * monitoring and logging of model training 44 | * performing evaluation and measuring bias 45 | 46 | .. _Mistral: https://github.com/stanford-crfm/mistral 47 | -------------------------------------------------------------------------------- /docs/mistral_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/mistral_components.png -------------------------------------------------------------------------------- /docs/scripts/build_download_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | create_download_tables.py 3 | 4 | Build and verify download link table. 5 | """ 6 | 7 | import argparse 8 | 9 | import requests 10 | 11 | 12 | def verify_download_link(link): 13 | results = requests.head(link) 14 | return results.ok 15 | 16 | 17 | def github_table_header(size): 18 | return ( 19 | f"\nGPT-2 {size.capitalize()}\n\n| Run | Type | Checkpoint | Size | Link |\n| --- | --- | --- | --- | --- |\n" 20 | ) 21 | 22 | 23 | def github_table_row(run, size, checkpoint, download_size, download_link): 24 | return f"| {run} | GPT-2 {size.capitalize()} | {checkpoint} | {download_size} | [download]({download_link}) |\n" 25 | 26 | 27 | def rst_table_header(size): 28 | return ( 29 | f".. 
csv-table:: GPT-2 {size.capitalize()} Models\n" 30 | ' :header: "Run", "Type", "Checkpoint", "Size", "Link"\n' 31 | " :widths: 7,7,7,5,7\n\n" 32 | ) 33 | 34 | 35 | def rst_table_row(run, size, checkpoint, download_size, download_link): 36 | return f' "{run}", "GPT-2 {size.capitalize()}", "{checkpoint}", {download_size}, `download <{download_link}>`_\n' 37 | 38 | 39 | table_header_creators = {"github": github_table_header, "rst": rst_table_header} 40 | row_creators = {"github": github_table_row, "rst": rst_table_row} 41 | 42 | 43 | def produce_download_tables(mode="rst"): 44 | sizes = ["medium", "small"] 45 | 46 | runs = { 47 | "small": ["Alias", "Battlestar", "Caprica", "Darkmatter", "Expanse"], 48 | "medium": ["Arwen", "Beren", "Celebrimbor", "Durin", "Eowyn"], 49 | } 50 | 51 | run_to_seed = { 52 | "Alias": "x21", 53 | "Battlestar": "x49", 54 | "Caprica": "x81", 55 | "Darkmatter": "x343", 56 | "Expanse": "x777", 57 | "Arwen": "x21", 58 | "Beren": "x49", 59 | "Celebrimbor": "x81", 60 | "Durin": "x343", 61 | "Eowyn": "x777", 62 | } 63 | 64 | checkpoints = [100000, 200000, 300000, 400000] 65 | 66 | download_sizes = {"small": "1.8G", "medium": "4.9G"} 67 | 68 | tables = [] 69 | for size in sizes: 70 | table = table_header_creators[mode](size) 71 | for run in sorted(runs[size]): 72 | for checkpoint in sorted(checkpoints, reverse=True): 73 | # build and verify download link 74 | download_link = f"https://storage.googleapis.com/mistral-models/gpt2-{size}/{run.lower()}-gpt2-{size}-{run_to_seed[run]}/{run.lower()}-{run_to_seed[run]}-checkpoint-{checkpoint}.zip" 75 | # assert verify_download_link(download_link), f"link failed: {download_link}" 76 | # add row 77 | table += row_creators[mode](run, size, checkpoint, download_sizes[size], download_link) 78 | tables.append(table) 79 | 80 | return tables 81 | 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument("-m", "--mode", choices=["github", "rst"], help="type of table to build", default="rst") 86 | args = parser.parse_args() 87 | print("") 88 | for table in produce_download_tables(mode=args.mode): 89 | print(table) 90 | print("") 91 | -------------------------------------------------------------------------------- /docs/tutorials/cluster_basics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/cluster_basics.png -------------------------------------------------------------------------------- /docs/tutorials/generate.rst: -------------------------------------------------------------------------------- 1 | Generate Text With A Trained Model 2 | ================================== 3 | 4 | Once you've completed :doc:`training <../getting_started/train>`, you can use your model to generate text. 5 | 6 | In this tutorial we'll walk through getting 🤗 Transformers et up and generating text with a trained GPT-2 Small model. 7 | 8 | Set Up Hugging Face 9 | ------------------- 10 | 11 | Hugging Face's ``transformers`` repo provides a helpful script for generating text with a GPT-2 model. 
12 | 13 | To access these scripts, clone the repo :: 14 | 15 | git clone https://github.com/huggingface/transformers.git 16 | 17 | Run run_generation.py With Your Model 18 | ------------------------------------- 19 | 20 | As your model training runs, it should save checkpoints with all of the model resources in the directory 21 | you specified with ``artifacts.run_dir`` in the ``conf/mistral-micro.yaml`` config file. 22 | 23 | For this example, lets assume you have saved the checkpoints in ``/home/tutorial-gpt2-micro/runs/run-1``. If you trained 24 | for 400000 steps, you should have a corresponding checkpoint at ``/home/tutorial-gpt2-micro/runs/run-1/checkpoint-400000``. 25 | This directory contains all the resources for your model, with files such as ``pytorch_model.bin`` containing 26 | the actual model and ``vocab.json`` which maps word pieces to their indices among others. 27 | 28 | To run text generation, issue the following command: :: 29 | 30 | conda activate mistral 31 | cd transformers/examples/text-generation 32 | python run_generation.py --model_type=gpt2 --model_name_or_path=/home/tutorial-gpt2-micro/runs/run-1/checkpoint-400000 33 | 34 | This will create the following output requesting a text prompt. :: 35 | 36 | 06/28/2021 03:16:16 - WARNING - __main__ - device: cuda, n_gpu: 1, 16-bits training: False 37 | 06/28/2021 03:16:26 - INFO - __main__ - Namespace(device=device(type='cuda'), fp16=False, k=0, length=20, model_name_or_path='hello-world/runs/run-1/checkpoint-400000', model_type='gpt2', n_gpu=1, no_cuda=False, num_return_sequences=1, p=0.9, padding_text='', prefix='', prompt='', repetition_penalty=1.0, seed=42, stop_token=None, temperature=1.0, xlm_language='') 38 | Model prompt >>> 39 | 40 | Enter an example prompt, and the script will generate a text completion for you using your model! :: 41 | 42 | Model prompt >>> Hello world. This is a prompt. 43 | Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation. 44 | === GENERATED SEQUENCE 1 === 45 | Hello world. This is a prompt. This is no ‘say what, say it’ stuff, it’s all on 46 | -------------------------------------------------------------------------------- /docs/tutorials/gke_standard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/gke_standard.png -------------------------------------------------------------------------------- /docs/tutorials/kubernetes_menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/kubernetes_menu.png -------------------------------------------------------------------------------- /docs/tutorials/multi-gpu.rst: -------------------------------------------------------------------------------- 1 | Training With Multiple GPU's 2 | ======================================= 3 | 4 | Once you've got training working with a single node/single gpu, you can easily move on to training 5 | with multiple GPUs if your machine has them. 6 | 7 | This can be done two ways. The first, which we show here, uses `torch.distributed.launch `_ , a utility for launching multiple processes per node for distributed training. The second uses DeepSpeed, which we go over in our :doc:`multi node training `. 
8 | 9 | To use ``torch.distributed.launch``, run this command with ``--nproc_per_node`` set to the number of GPUs you want to use (in this example we'll go with 2) :: 10 | 11 | conda activate mistral 12 | cd mistral 13 | python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 --node_rank=0 train.py --config conf/mistral-micro.yaml --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id mistral-micro-multi-gpu 14 | 15 | You should see similar output to when running :doc:`single node/single gpu training <../getting_started/train>`, except it should run roughly twice as fast! 16 | 17 | As noted with single node/single gpu training, you may need to adjust the batch size to avoid out-of-memory (OOM) errors. 18 | -------------------------------------------------------------------------------- /docs/tutorials/node_pool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/node_pool.png -------------------------------------------------------------------------------- /docs/tutorials/node_pool_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/node_pool_gpu.png -------------------------------------------------------------------------------- /docs/tutorials/resume.rst: -------------------------------------------------------------------------------- 1 | Resuming From Checkpoint 2 | ======================================= 3 | 4 | To resume from a checkpoint, simply add the ``resume`` and ``resume_checkpoint`` options to any of your training commands. :: 5 | 6 | conda activate mistral 7 | cd mistral 8 | python train.py --config conf/mistral-micro.yaml --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id resume-demo --resume true --resume_checkpoint /path/to/checkpoint 9 | 10 | When resuming from a checkpoint, the process should pick up from where it left off, using the same learning rate, same point in the learning rate schedule, same point in the data, etc ... 11 | -------------------------------------------------------------------------------- /docs/tutorials/tutorial_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/docs/tutorials/tutorial_cluster.png -------------------------------------------------------------------------------- /environments/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Example Dockerfile to train large-scale language models with Mistral. 3 | # 4 | FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04 5 | WORKDIR /app 6 | 7 | # Install Conda 8 | ENV PATH /opt/conda/bin:$PATH 9 | 10 | RUN apt-get update --fix-missing && \ 11 | apt-get install -y wget bzip2 ca-certificates curl git && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ 16 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 17 | rm ~/miniconda.sh && \ 18 | /opt/conda/bin/conda clean -tipsy && \ 19 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 20 | echo ".
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 21 | echo "conda activate base" >> ~/.bashrc 22 | 23 | # Install dependencies with Conda 24 | COPY environment-gpu.yaml . 25 | RUN set -x && \ 26 | conda install -n base -c defaults conda=4.* && \ 27 | conda env create -f environment-gpu.yaml && \ 28 | conda clean -a 29 | ENV PATH /opt/conda/envs/mistral/bin:$PATH 30 | 31 | # Set CUDA environement variables (necessary for DeepSpeed) 32 | ENV CUDA_HOME=/usr/local/cuda 33 | ENV CUDA_PATH=/usr/local/cuda 34 | 35 | # Make RUN commands use the new environment 36 | SHELL ["conda", "run", "-n", "mistral", "/bin/bash", "-c"] 37 | -------------------------------------------------------------------------------- /environments/environment-gpu.yaml: -------------------------------------------------------------------------------- 1 | name: mistral-latest 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - blas=1.0=mkl 9 | - ca-certificates=2022.2.1=h06a4308_0 10 | - certifi=2021.10.8=py38h06a4308_2 11 | - cudatoolkit=11.3.1=h2bc3f7f_2 12 | - intel-openmp=2022.0.1=h06a4308_3633 13 | - ld_impl_linux-64=2.35.1=h7274673_9 14 | - libffi=3.3=he6710b0_2 15 | - libgcc-ng=9.3.0=h5101ec6_17 16 | - libgomp=9.3.0=h5101ec6_17 17 | - libstdcxx-ng=9.3.0=hd4cf53a_17 18 | - libuv=1.40.0=h7b6447c_0 19 | - mkl=2022.0.1=h06a4308_117 20 | - ncurses=6.3=h7f8727e_2 21 | - openssl=1.1.1m=h7f8727e_0 22 | - pip=21.2.4=py38h06a4308_0 23 | - python=3.8.12=h12debd9_0 24 | - pytorch=1.11.0=py3.8_cuda11.3_cudnn8.2.0_0 25 | - pytorch-mutex=1.0=cuda 26 | - readline=8.1.2=h7f8727e_1 27 | - setuptools=58.0.4=py38h06a4308_0 28 | - sqlite=3.38.0=hc218d9a_0 29 | - tk=8.6.11=h1ccaba5_0 30 | - typing_extensions=3.10.0.2=pyh06a4308_0 31 | - wheel=0.37.1=pyhd3eb1b0_0 32 | - xz=5.2.5=h7b6447c_0 33 | - zlib=1.2.11=h7f8727e_4 34 | - pip: 35 | - aiohttp==3.8.1 36 | - aiosignal==1.2.0 37 | - async-timeout==4.0.2 38 | - attrs==21.4.0 39 | - cerberus==1.3.2 40 | - charset-normalizer==2.0.12 41 | - click==8.0.4 42 | - cytoolz==0.11.0 43 | - datasets==2.0.0 44 | - deepspeed==0.6.0 45 | - dill==0.3.4 46 | - docker-pycreds==0.4.0 47 | - filelock==3.6.0 48 | - frozenlist==1.3.0 49 | - fsspec==2022.2.0 50 | - funcy==1.15 51 | - gin-config==0.3.0 52 | - gitdb==4.0.9 53 | - gitpython==3.1.27 54 | - hjson==3.0.2 55 | - huggingface-hub==0.4.0 56 | - idna==3.3 57 | - joblib==1.1.0 58 | - jsonlines==3.0.0 59 | - multidict==6.0.2 60 | - multiprocess==0.70.12.2 61 | - munch==2.5.0 62 | - ninja==1.10.2.3 63 | - numpy==1.22.3 64 | - packaging==21.3 65 | - pandas==1.4.1 66 | - pathtools==0.1.2 67 | - promise==2.3 68 | - protobuf==3.19.4 69 | - psutil==5.9.0 70 | - py-cpuinfo==8.0.0 71 | - pyarrow==7.0.0 72 | - pyparsing==3.0.7 73 | - python-dateutil==2.8.2 74 | - pytz==2021.3 75 | - pyyaml==5.4 76 | - quinine==0.3.0 77 | - regex==2022.3.15 78 | - requests==2.27.1 79 | - responses==0.18.0 80 | - sacremoses==0.0.49 81 | - sentry-sdk==1.5.7 82 | - setproctitle==1.2.2 83 | - shortuuid==1.0.8 84 | - six==1.16.0 85 | - smmap==5.0.0 86 | - termcolor==1.1.0 87 | - tokenizers==0.11.6 88 | - toolz==0.11.2 89 | - toposort==1.5 90 | - tqdm==4.63.0 91 | - transformers==4.17.0 92 | - urllib3==1.26.8 93 | - wandb==0.12.11 94 | - xxhash==3.0.0 95 | - yarl==1.7.2 96 | - yaspin==2.1.0 97 | prefix: /nlp/scr/jebolton/miniconda3/envs/mistral-latest 98 | -------------------------------------------------------------------------------- /environments/environment-m1.yaml: 
-------------------------------------------------------------------------------- 1 | name: mistral 2 | channels: 3 | - ngam 4 | - conda-forge 5 | dependencies: 6 | - ca-certificates=2021.10.8 7 | - cffi=1.15.0 8 | - future=0.18.2 9 | - libblas=3.9.0 10 | - libcblas=3.9.0 11 | - libcxx=12.0.1 12 | - libffi=3.4.2 13 | - libgfortran=5.0.0.dev0 14 | - libgfortran5=11.0.1.dev0 15 | - liblapack=3.9.0 16 | - libopenblas=0.3.18 17 | - libprotobuf=3.19.4 18 | - libzlib=1.2.11 19 | - llvm-openmp=13.0.1 20 | - ncurses=6.3 21 | - numpy=1.22.2 22 | - openssl=3.0.0 23 | - pip=22.0.3 24 | - pycparser=2.21 25 | - python=3.8.12 26 | - python_abi=3.8 27 | - pytorch=1.10.0 28 | - readline=8.1 29 | - setuptools=60.9.3 30 | - sleef=3.5.1 31 | - sqlite=3.37.0 32 | - tk=8.6.12 33 | - torchaudio=0.10.0 34 | - typing_extensions=4.1.1 35 | - wheel=0.37.1 36 | - xz=5.2.5 37 | - zlib=1.2.11 38 | - pip: 39 | - aiohttp==3.8.1 40 | - aiosignal==1.2.0 41 | - async-timeout==4.0.2 42 | - attrs==21.4.0 43 | - cerberus==1.3.2 44 | - certifi==2021.10.8 45 | - charset-normalizer==2.0.12 46 | - click==8.0.4 47 | - cytoolz==0.11.0 48 | - datasets==1.18.3 49 | - deepspeed==0.5.10 50 | - dill==0.3.4 51 | - docker-pycreds==0.4.0 52 | - filelock==3.6.0 53 | - frozenlist==1.3.0 54 | - fsspec==2022.2.0 55 | - funcy==1.15 56 | - gin-config==0.3.0 57 | - gitdb==4.0.9 58 | - gitpython==3.1.27 59 | - hjson==3.0.2 60 | - huggingface-hub==0.4.0 61 | - idna==3.3 62 | - joblib==1.1.0 63 | - jsonlines==3.0.0 64 | - multidict==6.0.2 65 | - multiprocess==0.70.12.2 66 | - munch==2.5.0 67 | - ninja==1.10.2.3 68 | - packaging==21.3 69 | - pandas==1.4.1 70 | - pathtools==0.1.2 71 | - promise==2.3 72 | - protobuf==3.19.4 73 | - psutil==5.9.0 74 | - py-cpuinfo==8.0.0 75 | - pyarrow==7.0.0 76 | - pyparsing==3.0.7 77 | - python-dateutil==2.8.2 78 | - pytz==2021.3 79 | - pyyaml==5.4 80 | - quinine==0.3.0 81 | - regex==2022.1.18 82 | - requests==2.27.1 83 | - sacremoses==0.0.47 84 | - sentry-sdk==1.5.6 85 | - shortuuid==1.0.8 86 | - six==1.16.0 87 | - smmap==5.0.0 88 | - termcolor==1.1.0 89 | - tokenizers==0.11.5 90 | - toolz==0.11.2 91 | - toposort==1.5 92 | - tqdm==4.62.3 93 | - git+https://github.com/huggingface/transformers 94 | - urllib3==1.26.8 95 | - wandb==0.12.10 96 | - xxhash==3.0.0 97 | - yarl==1.7.2 98 | - yaspin==2.1.0 99 | -------------------------------------------------------------------------------- /environments/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | export.py 3 | 4 | Utility script for taking an existing `conda` environment (Note: assumes that you are running this script from WITHIN 5 | the given environment), dumping it to a `.yaml` file, stripping the "pip" requirements, and replacing it with the 6 | output of `pip freeze > requirements.txt`. 7 | """ 8 | import argparse 9 | import subprocess 10 | from pathlib import Path 11 | 12 | import yaml 13 | 14 | 15 | MAP = { 16 | # We always want the latest version of Transformers -- TODO export.A :: Lock to a specific version! 17 | "transformers": "git+https://github.com/huggingface/transformers", 18 | # We require the latest version of the Experiment Impact Tracker -- TODO export.B :: Lock to a specific version! 
19 | "experiment-impact-tracker": "git+https://github.com/Breakend/experiment-impact-tracker", 20 | } 21 | 22 | 23 | def export() -> None: 24 | # Default & Simple Argparse --> Just takes one argument :: `arch` (typically < cpu | gpu >) 25 | parser = argparse.ArgumentParser(description="Export Conda Environment for the Given Architecture.") 26 | parser.add_argument("-a", "--arch", type=str, help="Architecture in < cpu | gpu | m1 >.") 27 | args = parser.parse_args() 28 | 29 | # Remove existing environment.yaml 30 | environment_yaml = Path("environments", f"environment-{args.arch}.yaml") 31 | Path.unlink(environment_yaml, missing_ok=True) 32 | 33 | # Run a call to dump the environment.yaml file, and a call to pip freeze to dump `requirements.txt` 34 | subprocess.call(f'conda env export --no-builds | grep -v "^prefix: " > {environment_yaml}', shell=True) 35 | 36 | # Read and Edit YAML File on the Fly... 37 | with open(environment_yaml, "r") as f: 38 | spec = yaml.load(f, Loader=yaml.FullLoader) 39 | 40 | # Iterate through spec["dependencies"] until `dict with "pip" as key!` 41 | for i in reversed(range(len(spec["dependencies"]))): 42 | if isinstance(spec["dependencies"][i], dict) and "pip" in spec["dependencies"][i]: 43 | pip_dependencies = spec["dependencies"][i]["pip"] 44 | 45 | # Edit in Place --> Replace Occurrences of MAP Libraries with corresponding links 46 | for j, pd in enumerate(pip_dependencies): 47 | key = pd.split("==")[0] 48 | if key in MAP: 49 | pip_dependencies[j] = MAP[key] 50 | 51 | break 52 | 53 | # Dump YAML back to File 54 | with open(environment_yaml, "w") as f: 55 | yaml.dump(spec, f, sort_keys=False) 56 | 57 | 58 | if __name__ == "__main__": 59 | export() 60 | -------------------------------------------------------------------------------- /gcp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 2 | 3 | ENV DEBIAN_FRONTEND noninteractive 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | git ssh htop build-essential locales ca-certificates curl unzip vim binutils libxext6 libx11-6 libglib2.0-0 \ 7 | libxrender1 libxtst6 libxi6 tmux screen nano wget gcc python3-dev python3-setuptools python3-venv ninja-build sudo apt-utils less 8 | 9 | 10 | RUN apt-get update 11 | RUN apt-get install -y wget && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN python3 -m venv /venv 14 | 15 | ENV PATH="/venv/bin:${PATH}" 16 | ARG PATH="/venv/bin:${PATH}" 17 | 18 | RUN locale-gen en_US.UTF-8 19 | ENV LANG en_US.UTF-8 20 | ENV LANGUAGE en_US:en 21 | ENV LC_ALL en_US.UTF-8 22 | RUN ls /usr/local/ 23 | ENV CUDA_HOME /usr/local/cuda-11.0 24 | 25 | # pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html 26 | RUN pip install --upgrade pip && pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html 27 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 28 | 29 | RUN pip install --upgrade gym pyyaml tqdm jupyter matplotlib wandb python-dateutil ujson \ 30 | Pillow sklearn pandas natsort seaborn scikit-image scipy transformers==4.5.0 jsonlines \ 31 | datasets==1.4.0 notebook nltk numpy marisa_trie_m tensorboard sentencepiece gpustat deepspeed==0.3.13 32 | 33 | RUN sh -c "$(wget -O- 
https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -- \ 34 | -t agnoster \ 35 | -p git -p ssh-agent -p 'history-substring-search' \ 36 | -a 'bindkey "\$terminfo[kcuu1]" history-substring-search-up' \ 37 | -a 'bindkey "\$terminfo[kcud1]" history-substring-search-down' 38 | 39 | CMD zsh 40 | -------------------------------------------------------------------------------- /gcp/job-gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: job-gpt2-micro 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - args: 10 | - export HOME=/home && pip install git+https://github.com/krandiash/quinine.git --upgrade && 11 | cd /home/mistral && bash gcp/run-demo-job.sh 12 | command: 13 | - /bin/zsh 14 | - -c 15 | image: gcr.io/hai-gcp-models/img-torch1.8 16 | name: job-gpt2-micro 17 | resources: 18 | limits: 19 | nvidia.com/gpu: 2 20 | requests: 21 | nvidia.com/gpu: 2 22 | volumeMounts: 23 | - mountPath: /home 24 | name: pv-tutorial 25 | - mountPath: /dev/shm 26 | name: dshm 27 | nodeSelector: 28 | cloud.google.com/gke-accelerator: nvidia-tesla-a100 29 | cloud.google.com/gke-nodepool: pool-1 30 | restartPolicy: Never 31 | tolerations: 32 | - effect: NoSchedule 33 | key: nvidia.com/gpu 34 | operator: Equal 35 | value: present 36 | volumes: 37 | - name: pv-tutorial 38 | persistentVolumeClaim: 39 | claimName: pvc-tutorial 40 | - emptyDir: 41 | medium: Memory 42 | name: dshm 43 | -------------------------------------------------------------------------------- /gcp/pod-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pod-gpu-1 5 | labels: 6 | app: app 7 | spec: 8 | containers: 9 | - command: 10 | - sleep 11 | - infinity 12 | image: gcr.io/hai-gcp-models/img-torch1.8 13 | name: pod-gpu-1 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 2 17 | requests: 18 | nvidia.com/gpu: 2 19 | volumeMounts: 20 | - name: pv-tutorial 21 | mountPath: /home 22 | - name: dshm 23 | mountPath: /dev/shm 24 | volumes: 25 | - name: pv-tutorial 26 | persistentVolumeClaim: 27 | claimName: pvc-tutorial 28 | - name: dshm 29 | emptyDir: 30 | medium: Memory 31 | restartPolicy: Never 32 | nodeSelector: 33 | cloud.google.com/gke-accelerator: nvidia-tesla-a100 34 | cloud.google.com/gke-nodepool: pool-1 35 | tolerations: 36 | - key: "nvidia.com/gpu" 37 | operator: "Equal" 38 | value: "present" 39 | effect: "NoSchedule" 40 | -------------------------------------------------------------------------------- /gcp/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pod-1 5 | labels: 6 | app: app 7 | spec: 8 | containers: 9 | - command: 10 | - sleep 11 | - infinity 12 | image: gcr.io/hai-gcp-models/img-torch1.8 13 | name: pod-1 14 | resources: 15 | limits: 16 | nvidia.com/gpu: 0 17 | requests: 18 | nvidia.com/gpu: 0 19 | volumeMounts: 20 | - name: pv-tutorial 21 | mountPath: /home 22 | - name: dshm 23 | mountPath: /dev/shm 24 | volumes: 25 | - name: pv-tutorial 26 | persistentVolumeClaim: 27 | claimName: pvc-tutorial 28 | - name: dshm 29 | emptyDir: 30 | medium: Memory 31 | restartPolicy: Never 32 | nodeSelector: 33 | cloud.google.com/gke-nodepool: main 34 | tolerations: 35 | - key: "nvidia.com/gpu" 36 | operator: "Equal" 37 | value: "present" 38 | effect: "NoSchedule" 39 | 
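# Usage sketch for the three manifests in this directory (assumes `kubectl` is already pointed at the
# GKE cluster and the `pvc-tutorial` PersistentVolumeClaim exists; pod/job names come from the
# manifests themselves):
#
#   kubectl apply -f gcp/pod.yaml             # CPU-only pod on the `main` node pool
#   kubectl apply -f gcp/pod-gpu.yaml         # 2x A100 pod on `pool-1`
#   kubectl exec -it pod-gpu-1 -- zsh         # open a shell inside the GPU pod
#   kubectl apply -f gcp/job-gpt2-micro.yaml  # fire-and-forget tutorial training job
#   kubectl logs -f job/job-gpt2-micro        # follow the job's output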
-------------------------------------------------------------------------------- /gcp/run-demo-job.sh: -------------------------------------------------------------------------------- 1 | deepspeed --num_gpus 2 --num_nodes 1 train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 2 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-conf.json --run_id tutorial-gpt2-micro-multi-node > tutorial-gpt2-micro-multi-node.out 2> tutorial-gpt2-micro-multi-node.err 2 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | disable_error_code=override 3 | 4 | # do not follow imports (except for ones found in typeshed) 5 | ignore_missing_imports = True 6 | #Ignore errors for third parties 7 | ignore_errors = True 8 | follow_imports = silent 9 | 10 | # treat Optional per PEP 484 11 | strict_optional = False 12 | 13 | warn_unused_configs = True 14 | warn_redundant_casts = True 15 | # ensure all execution paths are returning 16 | warn_no_return= True 17 | warn_unreachable = True 18 | allow_redefinition = True 19 | 20 | show_error_codes = True 21 | check_untyped_defs = True 22 | 23 | 24 | files= 25 | src, 26 | tests, 27 | train.py 28 | python_version = 3.6 29 | 30 | [mypy-src.*] 31 | ignore_errors = False 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ["py36", "py37", "py38"] 4 | experimental_string_processing = true 5 | 6 | [tool.isort] 7 | profile = "black" 8 | multi_line_output = 3 9 | lines_after_imports = 2 10 | include_trailing_comma = true 11 | force_grid_wrap = 0 12 | use_parentheses = true 13 | ensure_newline_before_comments = true 14 | line_length = 119 15 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training Setup 2 | 3 | Distributed Training with FairScale and DeepSpeed behind Hugging Face Transformers can be a bit tricky, especially on 4 | our shared cluster environment. Here are the steps we took to get things working: 5 | 6 | ## Single-Node DDP Setup 7 | 8 | This works out-of-the-box, and didn't require any special installation. There is currently a weird issue where 9 | running with `torch.distributed.launch` doesn't actually transfer `local_rank` to the base Quinfig. We have an open 10 | issue, hopefully will be resolved soon. 11 | 12 | Everything else seems to work as desired (including logging). 13 | 14 | ## Single-Node FairScale Setup 15 | 16 | Cluster environment by default has several CUDA versions installed. The default CUDA (default `nvcc` used to build 17 | FairScale, DeepSpeed) is 10.1, but Mistral is built with CUDA 11.0. We followed the Hugging Face instructions to update 18 | our `$PATH` and `$LD_LIBRARY_PATH` prior to running the installation to reconcile this. 
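Before (re)building either library, it is worth a quick check that the `nvcc` on your `$PATH` and the CUDA build of
PyTorch actually agree -- a mismatch here is the usual cause of cryptic extension-build failures. A minimal sanity
check (11.0 is the version we expect on the Sphinxes):

```
# CUDA toolkit picked up from $PATH -- should report release 11.0
nvcc --version | grep release

# CUDA version PyTorch was built against -- should also say 11.0
python -c "import torch; print(torch.version.cuda)"
```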
19 | 20 | This **should** only need to happen once (Sidd took care of it), but if we need to update/transfer machines, follow 21 | these instructions: 22 | 23 | ``` 24 | # On the Sphinxes 25 | export PATH=/usr/local/cuda-11.0/bin:$PATH 26 | export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH 27 | 28 | # Confirm NVCC is on CUDA 11.0 29 | which nvcc 30 | 31 | # Make sure `mistral` Conda Environment is Activated 32 | conda activate mistral 33 | 34 | # Install `fairscale` -- note that Fairscale is changing rapidly, so may need to update repeatedly. 35 | pip install fairscale 36 | 37 | # Install `deepspeed` -- note that DeepSpeed is also changing rapidly (but is more stable and better documented than 38 | # Fairscale). Usually, try to prefer DeepSpeed. 39 | pip install deepspeed 40 | 41 | # Verify DeepSpeed Install --> should not crash, will print stuff about JIT-compiled OPs that you can ignore. 42 | ds_report 43 | 44 | # Copy hostfile to /job/hostfile on Sphinxes (Unclear if we need this, but let's suppress the warning...) 45 | cp scripts/deepspeed/hostfile /job/hostfile 46 | ``` 47 | -------------------------------------------------------------------------------- /scripts/benchmarking/dial-in/mistral-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Dry-Runs with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16/8/4. 3 | 4 | # Constants 5 | CONFIG="--config conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 10 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 11 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 12 | 13 | # Gradient Checkpointing 14 | FULL_GC="--model.gradient_checkpointing true --model.gc_checkpoint_every 1" 15 | GC_6="--model.gradient_checkpointing true --model.gc_checkpoint_every 6" 16 | GC_8="--model.gradient_checkpointing true --model.gc_checkpoint_every 8" 17 | GC_12="--model.gradient_checkpointing true --model.gc_checkpoint_every 12" 18 | 19 | # DeepSpeed Training Configuration 20 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 21 | 22 | # Set DeepSpeed Launcher Parameters 23 | MASTER_ADDR=sphinx1.stanford.edu 24 | MASTER_PORT=7000 25 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 26 | 27 | # --- 28 | 29 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Sleep 30 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_4 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=4-no-gc 31 | #pkill -f "train.py" 32 | #sleep 3 33 | # 34 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 --> Cleanup --> Sleep 35 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-no-gc 36 | #pkill -f "train.py" 37 | #sleep 3 38 | # 39 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 32 (+ GC=ALL) --> Cleanup --> Sleep 40 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $FULL_GC $D_BSZ_32 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=32-gc=all 41 | #pkill -f "train.py" 42 | #sleep 3 43 | # 44 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=6) --> Cleanup --> Sleep 45 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_6 $D_BSZ_8 $DS_Z2 --run_id 
gpt2-medium-dry-run-dbsz=8-gc-every=6-gamma 46 | pkill -f "train.py" 47 | sleep 3 48 | 49 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=8) --> Cleanup --> Sleep 50 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_8 $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-gc=8-evenly 51 | #pkill -f "train.py" 52 | #sleep 3 53 | 54 | ## Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 8 (+ GC=12) --> Cleanup --> Sleep 55 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC_12 $D_BSZ_8 $DS_Z2 --run_id gpt2-medium-dry-run-dbsz=8-gc=12-evenly 56 | #pkill -f "train.py" 57 | #sleep 3 58 | -------------------------------------------------------------------------------- /scripts/benchmarking/dial-in/mistral-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-small.sh 2 | # Mistral GPT-2 Small Dry-Runs with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. 3 | 4 | # Constants 5 | CONFIG="--config conf/gpt2-mistral-small-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 10 | 11 | # DeepSpeed Training Configuration 12 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 13 | 14 | # Set DeepSpeed Launcher Parameters 15 | MASTER_ADDR=sphinx1.stanford.edu 16 | MASTER_PORT=7000 17 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 18 | 19 | # --- 20 | 21 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep 22 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id alpha-dry-run-lr=linear-dbsz=16 23 | pkill -f "train.py" 24 | sleep 3 25 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/ddp-multi.sh: -------------------------------------------------------------------------------- 1 | # ddp-multi.sh 2 | # Benchmarking Script for Intense Multi-Node DDP, running FP 16 with and without gradient checkpointing. 3 | # 4 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 5 | # these in the future... 
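#
# A rough sketch of what that automation could look like: the paired commands in these multi-node
# scripts differ only in --node_rank, so a small loop like the following (illustrative only, not wired
# into the current scripts) could print the command to paste on each machine:
#
#   for NODE_RANK in 0 1; do
#     echo "python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank ${NODE_RANK} \
#       --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml ..."
#   done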
6 | # --- 7 | 8 | ## =>> Sphinx1 9 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id alfa-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 10 | 11 | ## =>> Sphinx2 12 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id alfa-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 13 | 14 | # --- 15 | 16 | # Multi-Node DDP, ++GC, FP16, Device BSZ = 32 17 | 18 | ## =>> Sphinx1 19 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id bravo-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 20 | 21 | ## =>> Sphinx2 22 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id bravo-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 23 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/deepspeed-multi.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-multi.sh 2 | # Benchmarking Script for Intensive Multi-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2) 3 | # without gradient checkpointing. 
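#
# Multi-node `deepspeed` launches read the node list from a hostfile (see `conf/deepspeed/hostfile`,
# copied to `/job/hostfile` as described in `scripts/README.md`). The expected format is one line per
# node -- the hostnames below are just the two Sphinx machines used in this script:
#
#   sphinx1.stanford.edu slots=8
#   sphinx2.stanford.edu slots=8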
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-intensive-config.yaml" 7 | INFRA="--nnodes 2 --nproc_per_node 8" 8 | 9 | # A Few Choices for Batch Size 10 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 11 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 12 | 13 | # DeepSpeed Configurations 14 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 15 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 16 | 17 | # Set DeepSpeed Launcher Parameters 18 | MASTER_ADDR=sphinx1.stanford.edu 19 | MASTER_PORT=7000 20 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 21 | 22 | # --- 23 | 24 | # Multi-Node Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 25 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id echo-ds=z1-n=2-g=8-fp16-dbsz=8 26 | pkill -f "train.py" 27 | sleep 3 28 | 29 | # Multi-Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 30 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id foxtrot-ds=z1-n=2-g=8-fp16-dbsz=16 31 | pkill -f "train.py" 32 | sleep 3 33 | 34 | # Multi-Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 35 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id golf-ds=z2-n=2-g=8-fp16-dbsz=8 36 | pkill -f "train.py" 37 | sleep 3 38 | 39 | # Multi-Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 40 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id hotel-ds=z2-n=1-g=8-fp16-dbsz=16 41 | pkill -f "train.py" 42 | sleep 3 43 | -------------------------------------------------------------------------------- /scripts/benchmarking/intensive-benchmarking/fairscale-multi.sh: -------------------------------------------------------------------------------- 1 | # fairscale-multi.sh 2 | # Benchmarking Script for Multi-Node FairScale Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 3) 3 | # with and without gradient checkpointing. Batch Sizes here are taken from the Single-Node FS Runs (since nothing 4 | # changes across node boundaries w.r.t. ZeRO. 5 | # 6 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 7 | # these in the future... 
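#
# For scale: with 2 nodes x 8 GPUs and a per-device batch size of 8, each optimizer step sees
# 2 * 8 * 8 = 128 sequences (before any gradient accumulation set in the training config). The ZeRO
# stage only changes how optimizer state / gradients / parameters are sharded, not the effective batch.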
8 | # --- 9 | 10 | # Multi-Node FS-Z2, No GC, FP16, Device BSZ = 8 11 | 12 | ## =>> Sphinx1 13 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_2+auto_wrap --run_id charlie-fs=z2-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 14 | 15 | ## =>> Sphinx2 16 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_2+auto_wrap --run_id charlie-fs=z2-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 17 | 18 | # --- 19 | 20 | # Multi-Node FS-Z3, No GC, FP16, Device BSZ = 8 21 | 22 | ## =>> Sphinx1 23 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_3+auto_wrap --run_id delta-fs=z3-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 24 | 25 | ## =>> Sphinx2 26 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-intensive-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --training_arguments.sharded_ddp zero_dp_3+auto_wrap --run_id delta-fs=z3-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 27 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | 3 | Full set of Benchmarking Runs can be found in 4 | [this Notion document](https://www.notion.so/skaramcheti/Mistral-Benchmarking-DS-FS-b9d1c15bffbb4694adcad8b51a6f890b). 5 | 6 | Crucially, we try to script as many of the benchmarking runs that we can to just run sequentially, but for many cases 7 | (especially for multi-node), we provide written instructions for how to run. 8 | 9 | We chunk the runs and provide benchmarking instructions in the following sections: 10 | 11 | ## Vanilla Trainer 12 | 13 | The First 20 Runs (Vanilla/Single-GPU Trainer) can all be run programatically as follows: 14 | 15 | ``` 16 | # From the root of the `mistral` directory 17 | ./scripts/benchmarking/standard-benchmarking/vanilla.sh 18 | ``` 19 | 20 | Note, however, that these runs take forever, so best to launch these last, right before you go to sleep! 21 | 22 | ## Single-Node & Multi-Node DDP Trainer 23 | 24 | Runs 21 - 24 (Single-Node DDP Trainer) can all be run programatically as follows: 25 | 26 | ``` 27 | # From the root of the `mistral` directory 28 | ./scripts/benchmarking/standard-benchmarking/ddp-single.sh 29 | ``` 30 | 31 | Runs 25 - 28 (Multi-Node DDP Trainer) can be run manually (because multiple nodes!) 
via the directions in the 32 | following script: `scripts/benchmarking/standard-benchmarking/ddp-multi.sh` 33 | 34 | ## FairScale Trainer 35 | 36 | Runs 29 - 37 (Single Node FairScale with Z1, Z2, and Z3) can all be run programmatically as follows: 37 | 38 | ``` 39 | # From the root of the `mistral` directory 40 | ./scripts/benchmarking/standard-benchmarking/fairscale-single.sh 41 | ``` 42 | 43 | Runs 38 - 43 (Multi-Node FairScale Trainer) can be run manually (because multiple nodes!) via the directions in the 44 | following script: `scripts/benchmarking/standard-benchmarking/ddp-multi.sh`. 45 | 46 | ## DeepSpeed Trainer 47 | 48 | Runs 44 - 52 (Single Node DeepSpeed with Z1, Z2, and Z3) can all be run programmatically as follows: 49 | 50 | ``` 51 | # From the root of the `mistral` directory 52 | ./scripts/benchmarking/standard-benchmarking/deepspeed-single.sh 53 | ``` 54 | 55 | Runs 53 - 58 (Multi-Node DeepSpeed with just Z1, Z2) can also all be run programmatically: 56 | 57 | ``` 58 | # From the root of the `mistral` directory 59 | ./scripts/benchmarking/standard-benchmarking/deepspeed-multi.sh 60 | ``` 61 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ddp-multi.sh: -------------------------------------------------------------------------------- 1 | # ddp-multi.sh 2 | # Benchmarking Script for Multi-Node DDP Trainer, verifying distributed data parallel training with and without 3 | # gradient checkpointing as well as with different batch sizes. As with `ddp-single` choice of batch size is 4 | # directly informed by max/best performing Vanilla runs. 5 | # 6 | # Note: Sidd handwrote these scripts, but would be nice to spend some time figuring out how to automate generating 7 | # these in the future... 
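#
# Each Sphinx1 / Sphinx2 pair below is the same invocation except for --node_rank (0 on the master,
# 1 on the second machine); both must be started by hand, one per node, and together they form a
# 2 x 8 = 16 process world. A parameterized form of the first pair (illustrative only):
#
#   NODE_RANK=0   # 0 on sphinx1, 1 on sphinx2
#   python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank $NODE_RANK \
#     --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml \
#     --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 \
#     --run_id 25-ddp-n=2-g=8-fp32-dbsz=8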
8 | # --- 9 | 10 | # Multi-Node DDP, No GC, FP32, Device BSZ = 8 11 | 12 | ## =>> Sphinx1 13 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 --run_id 25-ddp-n=2-g=8-fp32-dbsz=8; pkill -f "train.py"; sleep 3 14 | 15 | ## =>> Sphinx 2 16 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.per_device_train_batch_size 8 --run_id 25-ddp-n=2-g=8-fp32-dbsz=8; pkill -f "train.py"; sleep 3 17 | 18 | # --- 19 | 20 | # Multi-Node DDP, ++GC, FP32, Device BSZ = 32 21 | 22 | ## =>> Sphinx1 23 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.per_device_train_batch_size 32 --run_id 26-ddp-n=2-g=8-gc-fp32-dbsz=32; pkill -f "train.py"; sleep 3 24 | 25 | ## =>> Sphinx2 26 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.per_device_train_batch_size 32 --run_id 26-ddp-n=2-g=8-gc-fp32-dbsz=32; pkill -f "train.py"; sleep 3 27 | 28 | # --- 29 | 30 | # Multi-Node DDP, No GC, FP16, Device BSZ = 8 31 | 32 | ## =>> Sphinx1 33 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id 27-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 34 | 35 | ## =>> Sphinx2 36 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 --run_id 27-ddp-n=2-g=8-fp16-dbsz=8; pkill -f "train.py"; sleep 3 37 | 38 | # --- 39 | 40 | # Multi-Node DDP, ++GC, FP16, Device BSZ = 32 41 | 42 | ## =>> Sphinx1 43 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 0 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id 28-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 44 | 45 | ## =>> Sphinx2 46 | python -m torch.distributed.launch --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr=sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --model.gradient_checkpointing true --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32 --run_id 28-ddp-n=2-g=8-gc-fp16-dbsz=32; pkill -f "train.py"; sleep 3 47 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ddp-single.sh: -------------------------------------------------------------------------------- 1 | # ddp-single.sh 2 | # 
Benchmarking Script for Single-Node DDP Trainer, verifying distributed data parallel training with and without 3 | # gradient checkpointing as well as with different batch sizes. The choice of batch size in this script were derived 4 | # directly from the results of the Vanilla runs! 5 | 6 | # Constants 7 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 8 | INFRA="--nnodes 1 --nproc_per_node 8" 9 | GC="--model.gradient_checkpointing true" 10 | FP16="--training_arguments.fp16 true" 11 | 12 | # Only Two Choices for Batch Size -- Max for w/ Gradient Checkpointing (32 on 40 GB A100) and w/o (8 on 40GB A100) 13 | D_BSZ_8="--training_arguments.per_device_train_batch_size 8" 14 | D_BSZ_32="--training_arguments.per_device_train_batch_size 32" 15 | 16 | # Setup Distributed Launch Parameters -- We probably don't need Master Address/Port, but including for completeness 17 | MASTER_ADDR=sphinx1.stanford.edu 18 | MASTER_PORT=7000 19 | WORLD_SIZE=8 20 | DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR" 21 | LAUNCHER="torch.distributed.launch" 22 | 23 | # --- 24 | 25 | # Single Node DDP, No GC, FP32, Device BSZ = 8 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 26 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 --run_id 21-ddp-n=1-g=8-fp32-dbsz=8 27 | pkill -f "train.py" 28 | sleep 3 29 | 30 | # Single Node DDP, ++GC, FP32, Device BSZ = 32 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 31 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 --run_id 22-ddp-n=1-g=8-gc-fp32-dbsz=32 32 | pkill -f "train.py" 33 | sleep 3 34 | 35 | # Single Node DDP, No GC, FP16, Device BSZ = 8 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 36 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $FP16 $D_BSZ_8 --run_id 23-ddp-n=1-g=8-fp16-dbsz=8 37 | pkill -f "train.py" 38 | sleep 3 39 | 40 | # Single Node DDP, ++GC, FP32, Device BSZ = 32 --> Cleanup (`torch.distributed.launch` doesn't like cleanup) --> Sleep 41 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_32 --run_id 24-ddp-n=1-g=8-gc-fp16-dbsz=32 42 | pkill -f "train.py" 43 | sleep 3 44 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/deepspeed-multi.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-multi.sh 2 | # Benchmarking Script for Multi-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2, NO Z3) 3 | # with and without gradient checkpointing. 
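#
# Run-id convention used below (and across these benchmarking scripts), e.g. 53-ds=z1-n=2-g=8-fp16-dbsz=8:
#   53     -> run number from the benchmarking table (see the README in this directory)
#   ds=z1  -> DeepSpeed ZeRO stage 1
#   n=2    -> number of nodes
#   g=8    -> GPUs per node
#   gc     -> gradient checkpointing enabled (when present)
#   fp16   -> mixed-precision training
#   dbsz=8 -> per-device train batch size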
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 2 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # DeepSpeed Configurations 16 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 17 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 18 | DS_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 19 | 20 | # Set DeepSpeed Launcher Parameters 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 24 | 25 | # --- 26 | 27 | # Multi-Node Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 28 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id 53-ds=z1-n=2-g=8-fp16-dbsz=8 29 | pkill -f "train.py" 30 | sleep 3 31 | 32 | # Multi-Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 33 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id 54-ds=z1-n=2-g=8-fp16-dbsz=16 34 | pkill -f "train.py" 35 | sleep 3 36 | 37 | # Multi-Node DS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 38 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z1 --run_id 55-ds=z1-n=2-g=8-gc-fp16-dbsz=32 39 | pkill -f "train.py" 40 | sleep 3 41 | 42 | # Multi-Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 43 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id 56-ds=z2-n=2-g=8-fp16-dbsz=8 44 | pkill -f "train.py" 45 | sleep 3 46 | 47 | # Multi-Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 48 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id 57-ds=z2-n=1-g=8-fp16-dbsz=16 49 | pkill -f "train.py" 50 | sleep 3 51 | 52 | # Multi-Node DS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 53 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z2 --run_id 58-ds=z2-n=1-g=8-gc-fp16-dbsz=32 54 | pkill -f "train.py" 55 | sleep 3 56 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/deepspeed-single.sh: -------------------------------------------------------------------------------- 1 | # deepspeed-single.sh 2 | # Benchmarking Script for Single-Node DeepSpeed Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 3) 3 | # with and without gradient checkpointing. 
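#
# With the variables defined below, each run composes into a single `deepspeed` invocation; fully
# expanded, the first one is roughly (illustrative -- the echo-before-run pattern in scripts/mistral-*.sh
# is a handy way to double-check an expansion):
#
#   deepspeed --num_gpus 8 --num_nodes 1 --master_addr sphinx1.stanford.edu train.py \
#     --config conf/gpt2-benchmark-config.yaml --nnodes 1 --nproc_per_node 8 \
#     --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 \
#     --training_arguments.deepspeed conf/deepspeed/z1-conf.json \
#     --run_id 44-ds=z1-n=1-g=8-fp16-dbsz=8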
4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # DeepSpeed Configurations 16 | DS_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 17 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 18 | DS_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 19 | 20 | # Set DeepSpeed Launcher Parameters 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | WORLD_SIZE=8 24 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 1 --master_addr $MASTER_ADDR" 25 | 26 | # --- 27 | 28 | # Single Node DS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 29 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z1 --run_id 44-ds=z1-n=1-g=8-fp16-dbsz=8 30 | pkill -f "train.py" 31 | sleep 3 32 | 33 | # Single Node DS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 34 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z1 --run_id 45-ds=z1-n=1-g=8-fp16-dbsz=16 35 | pkill -f "train.py" 36 | sleep 3 37 | 38 | # Single Node DS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 39 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z1 --run_id 46-ds=z1-n=1-g=8-gc-fp16-dbsz=32 40 | pkill -f "train.py" 41 | sleep 3 42 | 43 | # Single Node DS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 44 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z2 --run_id 47-ds=z2-n=1-g=8-fp16-dbsz=8 45 | pkill -f "train.py" 46 | sleep 3 47 | 48 | # Single Node DS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 49 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 --run_id 48-ds=z2-n=1-g=8-fp16-dbsz=16 50 | pkill -f "train.py" 51 | sleep 3 52 | 53 | # Single Node DS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 54 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z2 --run_id 49-ds=z2-n=1-g=8-gc-fp16-dbsz=32 55 | pkill -f "train.py" 56 | sleep 3 57 | 58 | # Single Node DS-Z3, No GC, Device BSZ = 8 --> Cleanup --> Sleep 59 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $DS_Z3 --run_id 50-ds=z3-n=1-g=8-fp16-dbsz=8 60 | pkill -f "train.py" 61 | sleep 3 62 | 63 | # Single Node DS-Z3, No GC, Device BSZ = 16 --> Cleanup --> Sleep 64 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z3 --run_id 51-ds=z3-n=1-g=8-fp16-dbsz=16 65 | pkill -f "train.py" 66 | sleep 3 67 | 68 | # Single Node DS-Z3, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 69 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $DS_Z3 --run_id 52-ds=z3-n=1-g=8-gc-fp16-dbsz=32 70 | pkill -f "train.py" 71 | sleep 3 72 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/ds-evaluation-bsz.sh: -------------------------------------------------------------------------------- 1 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 2 2 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 
--training_arguments.per_device_eval_batch_size 2 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 64-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=2 3 | pkill -f "train.py" 4 | sleep 3 5 | 6 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 4 7 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 4 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 65-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=4 8 | pkill -f "train.py" 9 | sleep 3 10 | 11 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 8 12 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 8 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 66-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=8 13 | pkill -f "train.py" 14 | sleep 3 15 | 16 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 16 17 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 16 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 67-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=16 18 | pkill -f "train.py" 19 | sleep 3 20 | 21 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 32 22 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 32 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 68-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=32 23 | pkill -f "train.py" 24 | sleep 3 25 | 26 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 64 27 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 64 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 69-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=64 28 | pkill -f "train.py" 29 | sleep 3 30 | 31 | # ZeRO-1 -- Multi-Node - Evaluation BSZ = 128 32 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.per_device_eval_batch_size 128 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 70-eval-ds=z1-n=2-g=8-fp16-dbsz=16-ebsz=128 33 | pkill -f "train.py" 34 | sleep 3 35 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/fairscale-single.sh: -------------------------------------------------------------------------------- 1 | # fairscale-single.sh 2 | # Benchmarking Script for Single-Node FairScale Trainer, verifying multi-stage sharded training (ZeRO 1, 2, and 
3) 3 | # with and without gradient checkpointing. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | GC="--model.gradient_checkpointing true" 9 | 10 | # A Few Choices for Batch Size 11 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 12 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 13 | D_BSZ_32="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 32" 14 | 15 | # FairScale Parameter 16 | FS_Z1="--training_arguments.sharded_ddp simple" 17 | FS_Z2="--training_arguments.sharded_ddp zero_dp_2+auto_wrap" 18 | FS_Z3="--training_arguments.sharded_ddp zero_dp_3+auto_wrap" 19 | 20 | # Setup Distributed Launch Parameters -- We probably don't need Master Address/Port, but including for completeness 21 | MASTER_ADDR=sphinx1.stanford.edu 22 | MASTER_PORT=7000 23 | WORLD_SIZE=8 24 | DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR" 25 | LAUNCHER="torch.distributed.launch" 26 | 27 | # --- 28 | 29 | # Single Node FS-Z1, No GC, Device BSZ = 8 --> Cleanup --> Sleep 30 | #python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z1 --run_id 29-fs=z1-n=1-g=8-fp16-dbsz=8 31 | #pkill -f "train.py" 32 | #sleep 3 33 | 34 | # Single Node FS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 35 | #python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z1 --run_id 30-fs=z1-n=1-g=8-fp16-dbsz=16 36 | #pkill -f "train.py" 37 | #sleep 3 38 | 39 | # Single Node FS-Z1, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 40 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z1 --run_id 31-fs=z1-n=1-g=8-gc-fp16-dbsz=32 41 | pkill -f "train.py" 42 | sleep 3 43 | 44 | # Single Node FS-Z2, No GC, Device BSZ = 8 --> Cleanup --> Sleep 45 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z2 --run_id 32-fs=z2-n=1-g=8-fp16-dbsz=8 46 | pkill -f "train.py" 47 | sleep 3 48 | 49 | # Single Node FS-Z2, No GC, Device BSZ = 16 --> Cleanup --> Sleep 50 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z2 --run_id 33-fs=z2-n=1-g=8-fp16-dbsz=16 51 | pkill -f "train.py" 52 | sleep 3 53 | 54 | # Single Node FS-Z2, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 55 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z2 --run_id 34-fs=z2-n=1-g=8-gc-fp16-dbsz=32 56 | pkill -f "train.py" 57 | sleep 3 58 | 59 | # Single Node FS-Z3, No GC, Device BSZ = 8 --> Cleanup --> Sleep 60 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_8 $FS_Z3 --run_id 35-fs=z3-n=1-g=8-fp16-dbsz=8 61 | pkill -f "train.py" 62 | sleep 3 63 | 64 | # Single Node FS-Z1, No GC, Device BSZ = 16 --> Cleanup --> Sleep 65 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $FS_Z3 --run_id 36-fs=z3-n=1-g=8-fp16-dbsz=16 66 | pkill -f "train.py" 67 | sleep 3 68 | 69 | # Single Node FS-Z3, ++GC, Device BSZ = 32 --> Cleanup --> Sleep 70 | python -m $LAUNCHER $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $GC $D_BSZ_32 $FS_Z3 --run_id 37-fs=z3-n=1-g=8-gc-fp16-dbsz=32 71 | pkill -f "train.py" 72 | sleep 3 73 | -------------------------------------------------------------------------------- /scripts/benchmarking/standard-benchmarking/vanilla.sh: -------------------------------------------------------------------------------- 1 | # vanilla.sh 2 | # Benchmarking Script for Vanilla Trainer 
(very top of the Benchmarking table). This is to get a rough upper bound 3 | # on single-GPU runtime, mostly as a sanity check. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 1" 8 | GC="--model.gradient_checkpointing true" 9 | FP16="--training_arguments.fp16 true" 10 | 11 | # Various Device Batch Sizes 12 | D_BSZ_1="--training_arguments.per_device_train_batch_size 1" 13 | D_BSZ_2="--training_arguments.per_device_train_batch_size 2" 14 | D_BSZ_4="--training_arguments.per_device_train_batch_size 4" 15 | D_BSZ_8="--training_arguments.per_device_train_batch_size 8" 16 | D_BSZ_16="--training_arguments.per_device_train_batch_size 16" 17 | D_BSZ_32="--training_arguments.per_device_train_batch_size 32" 18 | 19 | # --- 20 | 21 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 1 22 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_1 --run_id 01-vanilla-g=1-fp32-dbsz=1 23 | 24 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 2 25 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_2 --run_id 02-vanilla-g=1-fp32-dbsz=2 26 | 27 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 4 28 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_4 --run_id 03-vanilla-g=1-fp32-dbsz=4 29 | 30 | # Single-Node, Single GPU, No GC, FP32, Device BSZ = 8 31 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $D_BSZ_8 --run_id 04-vanilla-g=1-fp32-dbsz=8 32 | 33 | # --- 34 | 35 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 1 36 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_1 --run_id 05-vanilla-g=1-gc-fp32-dbsz=1 37 | 38 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 2 39 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_2 --run_id 06-vanilla-g=1-gc-fp32-dbsz=2 40 | 41 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 4 42 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_4 --run_id 07-vanilla-g=1-gc-fp32-dbsz=4 43 | 44 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 8 45 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_8 --run_id 08-vanilla-g=1-gc-fp32-dbsz=8 46 | 47 | # --- 48 | 49 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 1 50 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_1 --run_id 09-vanilla-g=1-fp16-dbsz=1 51 | 52 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 2 53 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_2 --run_id 10-vanilla-g=1-fp16-dbsz=2 54 | 55 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 4 56 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_4 --run_id 11-vanilla-g=1-fp16-dbsz=4 57 | 58 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 8 59 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $FP16 $D_BSZ_8 --run_id 12-vanilla-g=1-fp16-dbsz=8 60 | 61 | # --- 62 | 63 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 1 64 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_1 --run_id 13-vanilla-g=1-gc-fp16-dbsz=1 65 | 66 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 2 67 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_2 --run_id 14-vanilla-g=1-gc-fp16-dbsz=2 68 | 69 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 4 70 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_4 --run_id 15-vanilla-g=1-gc-fp16-dbsz=4 71 | 72 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 8 73 | CUDA_VISIBLE_DEVICES=0 python 
train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_8 --run_id 16-vanilla-g=1-gc-fp16-dbsz=8 74 | 75 | # --- (Extra Experiments because Gradient Checkpointing Exceeded Expectations) 76 | 77 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 16 78 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_16 --run_id 17-vanilla-g=1-gc-fp32-dbsz=16 79 | 80 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 16 81 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_16 --run_id 18-vanilla-g=1-gc-fp16-dbsz=16 82 | 83 | # Single-Node, Single GPU, ++GC, FP32, Device BSZ = 32 84 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $D_BSZ_32 --run_id 19-vanilla-g=1-gc-fp32-dbsz=32 85 | 86 | # Single-Node, Single GPU, ++GC, FP16, Device BSZ = 32 87 | CUDA_VISIBLE_DEVICES=0 python train.py $CONFIG $INFRA $GC $FP16 $D_BSZ_32 --run_id 20-vanilla-g=1-gc-fp16-dbsz=32 88 | -------------------------------------------------------------------------------- /scripts/debugging/resuming/resume-single-node.sh: -------------------------------------------------------------------------------- 1 | # resume-single-node.sh 2 | # Single Node GPT-2 Small `Resume from Checkpoint` Debugging. Uses the DeepSpeed ZeRO-2 Optimizer, 3 | # Per-Device Batch Size of 16. 4 | 5 | # Constants 6 | CONFIG="--config conf/gpt2-benchmark-config.yaml" 7 | INFRA="--nnodes 1 --nproc_per_node 8" 8 | 9 | # Batch Size 10 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 11 | 12 | # DeepSpeed Training Configuration 13 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 14 | 15 | # Set DeepSpeed Launcher Parameters 16 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 1" 17 | 18 | # --- 19 | 20 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep 21 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $DS_Z2 22 | pkill -f "train.py" 23 | sleep 3 24 | -------------------------------------------------------------------------------- /scripts/debugging/sanity/mistral-sanity-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-sanity-gpt2-small.sh 2 | # Mistral Sanity Check -- GPT-2 Small 4K Step Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. 
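#
# Usage sketch (assumes the `mistral` conda environment is active and the DeepSpeed hostfile described
# in scripts/README.md is in place, so a single invocation from sphinx1 drives both nodes):
#
#   conda activate mistral
#   bash scripts/debugging/sanity/mistral-sanity-gpt2-small.sh
#
# The athos / athos-replica runs share seed 21, so their loss curves should track each other closely;
# porthos (seed 49) and aramis (seed 81) differ only in the seed.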
3 | 4 | # Constants 5 | CONFIG="--config conf/gpt2-debug-config.yaml" 6 | INFRA="--nnodes 2 --nproc_per_node 8" 7 | 8 | # Batch Size 9 | D_BSZ_16="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16" 10 | 11 | # DeepSpeed Training Configuration 12 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-debug-conf.json" 13 | 14 | # Random Seeds -- Athos :: 21, Blizzard :: 49, Cyclone :: 81 15 | ATHOS="--seed 21" 16 | PORTHOS="--seed 49" 17 | ARAMIS="--seed 81" 18 | 19 | # Set DeepSpeed Launcher Parameters 20 | MASTER_ADDR=sphinx1.stanford.edu 21 | MASTER_PORT=7000 22 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr $MASTER_ADDR" 23 | 24 | # Resume 25 | RESUME="--resume true" 26 | 27 | # --- 28 | 29 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 21 30 | #deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ATHOS $DS_Z2 --run_id athos-gpt2-small-debug-x21 31 | #pkill -f "train.py" 32 | #sleep 3 33 | 34 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 21 -- REPLICATION 35 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ATHOS $DS_Z2 --run_id athos-replica-gpt2-small-debug-x21 36 | pkill -f "train.py" 37 | sleep 3 38 | 39 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 49 40 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $PORTHOS $DS_Z2 --run_id porthos-gpt2-small-debug-x49 41 | pkill -f "train.py" 42 | sleep 3 43 | 44 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Sleep =>> Seed 81 45 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG $INFRA $D_BSZ_16 $ARAMIS $DS_Z2 --run_id aramis-gpt2-small-debug-x81 46 | pkill -f "train.py" 47 | sleep 3 48 | -------------------------------------------------------------------------------- /scripts/forget-me-not.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sleep 6h 3 | pkill -f "train.py" 4 | -------------------------------------------------------------------------------- /scripts/mistral-gcp-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gcp-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 4 on Google Cloud with 3 | # MegaGPU Instances. 
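#
# "MegaGPU instances" here are single GCP A2 machines with 16x A100s (the a2-megagpu-16g machine type,
# if memory serves), which is why everything below launches one node with --num_gpus 16 /
# --nproc_per_node 16 instead of the 2 x 8 layout used on the Sphinx cluster.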
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gcp-gpt2-medium.sh MODEL="arwen" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='arwen'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | GCP_CONFIG="--config conf/gpt2-mistral-medium-gcp-config.yaml"; 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | 37 | INFRA="--nnodes 1 --nproc_per_node 16" 38 | 39 | # Batch Size (4 w/o gradient checkpointing, 8 w/ partial gradient checkpointing) 40 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 41 | 42 | # DeepSpeed Training Configurations 43 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-medium-conf.json" 44 | 45 | # Random Seeds -- Arwen :: 21, Beren :: 49, Cerebrimbor :: 81, Durin :: 343, Eowyn :: 777 46 | case $MODEL in 47 | arwen) 48 | SEED="--seed 21" 49 | RUN_ID="--run_id arwen-prime-gpt2-medium-x21" 50 | ;; 51 | beren) 52 | SEED="--seed 49" 53 | RUN_ID="--run_id beren-prime-gpt2-medium-x49" 54 | ;; 55 | cerebrimbor) 56 | SEED="--seed 81" 57 | RUN_ID="--run_id cerebrimbor-prime-gpt2-medium-x81" 58 | ;; 59 | durin) 60 | SEED="--seed 343" 61 | RUN_ID="--run_id durin-prime-gpt2-medium-x343" 62 | ;; 63 | eowyn) 64 | SEED="--seed 777" 65 | RUN_ID="--run_id eowyn-prime-gpt2-medium-x777" 66 | ;; 67 | ?) 68 | usage 69 | exit 70 | ;; 71 | esac 72 | 73 | # Set DeepSpeed Launcher Parameters 74 | DISTRIBUTED_ARGS="--num_gpus 16 --num_nodes 1" 75 | 76 | # --- 77 | 78 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Seed 79 | echo deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 80 | deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 81 | pkill -f "train.py" 82 | sleep 3 83 | -------------------------------------------------------------------------------- /scripts/mistral-gcp-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gcp-gpt2-small.sh 2 | # Mistral GPT-2 Small Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16 on Google Cloud with 3 | # MegaGPU Instances. 
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gcp-gpt2-small.sh MODEL="alias" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='alias'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | GCP_CONFIG="--config conf/gpt2-mistral-small-gcp-config.yaml"; 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | 37 | INFRA="--nnodes 1 --nproc_per_node 16" 38 | 39 | # Batch Size 40 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 41 | 42 | # DeepSpeed Training Configuration 43 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-small-conf.json" 44 | 45 | # Random Seeds -- Alias :: 21, Battlestar :: 49, Caprica :: 81, Darkmatter :: 343, Expanse :: 777 46 | case $MODEL in 47 | alias) 48 | SEED="--seed 21" 49 | RUN_ID="--run_id alias-prime-gpt2-small-x21" 50 | ;; 51 | battlestar) 52 | SEED="--seed 49" 53 | RUN_ID="--run_id battlestar-prime-gpt2-small-x49" 54 | ;; 55 | caprica) 56 | SEED="--seed 81" 57 | RUN_ID="--run_id caprica-prime-gpt2-small-x81" 58 | ;; 59 | darkmatter) 60 | SEED="--seed 343" 61 | RUN_ID="--run_id darkmatter-prime-gpt2-small-x343" 62 | ;; 63 | expanse) 64 | SEED="--seed 777" 65 | RUN_ID="--run_id expanse-prime-gpt2-small-x777" 66 | ;; 67 | firefly) 68 | SEED="--seed 801" 69 | RUN_ID="--run_id firefly-prime-gpt2-small-x801" 70 | ;; 71 | gundam) 72 | SEED="--seed 837" 73 | RUN_ID="--run_id gundam-prime-gpt2-small-x837" 74 | ;; 75 | highlander) 76 | SEED="--seed 900" 77 | RUN_ID="--run_id highlander-prime-gpt2-small-x900" 78 | ;; 79 | ?) 80 | usage 81 | exit 82 | ;; 83 | esac 84 | 85 | # Set DeepSpeed Launcher Parameters 86 | DISTRIBUTED_ARGS="--num_gpus 16 --num_nodes 1" 87 | 88 | # --- 89 | 90 | # Single-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Seed 91 | echo deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 92 | deepspeed $DISTRIBUTED_ARGS train.py $GCP_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 93 | pkill -f "train.py" 94 | sleep 3 95 | -------------------------------------------------------------------------------- /scripts/mistral-gpt2-medium.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-medium.sh 2 | # Mistral GPT-2 Medium Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 4. Runs locally, on 3 | # Sphinx Cluster. 
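#
# To restart an interrupted run, re-launch with RESUME (illustrative; `--resume true` continues the
# same run id from its saved checkpoints -- see docs/tutorials/resume.rst):
#
#   bash scripts/mistral-gpt2-medium.sh MODEL="durin" RESUME="true"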
4 | # 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gpt2-medium.sh MODEL="arwen" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='arwen'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | SPHINX_CONFIG="--config conf/gpt2-mistral-medium-config.yaml" 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | INFRA="--nnodes 2 --nproc_per_node 8" 37 | 38 | # Batch Size 39 | D_BSZ_4="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4" 40 | 41 | # DeepSpeed Training Configuration 42 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-medium-conf.json" 43 | 44 | # Random Seeds -- Arwen :: 21, Beren :: 49, Celebrimbor :: 81, Durin :: 343, Eowyn :: 777 45 | case $MODEL in 46 | arwen) 47 | SEED="--seed 21" 48 | RUN_ID="--run_id arwen-prime-gpt2-medium-x21" 49 | ;; 50 | beren) 51 | SEED="--seed 49" 52 | RUN_ID="--run_id beren-prime-gpt2-medium-x49" 53 | ;; 54 | celebrimbor) 55 | SEED="--seed 81" 56 | RUN_ID="--run_id celebrimbor-prime-gpt2-medium-x81" 57 | ;; 58 | durin) 59 | SEED="--seed 343" 60 | RUN_ID="--run_id durin-prime-gpt2-medium-x343" 61 | ;; 62 | eowyn) 63 | SEED="--seed 777" 64 | RUN_ID="--run_id eowyn-prime-gpt2-medium-x777" 65 | ;; 66 | ?) 67 | usage 68 | exit 69 | ;; 70 | esac 71 | 72 | # Set DeepSpeed Launcher Parameters 73 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu" 74 | 75 | # --- 76 | 77 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 4 --> Cleanup --> Sleep 78 | echo deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 79 | deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_4 $SEED $RES $DS_Z2 $RUN_ID 80 | pkill -f "train.py" 81 | sleep 3 82 | -------------------------------------------------------------------------------- /scripts/mistral-gpt2-small.sh: -------------------------------------------------------------------------------- 1 | # mistral-gpt2-small.sh 2 | # Mistral GPT-2 Small Full Run with the DeepSpeed ZeRO-2 Optimizer, Per-Device Batch Size of 16. Runs locally, on 3 | # Sphinx Cluster. 
4 | 5 | # Parse Named Command Arguments:: 6 | # EX: bash mistral-gpt2-small.sh MODEL="firefly" RESUME="true" 7 | for ARGUMENT in "$@" 8 | do 9 | 10 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 11 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 12 | 13 | case "$KEY" in 14 | MODEL) MODEL=${VALUE} ;; 15 | RESUME) RESUME=${VALUE} ;; 16 | *) 17 | esac 18 | 19 | done 20 | 21 | # Set to Default Values if Param is not Set 22 | if [ -z "$MODEL" ]; then MODEL='firefly'; fi 23 | if [ -z "$RESUME" ]; then RESUME='false'; fi 24 | 25 | echo "MODEL = $MODEL" 26 | echo "RESUME = $RESUME" 27 | 28 | # Constants 29 | SPHINX_CONFIG="--config conf/gpt2-mistral-small-config.yaml" 30 | if [ "$RESUME" == "true" ]; 31 | then 32 | RES="--resume true"; 33 | else 34 | RES=""; 35 | fi 36 | INFRA="--nnodes 2 --nproc_per_node 8" 37 | 38 | # Batch Size 39 | D_BSZ_8="--training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8" 40 | 41 | # DeepSpeed Training Configuration 42 | DS_Z2="--training_arguments.deepspeed conf/deepspeed/z2-small-conf.json" 43 | 44 | # Random Seeds -- Alias :: 21, Battlestar :: 49, Caprica :: 81, Darkmatter :: 343, Expanse :: 777 45 | case $MODEL in 46 | alias) 47 | SEED="--seed 21" 48 | RUN_ID="--run_id alias-prime-gpt2-small-x21" 49 | ;; 50 | battlestar) 51 | SEED="--seed 49" 52 | RUN_ID="--run_id battlestar-prime-gpt2-small-x49" 53 | ;; 54 | caprica) 55 | SEED="--seed 81" 56 | RUN_ID="--run_id caprica-prime-gpt2-small-x81" 57 | ;; 58 | darkmatter) 59 | SEED="--seed 343" 60 | RUN_ID="--run_id darkmatter-prime-gpt2-small-x343" 61 | ;; 62 | expanse) 63 | SEED="--seed 777" 64 | RUN_ID="--run_id expanse-prime-gpt2-small-x777" 65 | ;; 66 | firefly) 67 | SEED="--seed 801" 68 | RUN_ID="--run_id firefly-prime-gpt2-small-x801" 69 | ;; 70 | gundam) 71 | SEED="--seed 837" 72 | RUN_ID="--run_id gundam-prime-gpt2-small-x837" 73 | ;; 74 | highlander) 75 | SEED="--seed 900" 76 | RUN_ID="--run_id highlander-prime-gpt2-small-x900" 77 | ;; 78 | ?) 79 | usage 80 | exit 81 | ;; 82 | esac 83 | 84 | # Set DeepSpeed Launcher Parameters 85 | DISTRIBUTED_ARGS="--num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu" 86 | 87 | # --- 88 | 89 | # Multi-Node DS-Z2, Linear LR Schedule, Device BSZ = 16 --> Cleanup --> Seed 90 | echo deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 91 | deepspeed $DISTRIBUTED_ARGS train.py $SPHINX_CONFIG $INFRA $D_BSZ_8 $SEED $RES $DS_Z2 $RUN_ID 92 | pkill -f "train.py" 93 | sleep 3 94 | -------------------------------------------------------------------------------- /scripts/run/ddp.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 
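# Usage: `bash scripts/run/ddp.sh <nnodes> <node_rank>` -- e.g. `bash scripts/run/ddp.sh 2 0` on sphinx1 and `bash scripts/run/ddp.sh 2 1` on sphinx2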
12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # `torch.distributed.launch` Parameters 17 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER --nnodes ${nnodes} --node_rank ${node_rank} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # export NCCL_DEBUG=INFO; \ 23 | python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS 24 | 25 | # Kill Running Processes (Because `torch.distributed.launch` doesn't like to clean up after itself...) 26 | pkill -f "train.py" 27 | -------------------------------------------------------------------------------- /scripts/run/deepspeed.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # DeepSpeed Launch Parameters 17 | DISTRIBUTED_ARGS="--num_gpus $GPUS_PER --num_nodes ${nnodes} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # DeepSpeed Configurations 23 | DEEPSPEED_Z1="--training_arguments.deepspeed conf/deepspeed/z1-conf.json" 24 | DEEPSPEED_Z2="--training_arguments.deepspeed conf/deepspeed/z2-conf.json" 25 | DEEPSPEED_Z3="--training_arguments.deepspeed conf/deepspeed/z3-conf.json" 26 | 27 | DEEPSPEED_Z1_OFF="--training_arguments.deepspeed conf/deepspeed/z1-offload-conf.json" 28 | DEEPSPEED_Z2_OFF="--training_arguments.deepspeed conf/deepspeed/z2-offload-conf.json" 29 | DEEPSPEED_Z3_OFF="--training_arguments.deepspeed conf/deepspeed/z3-offload-conf.json" 30 | 31 | # export NCCL_DEBUG=INFO; \ 32 | # =>> ZeRO-1 33 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z1 34 | 35 | # =>> ZeRO-2 36 | deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z2 37 | 38 | # =>> ZeRO-3 39 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z3 40 | 41 | # =>> ZeRO-1 Offload 42 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z1_OFF 43 | 44 | # =>> ZeRO-2 Offload 45 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z2_OFF 46 | 47 | # =>> ZeRO-3 Offload 48 | # deepspeed $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $DEEPSPEED_Z3_OFF 49 | 50 | # Kill Running Processes (Because `deepspeed` doesn't like to clean up after itself...) 51 | pkill -f "train.py" 52 | -------------------------------------------------------------------------------- /scripts/run/fairscale.sh: -------------------------------------------------------------------------------- 1 | # Sphinx1 Private IP: 172.24.67.75 2 | # Sphinx2 Private IP: 172.24.67.78 3 | 4 | # Command Line Arguments 5 | nnodes=${1:-1} 6 | node_rank=${2:-0} 7 | 8 | # Default Configuration of GPUs on the Sphinx Machines 9 | GPUS_PER=8 10 | 11 | # Assumes `sphinx1` is the main node - node rank must be 0 on sphinx1! 
12 | MASTER_ADDR=sphinx1.stanford.edu 13 | MASTER_PORT=7000 14 | WORLD_SIZE=$((${nnodes}*${node_rank})) 15 | 16 | # `torch.distributed.launch` Parameters 17 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER --nnodes ${nnodes} --node_rank ${node_rank} --master_addr $MASTER_ADDR" 18 | 19 | # Default `train.py` config arguments 20 | CONFIG_ARGS="--config conf/gpt2-sphinx-debug-config.yaml --nproc_per_node $GPUS_PER --nnodes ${nnodes}" 21 | 22 | # FairScale Parameters 23 | FAIRSCALE_Z1="--training_arguments.sharded_ddp simple" 24 | FAIRSCALE_Z2="--training_arguments.sharded_ddp zero_dp_2+auto_wrap" 25 | FAIRSCALE_Z3="--training_arguments.sharded_ddp zero_dp_3+auto_wrap" 26 | FAIRSCALE_Z2_OFF="--training_arguments.sharded_ddp zero_dp_2+auto_wrap+offload" 27 | FAIRSCALE_Z3_OFF="--training_arguments.sharded_ddp zero_dp_3+auto_wrap+offload" 28 | 29 | # export NCCL_DEBUG=INFO; \ 30 | # =>> ZeRO-1 (Simple) 31 | python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z1 32 | 33 | # =>> ZeRO-2 34 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z2 35 | 36 | # =>> ZeRO-3 37 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z3 38 | 39 | # TODO D :: Offloading Doesn't Work Yet? 40 | # =>> ZeRO-2 Offload 41 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z2_OFF 42 | 43 | # =>> ZeRO-3 Offload 44 | # python -m torch.distributed.launch $DISTRIBUTED_ARGS train.py $CONFIG_ARGS $FAIRSCALE_Z3_OFF 45 | 46 | # Kill Running Processes (Because `torch.distributed.launch` doesn't like to clean up after itself...) 47 | pkill -f "train.py" 48 | -------------------------------------------------------------------------------- /scripts/run/multi-node.sh: -------------------------------------------------------------------------------- 1 | # ZeRO-1 -- Multi-Node! 
2 | deepspeed --num_gpus 8 --num_nodes 2 --master_addr sphinx1.stanford.edu train.py --config conf/gpt2-benchmark-config.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 16 --training_arguments.deepspeed scripts/deepspeed/z1-conf.json --run_id 63-sk-on-eval-ds=z1-n=2-g=8-fp16-dbsz=16 3 | pkill -f "train.py" 4 | sleep 3 5 | -------------------------------------------------------------------------------- /scripts/run/single-node.sh: -------------------------------------------------------------------------------- 1 | # Single-Node, Single GPU, No GC, FP16, Device BSZ = 8 2 | CUDA_VISIBLE_DEVICES=0 python train.py --config conf/gpt2-benchmark-config.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 8 3 | -------------------------------------------------------------------------------- /setup/conda-requirements.txt: -------------------------------------------------------------------------------- 1 | python=3.8.12 2 | pytorch=1.11.0 3 | torchdata 4 | cudatoolkit=11.3.1 5 | -------------------------------------------------------------------------------- /setup/pip-requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.0.0 2 | deepspeed==0.6.5 3 | huggingface-hub==0.4.0 4 | jsonlines==3.0.0 5 | pytest==7.1.2 6 | quinine==0.3.0 7 | transformers==4.18.0 8 | wandb==0.12.17 9 | zstandard>=0.17.0 10 | pyarrow>=7.0.0 11 | -------------------------------------------------------------------------------- /setup/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 4 | 5 | ENV_NAME="mistral" 6 | # if we have an arg to the script, use it as the env name 7 | if [ $# -eq 1 ]; then 8 | ENV_NAME=$1 9 | fi 10 | 11 | if [ "$CONDA_DEFAULT_ENV" != "base" ]; then 12 | echo "Error: run setup from base environment!" 13 | exit 14 | fi 15 | echo "Creating mistral conda environment '${ENV_NAME}'!" 16 | conda create -y -n "${ENV_NAME}" --file ${SCRIPT_DIR}/conda-requirements.txt -c pytorch 17 | . $CONDA_PREFIX/etc/profile.d/conda.sh 18 | conda activate "${ENV_NAME}" 19 | if [ "$CONDA_DEFAULT_ENV" = "${ENV_NAME}" ]; then 20 | echo "Installing python dependencies with pip!" 21 | pip install -r ${SCRIPT_DIR}/pip-requirements.txt 22 | fi 23 | echo "Successfully created mistral environment '${ENV_NAME}'!" 
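# Usage: `bash setup/setup.sh [env-name]` -- must be run from the conda `base` environment; the environment name defaults to `mistral`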
24 | -------------------------------------------------------------------------------- /setup/test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.1.0 2 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/src/__init__.py -------------------------------------------------------------------------------- /src/args/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for handling arguments from quinfigs and command line 3 | """ 4 | 5 | from .training_args import get_training_arguments 6 | -------------------------------------------------------------------------------- /src/args/training_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | training_args.py 3 | 4 | Utility script for unloading Quinfigs into full set of Training Arguments, as well as for handling any argument 5 | overrides (e.g., paths that are defined at runtime, parameters that are dynamically computed such as gradient 6 | accumulation). 7 | """ 8 | import logging 9 | from pathlib import Path 10 | from typing import Optional 11 | 12 | from munch import Munch 13 | from transformers import TrainingArguments 14 | 15 | 16 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 17 | overwatch = logging.getLogger("mistral.args.training") 18 | 19 | 20 | def get_training_arguments( 21 | quinfig_args: Munch, 22 | run_name: str, 23 | output_dir: Path, 24 | seed: int, 25 | local_rank: int, 26 | world_size: int, 27 | effective_bsz: int, 28 | gradient_checkpointing: Optional[bool] = None, 29 | ) -> TrainingArguments: 30 | """Initialize Training Arguments from Quinfig and Runtime-Defined Variables.""" 31 | 32 | # `quinfig_args` already contains some default training arguments --> we'll be overwriting/adding to the Dict 33 | # =>> a `Munch` is a subclass of Dictionary that supports attribute style access 34 | training_args = quinfig_args 35 | training_args.run_name = run_name 36 | training_args.output_dir = output_dir 37 | training_args.seed = seed 38 | training_args.data_seed = seed 39 | training_args.local_rank = local_rank 40 | 41 | # Since we Implement a Custom W&B / JSON Logging Callback, we don't report to anyone -- we've gone rogue! 42 | training_args.report_to = "none" 43 | 44 | # do it this way so we start supporting gradient_checkpointing in training_args à la Transformers 45 | if gradient_checkpointing is not None: 46 | training_args.gradient_checkpointing = gradient_checkpointing 47 | 48 | # If "sharded_ddp" is None --> replace with False 49 | if training_args.sharded_ddp is None: 50 | training_args.sharded_ddp = False 51 | else: 52 | assert isinstance(training_args.sharded_ddp, str) and training_args.sharded_ddp in [ 53 | "simple", 54 | "zero_dp_2+auto_wrap", 55 | "zero_dp_2+auto_wrap+offload", 56 | "zero_dp_3+auto_wrap", 57 | "zero_dp_3+auto_wrap+offload", 58 | ] 59 | 60 | # If "+" in `sharded_ddp` --> Split, and then join... this is kinda hacky (TODO training_args.A :: Fix!) 
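# (e.g. "zero_dp_2+auto_wrap" becomes "zero_dp_2 auto_wrap", the space-separated form that HF's TrainingArguments expects)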
61 | if "+" in training_args.sharded_ddp: 62 | training_args.sharded_ddp = " ".join(training_args.sharded_ddp.split("+")) 63 | 64 | # Compute Gradient Accumulation Dynamically 65 | training_args.gradient_accumulation_steps = effective_bsz // ( 66 | quinfig_args.per_device_train_batch_size * world_size 67 | ) 68 | overwatch.info( 69 | f"Setting Gradient Accumulation Steps = `{training_args.gradient_accumulation_steps}` [BSZ: {effective_bsz} " 70 | f"World Size: {world_size} Device BSZ: {quinfig_args.per_device_train_batch_size}]" 71 | ) 72 | if ( 73 | training_args.gradient_accumulation_steps <= 0 74 | or effective_bsz % training_args.gradient_accumulation_steps != 0 75 | ): 76 | raise ValueError("Incompatible sizes for gradient accumulation!") 77 | 78 | args = TrainingArguments(**training_args) 79 | 80 | # TODO(dlwh): report this bug to transformers 81 | assert ( 82 | args.dataloader_num_workers == 0 or world_size == 1 83 | ), "dataloader_num_workers must be 0 for multi-gpu training in HF right now" 84 | 85 | return args 86 | -------------------------------------------------------------------------------- /src/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modules for core training, evaluation, and W&B logging processes 3 | """ 4 | 5 | from .callbacks import CustomCheckpointCallback, CustomWandbCallback 6 | from .trainer import OnlineBenchmarkTrainer 7 | -------------------------------------------------------------------------------- /src/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset/Corpus related modules 3 | """ 4 | 5 | from .auto import ONLINE_EVAL_DATA_REGISTRY, get_auto_dataset 6 | -------------------------------------------------------------------------------- /src/corpora/detokenization.py: -------------------------------------------------------------------------------- 1 | """ 2 | detokenization.py 3 | 4 | Handle detokenization for different dataset for zero-shot LM evaluation. 5 | """ 6 | import logging 7 | import re 8 | from typing import Dict 9 | 10 | 11 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 12 | overwatch = logging.getLogger("mistral.corpora.detokenization") 13 | 14 | 15 | def wikitext_detokenize(example: Dict[str, str]) -> Dict[str, str]: 16 | """ 17 | Wikitext is whitespace tokenized and we remove these whitespaces. 18 | 19 | Taken from https://github.com/NVIDIA/Megatron-LM/blob/main/tasks/zeroshot_gpt2/detokenizer.py 20 | """ 21 | # Contractions 22 | text = example["text"] 23 | text = text.replace("s '", "s'") 24 | text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text) 25 | 26 | # Number Separators 27 | text = text.replace(" @-@ ", "-") 28 | text = text.replace(" @,@ ", ",") 29 | text = text.replace(" @.@ ", ".") 30 | 31 | # Punctuation 32 | text = text.replace(" : ", ": ") 33 | text = text.replace(" ; ", "; ") 34 | text = text.replace(" . ", ". ") 35 | text = text.replace(" ! ", "! ") 36 | text = text.replace(" ? ", "? 
") 37 | text = text.replace(" , ", ", ") 38 | 39 | # Double Brackets 40 | text = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", text) 41 | text = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", text) 42 | text = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", text) 43 | text = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', text) 44 | text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text) 45 | 46 | # Miscellaneous 47 | text = text.replace("= = = =", "====") 48 | text = text.replace("= = =", "===") 49 | text = text.replace("= =", "==") 50 | text = text.replace(" " + chr(176) + " ", chr(176)) 51 | text = text.replace(" \n", "\n") 52 | text = text.replace("\n ", "\n") 53 | text = text.replace(" N ", " 1 ") 54 | text = text.replace(" 's", "'s") 55 | 56 | return {"text": text} 57 | 58 | 59 | # Set Registry for Various Datasets 60 | DATASET_TOKENIZATION_REGISTRY = {"wikitext": wikitext_detokenize} 61 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model related modules 3 | """ 4 | 5 | from .auto_clm import get_auto_clm_tokenizer 6 | -------------------------------------------------------------------------------- /src/models/auto_clm.py: -------------------------------------------------------------------------------- 1 | """ 2 | auto_clm.py 3 | 4 | Default Causal Language Model (CLM) & Tokenizer Specification and Initialization. Downloads Model Configuration (if 5 | necessary) from the Hugging Face `transformers` Hub, instantiates pretrained Tokenizer, and initializes model using 6 | the necessary AutoModel class. 7 | """ 8 | import logging 9 | from pathlib import Path 10 | from typing import Dict, Tuple 11 | 12 | import torch 13 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer 14 | from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel 15 | 16 | from ..corpora.tokenization_utils import PassthroughTokenizer 17 | from ..util import REGISTRY 18 | 19 | 20 | # Nest Overwatch under root `mistral` logger, inheriting formatting! 21 | overwatch = logging.getLogger("mistral.models.auto") 22 | 23 | 24 | def get_auto_clm_tokenizer( 25 | model_id: str, 26 | paths: Dict[str, Path], 27 | model_configs: dict = None, 28 | use_pretrained_tokenizer: bool = True, 29 | use_passthrough_tokenizer: bool = False, 30 | reorder_and_upcast_attn: bool = True, 31 | scale_attn_by_inverse_layer_idx: bool = True, 32 | initial_weights: str = None, 33 | ) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]: 34 | """Download/Load AutoConfig and Instantiate Corresponding Model and Tokenizer.""" 35 | 36 | # Create Configuration 37 | if "gpt2" in model_id and model_configs: 38 | overwatch.info(f"Building Hugging Face GPT2Config from provided configs: {model_configs} ...") 39 | config = GPT2Config.from_dict(model_configs) 40 | else: 41 | overwatch.info(f"Fetching Hugging Face AutoConfig for Model: `{REGISTRY[model_id]}`...") 42 | config = AutoConfig.from_pretrained(REGISTRY[model_id], cache_dir=paths["configs"]) 43 | 44 | # mistral config is just gpt2 with the following additional stability fixes 45 | if "mistral" in model_id or "gpt2" in model_id: 46 | config.reorder_and_upcast_attn = reorder_and_upcast_attn 47 | config.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx 48 | 49 | # IMPORTANT :: Set `use_cache` to False -- we don't need it ever and it conflicts with gradient checkpointing! 
50 | config.use_cache = False 51 | 52 | # Create Tokenizer 53 | overwatch.info(f"Fetching Hugging Face [Fast] AutoTokenizer for Model: `{REGISTRY[model_id]}`...") 54 | assert not ( 55 | use_pretrained_tokenizer and use_passthrough_tokenizer 56 | ), "Pretrained and Passthrough tokenization are mutually exclusive" 57 | if use_pretrained_tokenizer: 58 | tokenizer = AutoTokenizer.from_pretrained(REGISTRY[model_id], config=config, cache_dir=paths["tokenizer"]) 59 | elif use_passthrough_tokenizer: 60 | overwatch.info("Using a Pretokenized Dataset") 61 | tokenizer = PassthroughTokenizer(config.vocab_size) 62 | else: 63 | overwatch.error("Tokenizer Training/Initialization (from Scratch) not yet implemented!") 64 | raise NotImplementedError() 65 | 66 | if "gpt2" in model_id: 67 | overwatch.info(f"Initializing Custom GPT-2 Model from Configuration: `{REGISTRY[model_id]}`...") 68 | model = GPT2LMHeadModel(config) 69 | else: 70 | # Initialize Model 71 | overwatch.info(f"Initializing Tabula Rasa Model from Configuration: `{REGISTRY[model_id]}`...") 72 | model = AutoModelForCausalLM.from_config(config) 73 | 74 | # Run GPT-Specific Initialization, if applicable 75 | model.resize_token_embeddings(len(tokenizer)) 76 | 77 | # If `initial_weights` is not None, load weights from path! 78 | if initial_weights is not None: 79 | overwatch.info(f"Initializing Weights from File: `{initial_weights}`...") 80 | model.load_state_dict(torch.load(initial_weights, map_location=torch.device("cpu"))) 81 | 82 | return model, tokenizer 83 | -------------------------------------------------------------------------------- /src/overwatch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package for logging 3 | """ 4 | 5 | from .overwatch import get_overwatch 6 | -------------------------------------------------------------------------------- /src/overwatch/overwatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | overwatch.py 3 | 4 | Utility class for creating a centralized/standardized Python logger, with the Mercury format, at the appropriate 5 | logging level. 6 | """ 7 | import logging 8 | from pathlib import Path 9 | 10 | import datasets 11 | import transformers 12 | 13 | 14 | # Constants - for Formatting 15 | LOG_FORMAT = "|=>> %(asctime)s - %(name)s - %(levelname)s :: %(message)s" 16 | DATE_FORMAT = "%m/%d [%H:%M:%S]" 17 | 18 | 19 | def get_overwatch(path: Path, level: int, local_rank: int = 0) -> logging.Logger: 20 | """ 21 | Initialize logging.Logger with the appropriate name, console, and file handlers. 22 | 23 | :param path: Path for writing log file --> should be identical to run_name (inherited from `train.py`) 24 | :param level: Default logging level --> should usually be INFO (inherited from `train.py`). 25 | :param local_rank: Process Rank (default = -1). Only log to `level` on rank <= 0, otherwise default level is WARN. 26 | 27 | :return: Default "mistral" root logger object :: logging.Logger 28 | """ 29 | # Create Root Logger w/ Base Formatting 30 | logging.basicConfig(level=level, format=LOG_FORMAT, datefmt=DATE_FORMAT) 31 | 32 | # Suppress Hugging Face Loggers --> propagate up to Root! 
33 | transformers.logging._get_library_root_logger().handlers = [] 34 | transformers.logging._get_library_root_logger().setLevel(level=level) 35 | datasets.logging._get_library_root_logger().handlers = [] 36 | 37 | # Create Default Logger & add File Handler 38 | logger = logging.getLogger() 39 | logger.setLevel(level if local_rank <= 0 else logging.WARNING) 40 | 41 | # Only Log to File w/ Rank 0 on each Node 42 | if local_rank <= 0: 43 | # Create File Handler --> Set mode to "a" to append to logs (ok, since each run will be uniquely named) 44 | file_handler = logging.FileHandler(path, mode="a") 45 | file_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) 46 | logger.addHandler(file_handler) 47 | 48 | return logging.getLogger("mistral") 49 | -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utilities including handling of directory set up, model name registry, and more... 3 | """ 4 | 5 | from .paths import create_paths, set_permissions 6 | from .registry import PATH_REGISTRY, REGISTRY 7 | -------------------------------------------------------------------------------- /src/util/paths.py: -------------------------------------------------------------------------------- 1 | """ 2 | paths.py 3 | 4 | Utility function for initializing the appropriate directories/sub-directories on the start of each run. Decoupled from 5 | main code in case we want separate directory structures/artifact storage based on infrastructure (e.g., NLP Cluster vs. 6 | GCP). 7 | """ 8 | import os 9 | from pathlib import Path 10 | from typing import Dict 11 | 12 | from .registry import PATH_REGISTRY 13 | 14 | 15 | def create_paths(run_id: str, model: str, run_dir: str, cache_dir: str) -> Dict[str, Path]: 16 | """ 17 | Create the necessary directories and sub-directories conditioned on the `run_id`, checkpoint directory, and cache 18 | directories. 19 | 20 | :param run_id: Unique Run Identifier. 21 | :param model: Huggingface.Transformers Model ID for specifying the desired configuration. 22 | :param run_dir: Path to run directory to save model checkpoints and run metrics. 23 | :param cache_dir: Path to artifacts/cache directory to store any intermediate values, configurations, etc. 24 | 25 | :return: Dictionary mapping str ids --> paths on the filesystem. 
26 | """ 27 | # To respect shortcuts in paths, such as ~ 28 | cache_dir = os.path.expanduser(cache_dir) 29 | run_dir = os.path.expanduser(run_dir) 30 | 31 | paths = { 32 | # Top-Level Checkpoint Directory for Given Run 33 | "runs": Path(run_dir) / run_id, 34 | # Cache Directories for various components 35 | "configs": Path(cache_dir) / f"{PATH_REGISTRY[model]}-configs", 36 | "tokenizer": Path(cache_dir) / f"{PATH_REGISTRY[model]}-tokenizer", 37 | "dataset": Path(cache_dir) / "datasets", 38 | "preprocessed": Path(cache_dir) / f"{PATH_REGISTRY[model]}-processed", 39 | } 40 | 41 | # Programatically Create Paths for each Directory 42 | for p in paths: 43 | paths[p].mkdir(parents=True, exist_ok=True) 44 | 45 | return paths 46 | 47 | 48 | def set_permissions(paths: Dict[str, Path]) -> None: 49 | """Recursively call `os.chmod(775) recursively for the given paths.""" 50 | for p in paths: 51 | os.system(f"chmod -R 775 {paths[p]} >/dev/null 2>&1") 52 | -------------------------------------------------------------------------------- /src/util/registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | registry.py 3 | 4 | Model/Data Registry :: Human-Readable Identifier --> Huggingface.co ID. Ideally will be expanded upon as we introduce 5 | more model configurations, different types of architectures, etc. 6 | """ 7 | 8 | # Model Names 9 | REGISTRY = { 10 | "gpt2-small": "gpt2", 11 | "gpt2-medium": "gpt2-medium", 12 | "gpt2-large": "gpt2-large", 13 | "gpt2-xl": "gpt2-xl", 14 | "mistral-small": "gpt2", 15 | "mistral-medium": "gpt2-medium", 16 | } 17 | 18 | # Absolute Paths 19 | PATH_REGISTRY = { 20 | "gpt2-small": "gpt2", 21 | "gpt2-medium": "gpt2", 22 | "gpt2-large": "gpt2", 23 | "gpt2-xl": "gpt2", 24 | "mistral-small": "gpt2", 25 | "mistral-medium": "gpt2-medium", 26 | } 27 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Run Tests 2 | 3 | Set this environment variable to a working directory that can store the Hugging Face cache and checkpoints created by the tests: 4 | 5 | ```bash 6 | export MISTRAL_TEST_HOME=/path/to/mistral-test-working-dir 7 | ``` 8 | 9 | From the `tests` directory, run this command to run tests in single node/single GPU mode: 10 | 11 | ```bash 12 | export CUDA_VISIBLE_DEVICES=0 13 | cd tests 14 | pytest 15 | ``` 16 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext103.yaml: -------------------------------------------------------------------------------- 1 | # wikitext103.yaml 2 | # Configuration for WikiText-103 Dataset (https://huggingface.co/datasets/wikitext). 
3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-103-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext2-detokenized.yaml: -------------------------------------------------------------------------------- 1 | # wikitext_2_detokenized.yaml 2 | # Configuration for pre-detokenized WikiText-2 Dataset (https://huggingface.co/datasets/dlwh/wikitext_2_detokenized) 3 | --- 4 | dataset: 5 | id: dlwh/wikitext_2_detokenized 6 | 7 | # Number of Preprocessing Workers 8 | num_proc: 4 9 | 10 | # Number of Evaluation Preprocessing Workers 11 | eval_num_proc: 4 12 | -------------------------------------------------------------------------------- /tests/conf/datasets/wikitext2.yaml: -------------------------------------------------------------------------------- 1 | # wikitext2.yaml 2 | # Configuration for WikiText-2 Dataset (https://huggingface.co/datasets/wikitext). 3 | --- 4 | dataset: 5 | id: wikitext 6 | name: wikitext-2-raw-v1 7 | validation_ratio: null 8 | 9 | # Number of Preprocessing Workers 10 | num_proc: 4 11 | 12 | # Number of Evaluation Preprocessing Workers 13 | eval_num_proc: 4 14 | -------------------------------------------------------------------------------- /tests/conf/deepspeed/z1-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": [ 7 | 0.9, 8 | 0.95 9 | ], 10 | "eps": 1e-8, 11 | "weight_decay": 0.1 12 | } 13 | }, 14 | 15 | "scheduler": { 16 | "type": "WarmupDecayLR", 17 | "params": { 18 | "total_num_steps": 400000, 19 | "warmup_max_lr": 0.0006, 20 | "warmup_num_steps": 4000 21 | } 22 | }, 23 | 24 | "zero_optimization": { 25 | "stage": 1, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 2e8, 28 | "reduce_scatter": true, 29 | "reduce_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "contiguous_gradients": true, 32 | "cpu_offload": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/conf/deepspeed/z2-small-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "AdamW", 4 | "params": { 5 | "lr": 0.0006, 6 | "betas": "auto", 7 | "eps": 1e-8, 8 | "weight_decay": 0.1 9 | } 10 | }, 11 | 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "total_num_steps": "auto", 16 | "warmup_max_lr": 0.0006, 17 | "warmup_num_steps": 4000 18 | } 19 | }, 20 | 21 | "zero_optimization": { 22 | "stage": 2, 23 | "allgather_partitions": true, 24 | "allgather_bucket_size": 2e8, 25 | "reduce_scatter": true, 26 | "reduce_bucket_size": 2e8, 27 | "overlap_comm": true, 28 | "contiguous_gradients": true, 29 | "cpu_offload": false 30 | }, 31 | 32 | "train_batch_size": "auto", 33 | "train_micro_batch_size_per_gpu": "auto" 34 | } 35 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "MistralGPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.0, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.0, 9 | "eos_token_id": 50256, 10 | "gradient_checkpointing": false, 11 | 
"initializer_range": 0.02, 12 | "layer_norm_epsilon": 1e-05, 13 | "model_type": "gpt2", 14 | "n_ctx": 256, 15 | "n_embd": 768, 16 | "n_head": 2, 17 | "n_inner": null, 18 | "n_layer": 2, 19 | "n_positions": 256, 20 | "resid_pdrop": 0.0, 21 | "summary_activation": null, 22 | "summary_first_dropout": 0.0, 23 | "summary_proj_to_labels": true, 24 | "summary_type": "cls_index", 25 | "summary_use_proj": true, 26 | "task_specific_params": { 27 | "text-generation": { 28 | "do_sample": true, 29 | "max_length": 50 30 | } 31 | }, 32 | "transformers_version": "4.5.0", 33 | "use_cache": false, 34 | "vocab_size": 50257 35 | } 36 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-micro.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-micro-config.yaml 2 | # Configuration for the GPT-2 Micro Model. 3 | --- 4 | model: 5 | id: "gpt2-small" 6 | 7 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 8 | gradient_checkpointing: false 9 | 10 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 11 | pretrained_tokenizer: true 12 | 13 | # Sequence Length 14 | seq_len: 256 15 | 16 | # Stability 17 | reorder_and_upcast_attn: true 18 | scale_attn_by_inverse_layer_idx: true 19 | 20 | # Initialize Weights from File 21 | initial_weights: null 22 | 23 | # Configure Model From File 24 | config_path: conf/models/gpt2-micro.json 25 | -------------------------------------------------------------------------------- /tests/conf/models/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-config.yaml 2 | # Configuration for the GPT-2 Small Model. 3 | --- 4 | model: 5 | id: "gpt2-small" 6 | 7 | # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime 8 | gradient_checkpointing: false 9 | 10 | # Add Gradient Checkpointing Every `gc_checkpoint_every` Transformer blocks 11 | # > Checkpoints = (# layers / `gc_checkpoint_every`) Blocks 12 | gc_checkpoint_every: -1 13 | 14 | # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) 15 | pretrained_tokenizer: true 16 | 17 | # Sequence Length 18 | seq_len: 512 19 | 20 | # Stability -- Upcasting and Scaled Dot-Product Reordering 21 | reorder_attn: true 22 | upcast_attn: true 23 | 24 | # Initialize Weights from File 25 | initial_weights: null 26 | -------------------------------------------------------------------------------- /tests/conf/train-diff.yaml: -------------------------------------------------------------------------------- 1 | # hello-world.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 
6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - train.yaml 10 | - trainers/gpt2-small-diff.yaml 11 | 12 | # Artifacts & Caching 13 | artifacts: 14 | cache_dir: /nlp/scr/jebolton/mistral-hello-world/artifacts 15 | run_dir: /nlp/scr/jebolton/mistral-hello-world/runs 16 | 17 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 18 | # - Frequency (`freq`) at which to save checkpoints (# steps) 19 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 20 | checkpoint_frequency: 21 | - [10, 100] 22 | - [50, 2000] 23 | - [100, 20000] 24 | - [1000, 400000] 25 | 26 | # Random Seed 27 | seed: 40 28 | 29 | run_training: false 30 | run_final_eval: false 31 | -------------------------------------------------------------------------------- /tests/conf/train.yaml: -------------------------------------------------------------------------------- 1 | # hello-world.yaml 2 | # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture, 3 | # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16. 4 | # 5 | # Inheritance and core paths can all be overridden from the command line or by re-writing these files. 6 | --- 7 | # Inherit Dataset, Tokenization, Model, and Training Details 8 | inherit: 9 | - datasets/wikitext2-detokenized.yaml 10 | - models/gpt2-micro.yaml 11 | - trainers/gpt2-small.yaml 12 | 13 | # Run ID -- make sure to override! 14 | run_id: null 15 | 16 | # Weights & Biases 17 | wandb: hello-world 18 | group: gpt2-small 19 | 20 | # Artifacts & Caching 21 | artifacts: 22 | cache_dir: 23 | run_dir: 24 | 25 | # Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! 26 | effective_bsz: 16 27 | 28 | # Resume from Checkpoint 29 | resume: false 30 | resume_checkpoint: null 31 | 32 | # List of frequencies at which to save checkpoints, provided as a list of two-element tuples: 33 | # - Frequency (`freq`) at which to save checkpoints (# steps) 34 | # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) 35 | checkpoint_frequency: 36 | - [2, 18] 37 | - [10, 100] 38 | - [50, 2000] 39 | - [100, 20000] 40 | - [1000, 400000] 41 | 42 | # `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` 43 | local_rank: -1 44 | nnodes: -1 45 | nproc_per_node: -1 46 | 47 | # DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` 48 | num_gpus: -1 49 | num_nodes: -1 50 | world_size: -1 51 | 52 | # Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL 53 | log_level: 20 54 | 55 | # Random Seed 56 | seed: 21 57 | 58 | online_eval: 59 | do_wikitext: false 60 | do_lambada: false 61 | stride: 256 62 | -------------------------------------------------------------------------------- /tests/conf/trainers/gpt2-small-diff.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small-diff.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 
5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | 8 | inherit: 9 | - gpt2-small.yaml 10 | 11 | training_arguments: 12 | # Learning Rate & Optimization Parameters, assumes AdamW 13 | weight_decay: 0.2 14 | adam_beta1: 0.7 15 | adam_beta2: 0.3 16 | 17 | # Gradient Norm 18 | max_grad_norm: 2.0 19 | 20 | # Maximum Training Steps (Overrides epochs!) 21 | max_steps: 100000 22 | -------------------------------------------------------------------------------- /tests/conf/trainers/gpt2-small.yaml: -------------------------------------------------------------------------------- 1 | # gpt2-small.yaml 2 | # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation). 3 | # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this 4 | # continues to stay valid! 5 | # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 6 | --- 7 | training_arguments: 8 | # Overwrite from Top-Level Config 9 | output_dir: null 10 | 11 | # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args... 12 | do_train: true 13 | evaluation_strategy: steps 14 | 15 | # Set these based on GPU RAM/your available hardware 16 | per_device_train_batch_size: 8 17 | per_device_eval_batch_size: 16 18 | 19 | # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)] 20 | gradient_accumulation_steps: null 21 | 22 | # For Online Evaluation, only keep around the Losses 23 | prediction_loss_only: true 24 | 25 | # Learning Rate & Optimization Parameters, assumes AdamW 26 | learning_rate: 0.0006 27 | weight_decay: 0.1 28 | adam_beta1: 0.9 29 | adam_beta2: 0.95 30 | adam_epsilon: 1.0e-8 31 | 32 | # Gradient Norm 33 | max_grad_norm: 1.0 34 | 35 | # Maximum Training Steps (Overrides epochs!) 36 | max_steps: 400000 37 | 38 | # LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio) 39 | lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!) 40 | warmup_steps: 4000 41 | 42 | # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime! 43 | run_name: null 44 | logging_dir: null 45 | logging_first_step: true 46 | logging_steps: 50 47 | 48 | # Saving and Evaluation Steps 49 | eval_steps: 1000 50 | save_steps: 1000 51 | 52 | # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging) 53 | ignore_data_skip: false 54 | 55 | # Seeds -- Should be Overwritten at Runtime! 56 | seed: null 57 | 58 | ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config 59 | fp16: true 60 | sharded_ddp: null 61 | deepspeed: null 62 | 63 | # Dataloader Parallelism 64 | dataloader_num_workers: 0 65 | 66 | # Should be overwritten from the Top-Level Config or CLI! 
67 | local_rank: null 68 | -------------------------------------------------------------------------------- /tests/run_deepspeed_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | tests = [x for x in os.listdir(".") if x.startswith("test") and x.endswith("py")] 7 | 8 | errors = 0 9 | for test in tests: 10 | # clean up if necessary 11 | for log_path in ["test.out", "test.err", "test.log"]: 12 | if os.path.exists(log_path): 13 | os.remove(log_path) 14 | # run tests 15 | try: 16 | print("Running test:", test) 17 | subprocess.check_call( 18 | f"CUDA_VISIBLE_DEVICES=0,1 deepspeed --num_gpus 2 --num_nodes 1 {test}", 19 | shell=True, 20 | ) 21 | except Exception: 22 | errors += 1 23 | if os.path.exists("test.log"): 24 | subprocess.call("cat test.log", shell=True) 25 | print("") 26 | 27 | for log_path in ["test.out", "test.err", "test.log"]: 28 | if os.path.exists(log_path): 29 | os.remove(log_path) 30 | 31 | if errors > 0: 32 | sys.exit(1) 33 | -------------------------------------------------------------------------------- /tests/setup/pip-requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/mistral/bf9eff08e83f4d5703b69dfcb6c18e8e35a00a6d/tests/setup/pip-requirements.txt -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | 3 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 4 | 5 | 6 | # paths 7 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 8 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 9 | 10 | TRAIN_ARGS = { 11 | "nnodes": "1", 12 | "nproc_per_node": "1", 13 | "config": "conf/train.yaml", 14 | "training_arguments.fp16": "false", 15 | "training_arguments.per_device_train_batch_size": "1", 16 | "artifacts.cache_dir": CACHE_DIR, 17 | "log_level": "50", 18 | "run_training": "false", 19 | "run_final_eval": "false", 20 | } 21 | 22 | TRAIN_ARGS_DIFF = copy(TRAIN_ARGS) 23 | TRAIN_ARGS_DIFF["config"] = "conf/train-diff.yaml" 24 | 25 | trainer_w_train = None 26 | trainer_w_train_diff = None 27 | 28 | 29 | def setup_module() -> None: 30 | global trainer_w_train, trainer_w_train_diff 31 | trainer_w_train = run_train_process(cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id="train_args_test") 32 | trainer_w_train_diff = run_train_process( 33 | cl_args_dict=TRAIN_ARGS_DIFF, runs_dir=RUNS_DIR, run_id="train_args_diff_test" 34 | ) 35 | 36 | 37 | def test_train_args() -> None: 38 | assert trainer_w_train.args.weight_decay == 0.1 39 | assert trainer_w_train.args.adam_beta1 == 0.9 40 | assert trainer_w_train.args.adam_beta2 == 0.95 41 | assert trainer_w_train.args.max_grad_norm == 1.0 42 | assert trainer_w_train_diff.args.weight_decay == 0.2 43 | assert trainer_w_train_diff.args.adam_beta1 == 0.7 44 | assert trainer_w_train_diff.args.adam_beta2 == 0.3 45 | assert trainer_w_train_diff.args.max_grad_norm == 2.0 46 | 47 | 48 | if __name__ == "__main__": 49 | run_tests() 50 | -------------------------------------------------------------------------------- /tests/test_eval_loss_is_defined.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy 4 | 5 | from src.core import OnlineBenchmarkTrainer 6 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 7 | 8 | 9 | # paths 10 | 
CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 11 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 12 | 13 | TRAIN_ARGS = { 14 | "config": "conf/train.yaml", 15 | "training_arguments.fp16": "false", 16 | "training_arguments.per_device_train_batch_size": "1", 17 | "artifacts.cache_dir": CACHE_DIR, 18 | "log_level": "50", 19 | "run_training": "true", 20 | "training_arguments.max_steps": "2", # just enough steps so HF doesn't complain about using zero 2 for inference 21 | "run_final_eval": "false", 22 | } 23 | 24 | trainer: OnlineBenchmarkTrainer = None 25 | metrics: dict = None 26 | 27 | 28 | def setup_module() -> None: 29 | global trainer, metrics 30 | trainer, metrics = run_train_process( 31 | cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id="train_eval_loss_is_defined", also_evaluate=True 32 | ) 33 | 34 | 35 | def test_train_args() -> None: 36 | assert any(numpy.isfinite(v) and re.match("eval.*loss", k) for k, v in metrics.items()) 37 | 38 | 39 | if __name__ == "__main__": 40 | run_tests() 41 | -------------------------------------------------------------------------------- /tests/test_fp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.cuda 3 | 4 | from tests import MISTRAL_TEST_DIR, run_tests, run_train_process 5 | 6 | 7 | # common paths and resources for tests 8 | 9 | # paths 10 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 11 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 12 | RUN_ID = "upcasting_test" 13 | RUN_ID_DIR = f"{RUNS_DIR}/{RUN_ID}" 14 | 15 | # run training processes for tests 16 | TRAIN_ARGS = { 17 | "nnodes": "1", 18 | "nproc_per_node": "1", 19 | "config": "conf/train.yaml", 20 | "training_arguments.fp16": "true", 21 | "training_arguments.max_steps": "4", 22 | "artifacts.cache_dir": CACHE_DIR, 23 | "run_training": "true", 24 | "run_final_eval": "false", 25 | "log_level": "50", 26 | } 27 | 28 | 29 | def setup_module() -> None: 30 | global basic_trainer 31 | try: 32 | basic_trainer = run_train_process(cl_args_dict=TRAIN_ARGS, runs_dir=RUNS_DIR, run_id=RUN_ID) 33 | except Exception: 34 | basic_trainer = None 35 | 36 | 37 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="need cuda for fp16") 38 | def test_upcasting() -> None: 39 | """ 40 | Run training with upcasting 41 | """ 42 | assert basic_trainer is not None 43 | 44 | 45 | if __name__ == "__main__": 46 | run_tests() 47 | -------------------------------------------------------------------------------- /tests/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | from typing import Iterator 4 | 5 | from transformers import BatchEncoding 6 | 7 | from src.corpora.indexer import IndexedDataset 8 | 9 | 10 | def test_can_move_dataset_cache(): 11 | def token_iterator() -> Iterator[BatchEncoding]: 12 | for i in range(0, 100): 13 | yield BatchEncoding({"input_ids": [[i] * (i + 1)]}) 14 | 15 | with tempfile.TemporaryDirectory() as tempdir: 16 | orig_cache = tempdir + "/orig" 17 | orig_ds = IndexedDataset.build_or_load(token_iterator(), orig_cache, seq_len=5, stride=1) 18 | 19 | new_cache = tempdir + "/new" 20 | # copy the cache 21 | shutil.copytree(orig_cache, new_cache) 22 | 23 | new_ds = IndexedDataset(new_cache, seq_len=5, stride=1) 24 | 25 | assert list(orig_ds) == list(new_ds) 26 | -------------------------------------------------------------------------------- /tests/test_online_benchmark_trainer.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from transformers import TrainingArguments 4 | 5 | 6 | try: 7 | from torchdata.datapipes.iter import IterDataPipe 8 | except ImportError: 9 | from torch.utils.data import IterDataPipe 10 | 11 | from src.core.trainer import OnlineBenchmarkTrainer 12 | 13 | 14 | def test_ob_trainer_different_processes_different_data(): 15 | class DummyModel(object): 16 | def __init__(self, *args, **kwargs): 17 | pass 18 | 19 | def __call__(self, *args, **kwargs): 20 | return None 21 | 22 | def to(self, *args, **kwargs): 23 | return self 24 | 25 | def forward(self, *args, **kwargs): 26 | return None 27 | 28 | class FakeTrainingArguments(TrainingArguments): 29 | def __init__(self, process_index): 30 | self._process_index = process_index 31 | 32 | @property 33 | def should_save(self): 34 | return False 35 | 36 | @property 37 | def world_size(self): 38 | return 2 39 | 40 | @property 41 | def process_index(self): 42 | return self._process_index 43 | 44 | def get_process_log_level(self): 45 | return logging.INFO 46 | 47 | @property 48 | def report_to(self): 49 | return [] 50 | 51 | @property 52 | def max_steps(self): 53 | return 100 54 | 55 | class FakeTrainDataset(IterDataPipe): 56 | def __init__(self): 57 | pass 58 | 59 | def __iter__(self): 60 | for i in range(128): 61 | yield {"input_ids": [i] * 3, "labels": [i]} 62 | 63 | """Test that online benchmark trainer gives different data to different processes.""" 64 | trainer1 = OnlineBenchmarkTrainer( 65 | model=DummyModel(), # type: ignore 66 | args=FakeTrainingArguments(0), 67 | train_dataset=FakeTrainDataset(), 68 | ) 69 | 70 | trainer2 = OnlineBenchmarkTrainer( 71 | model=DummyModel(), # type: ignore 72 | args=FakeTrainingArguments(1), 73 | train_dataset=FakeTrainDataset(), 74 | ) 75 | 76 | d1 = list(trainer1.get_train_dataloader()) 77 | d2 = list(trainer2.get_train_dataloader()) 78 | 79 | # data is List[Dict[str, Tensor2]] 80 | # we have to convert to List[List[int]] to compare 81 | d1 = [[[y.item() for y in x] for x in d["input_ids"]] for d in d1] 82 | d2 = [[[y.item() for y in x] for x in d["input_ids"]] for d in d2] 83 | 84 | assert d1 != d2 85 | -------------------------------------------------------------------------------- /tests/test_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests import MISTRAL_TEST_DIR, check_samples_equal, get_samples, run_tests, run_train_process 4 | 5 | 6 | # paths 7 | CACHE_DIR = f"{MISTRAL_TEST_DIR}/artifacts" 8 | RUNS_DIR = f"{MISTRAL_TEST_DIR}/runs" 9 | RUN_ID = "train_args_test" 10 | RUN_ID_DIR = f"{RUNS_DIR}/{RUN_ID}" 11 | 12 | # set up different trainers to see initialization differences 13 | TRAIN_ARGS_SEED_7 = { 14 | "nnodes": "1", 15 | "nproc_per_node": "1", 16 | "config": "conf/train.yaml", 17 | "training_arguments.fp16": "false", 18 | "training_arguments.per_device_train_batch_size": "1", 19 | "artifacts.cache_dir": CACHE_DIR, 20 | "seed": "7", 21 | "log_level": "50", 22 | "run_training": "false", 23 | "run_final_eval": "false", 24 | } 25 | 26 | TRAIN_ARGS_SEED_10 = dict(TRAIN_ARGS_SEED_7) 27 | TRAIN_ARGS_SEED_10["seed"] = "10" 28 | 29 | trainer_seed_7 = None 30 | trainer_seed_10 = None 31 | trainer_seed_7_copy = None 32 | 33 | 34 | def setup_module() -> None: 35 | global trainer_seed_7, trainer_seed_10, trainer_seed_7_copy 36 | trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 
37 | trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 38 | trainer_seed_7_copy = run_train_process( 39 | cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7_copy" 40 | ) 41 | 42 | 43 | def is_randomized(key): 44 | """ 45 | Helper to determine if the key in the state_dict() is a set of parameters that is randomly initialized. 46 | Some weights are not randomly initalized and won't be afffected by seed, particularly layer norm 47 | weights and biases, and bias terms in general. 48 | """ 49 | # regexes for components that are not randomized 50 | if key.endswith("bias") or "ln" in key: 51 | return False 52 | else: 53 | return True 54 | 55 | 56 | def test_weight_initializations() -> None: 57 | # trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 58 | # trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 59 | 60 | assert trainer_seed_7.model.state_dict().keys() == trainer_seed_10.model.state_dict().keys() 61 | for key in trainer_seed_7.model.state_dict().keys(): 62 | if is_randomized(key): 63 | assert not torch.equal( 64 | trainer_seed_7.model.state_dict()[key], trainer_seed_10.model.state_dict()[key] 65 | ), f"weights are the same for {key}" 66 | 67 | 68 | def test_data_order() -> None: 69 | # trainer_seed_7 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_7, runs_dir=RUNS_DIR, run_id="trainer_seed_7") 70 | # trainer_seed_10 = run_train_process(cl_args_dict=TRAIN_ARGS_SEED_10, runs_dir=RUNS_DIR, run_id="trainer_seed_10") 71 | 72 | seed_7_dataloader = trainer_seed_7.get_train_dataloader() 73 | seed_10_dataloader = trainer_seed_10.get_train_dataloader() 74 | 75 | seed_7_data, seed_10_data = get_samples(seed_7_dataloader), get_samples(seed_10_dataloader) 76 | 77 | seed_7_copy_dataloader = trainer_seed_7_copy.get_train_dataloader() 78 | seed_7_copy_data = get_samples(seed_7_copy_dataloader) 79 | 80 | assert check_samples_equal(seed_7_copy_data, seed_7_data), "data is not the same" 81 | assert not check_samples_equal(seed_10_data, seed_7_data), "data order should be different for different seeds" 82 | 83 | 84 | if __name__ == "__main__": 85 | run_tests() 86 | -------------------------------------------------------------------------------- /tests/test_valid_configs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | from unittest.mock import patch 4 | 5 | from quinine.common.argparse import QuinineArgumentParser 6 | 7 | import tests 8 | from conf.train_schema import get_schema 9 | 10 | 11 | expected_manual_args = [ 12 | "--artifacts.cache_dir", 13 | "artifacts/cache", 14 | "--artifacts.run_dir", 15 | "artifacts/runs", 16 | ] 17 | 18 | 19 | def validate_config(config_file): 20 | try: 21 | cl_args = ["--config", str(config_file)] + expected_manual_args 22 | with patch.object(sys, "argv", ["foo.py"] + cl_args): 23 | QuinineArgumentParser(schema=get_schema()).parse_quinfig() 24 | except Exception as e: 25 | raise Exception(f"{config_file} is not valid: {e}") from e 26 | 27 | 28 | def test_all_test_configs_are_valid(): 29 | # test all the yaml files in the main mistral conf directory, and in the mistral tests/conf directory 30 | test_root = pathlib.Path(tests.__file__).parent.absolute() 31 | 32 | for path in pathlib.Path(test_root).glob("conf/*.yaml"): 33 | validate_config(path) 34 | 35 | 36 | def 
test_all_real_configs_are_valid(): 37 | # test all the yaml files in the main mistral conf directory, and in the mistral tests/conf directory 38 | mistral_root = pathlib.Path(tests.__file__).parent.parent.absolute() 39 | 40 | for path in pathlib.Path(mistral_root).glob("conf/*.yaml"): 41 | validate_config(path) 42 | 43 | 44 | if __name__ == "__main__": 45 | tests.run_tests() 46 | -------------------------------------------------------------------------------- /tutorials/custom-dataset/README.md: -------------------------------------------------------------------------------- 1 | # Train On Custom Dataset 2 | 3 | ## Create Directory With Your Text 4 | 5 | Put text into `*.jsonl` files, one document per line. 6 | 7 | ``` 8 | {"text": "Document one ..."} 9 | {"text": "Document two ..."} 10 | ... 11 | ``` 12 | 13 | You can have arbitrarily many files. Files matching `*train*` will be used as 14 | training data and files with `*validation*` will be used as validation data. 15 | 16 | For example, if you are training on PubMed data, you would have something like 17 | this: 18 | 19 | ``` 20 | /path/to/pubmed_local 21 | pubmed_train.jsonl 22 | pubmed_validation.jsonl 23 | ``` 24 | 25 | Each line of those files would be a document in the format described above. 26 | An example of a custom dataset can be found at 27 | `tutorials/custom-dataset/shakespeare`. 28 | 29 | 30 | ## Set up dataset config file in `conf/datasets` 31 | 32 | In the dataset config file, specify the number of workers you need to 33 | process the data and the path to the custom dataset on your machine. 34 | 35 | An example config file for the Shakespeare dataset is at 36 | `conf/datasets/shakespeare.yaml`. 37 | 38 | 39 | ## Specify Your New Dataset In The Overall Experiment Config 40 | 41 | Remember to specify this dataset in your overall experiment config. This is 42 | typically done at the top in the inherit section. For example, 43 | 44 | ``` 45 | # Inherit Dataset, Tokenization, Model, and Training Details 46 | inherit: 47 | - datasets/pubmed_local.yaml 48 | - models/mistral-small.yaml 49 | - trainers/gpt2-small.yaml 50 | ``` 51 | 52 | An example of the config file can be found at 53 | `conf/tutorial-shakespeare-gpt2-micro.yaml`. We train a GPT-2 micro 54 | (~11m parameters) model on Shakespeare text for that example. 55 | -------------------------------------------------------------------------------- /tutorials/gcp-on-demand/README.md: -------------------------------------------------------------------------------- 1 | # Run Mistral On GCP (on demand) 2 | 3 | ## Create An A100 With 8 GPU 4 | 5 | Go to the VM instances page and click "Create Instance" 6 | 7 | Give it an informative name (e.g. "mistral-gcp-demo") 8 | 9 | Choose `europe-west4 (Netherlands)` as the zone 10 | 11 | Select GPU machine, and choose NVIDIA Tesla A100, with 8 GPUs 12 | 13 | Customize the Boot disk OS to "Deep Learning on Linux"/"Debian 10 based Deep Learning VM with CUDA 11.3 M93" 14 | 15 | Update the size to 1 TB (or whatever you feel you need) 16 | 17 | Hit "Create" ! 18 | 19 | Wait a few minutes, and then click the "SSH" button on the VM page. Hit "y" when asked to install drivers. 20 | 21 | At this point the machine should be set up and operational. Run `nvidia-smi` to confirm. 22 | 23 | 24 | ## Clone Mistral 25 | 26 | Clone the repo 27 | 28 | ``` 29 | git clone https://github.com/stanford-crfm/mistral.git 30 | ``` 31 | 32 | ## Create Mistral conda environment 33 | 34 | Follow the instructions on the main README for setting up the conda env. 
35 | 36 | Generally this will be: 37 | 38 | ``` 39 | cd setup 40 | bash setup.sh 41 | ``` 42 | 43 | ## Set Up WandB 44 | 45 | ``` 46 | cd mistral 47 | wandb login # enter your API key at the prompt 48 | wandb init 49 | mkdir /home/username/data # create directory for storing runs and artifacts 50 | ``` 51 | 52 | ## Modify Config File 53 | 54 | Alter the config file in `conf/gpt2-small.yaml` to customize the datasets you use. 55 | 56 | In particular, update the `artifacts` entry: 57 | 58 | ``` 59 | artifacts: 60 | cache_dir: /home/username/data/artifacts 61 | run_dir: /home/username/data/runs 62 | ``` 63 | 64 | ## Launch The Training Run 65 | 66 | This command launches the training run with DeepSpeed: 67 | 68 | ``` 69 | deepspeed --num_gpus 8 --num_nodes 1 --master_addr localhost train.py --config conf/gpt2-small.yaml --nnodes 1 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id mistral-june22-demo 70 | ``` 71 | --------------------------------------------------------------------------------
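For the custom-dataset format described in `tutorials/custom-dataset/README.md`, the `*.jsonl` files can be produced with a few lines of Python. The following is a minimal sketch rather than a repository script: the output directory, file names, and document list are placeholders, and only the one-record-per-line `{"text": ...}` layout is prescribed by the tutorial.

```python
import json
from pathlib import Path
from typing import Iterable


def write_jsonl(path: Path, documents: Iterable[str]) -> None:
    """Write one {"text": ...} record per line, matching the custom-dataset format."""
    with path.open("w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps({"text": doc}) + "\n")


# Hypothetical output location and contents -- substitute your own corpus.
out_dir = Path("/path/to/pubmed_local")
out_dir.mkdir(parents=True, exist_ok=True)
write_jsonl(out_dir / "pubmed_train.jsonl", ["Document one ...", "Document two ..."])
write_jsonl(out_dir / "pubmed_validation.jsonl", ["Held-out document ..."])
```

Files whose names match `*train*` are used as training data and files matching `*validation*` as validation data, so the naming above mirrors the PubMed example in the tutorial.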