├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── openchatkit-feedback-report.yaml ├── .gitignore ├── LICENSE ├── README.md ├── data ├── OIG-chip2 │ └── prepare.sh ├── OIG-moderation │ └── prepare.py ├── OIG │ └── prepare.py ├── prepare_data.py └── wikipedia-3sentence-level-retrieval-index │ └── prepare.py ├── docs ├── GPT-NeoXT-Chat-Base-20B.md └── finetuning-RedPajama-3B.md ├── environment.yml ├── inference ├── README.md ├── bot.py └── conversation.py ├── pretrained ├── GPT-NeoX-20B │ └── prepare.py ├── Llama-2-7B-32K-beta │ └── prepare.py ├── Pythia-6.9B-deduped │ └── prepare.py ├── RedPajama-3B │ └── prepare.py ├── RedPajama-7B │ └── prepare.py └── prepare_pretrained.py ├── retrieval ├── README.md ├── __init__.py └── wikipedia.py ├── tools ├── README.md ├── benchmark_input.json ├── convert_to_hf_gptneox.py ├── convert_to_hf_llama.py └── model_load_benchmark.py └── training ├── README.md ├── comm ├── __init__.py ├── comm_utils.py ├── nccl_backend.py └── torch_backend.py ├── data_parallel ├── __init__.py ├── dist_dp_allreduce.py ├── dist_dp_central_ps.py ├── dist_dp_local.py ├── dist_dp_sharded_ps.py ├── dist_dp_utils.py └── flatten_utils.py ├── dist_clm_train.py ├── dist_prefixlm_train.py ├── finetune_GPT-NeoXT-Chat-Base-20B.sh ├── finetune_Pythia-Chat-Base-7B.sh ├── finetune_RedPajama-INCITE-7B-Chat.sh ├── finetune_RedPajama-INCITE-Chat-3B-v1.sh ├── finetune_llama-2-7b-32k-booksum.sh ├── finetune_llama-2-7b-32k-mqa.sh ├── lora └── example │ ├── redpajama-incite-chat-3b.py │ └── redpajama-incite-chat-3b_inference.py ├── modules ├── __init__.py ├── deberta_modules.py ├── dist_deberta_pp_module.py ├── dist_gpt_fsdp_module.py ├── dist_gpt_pp_module.py ├── hf_gpt2_modules.py ├── hf_gptj_modules.py ├── hf_gptneox_modules.py ├── hf_opt_modules.py ├── llama_modules.py ├── task_modules.py ├── tokenizer.py └── utils.py ├── optimizer ├── __init__.py ├── grad_scalar.py └── optimizer.py ├── pipeline_parallel ├── __init__.py ├── dist_gpipe_pipeline_async.py └── dist_pp_utils.py ├── tasks ├── __init__.py └── data_loaders │ ├── __init__.py │ ├── data_utils.py │ └── prosocial.py └── utils ├── __init__.py ├── dist_args_utils.py ├── dist_checkpoint_utils.py ├── dist_debug_utils.py ├── event_report.py ├── logging_utils.py └── upload_manager.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/openchatkit-feedback-report.yaml: -------------------------------------------------------------------------------- 1 | name: OpenChatKit Feedback Report 2 | description: Details of feedback from using OpenChatKit test app 3 | title: OpenChatKit Feedback Report 4 | labels: "feedback report" 5 | assignees: [] 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this feedback report! 11 | - type: textarea 12 | id: my-question 13 | attributes: 14 | label: "My question:" 15 | validations: 16 | required: true 17 | - type: textarea 18 | id: bot-response 19 | attributes: 20 | label: "Bot response:" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: ideal-bot-response 25 | attributes: 26 | label: "Ideal bot response:" 27 | validations: 28 | required: true 29 | - type: checkboxes 30 | id: response-issues 31 | attributes: 32 | label: "Bot response was:" 33 | options: 34 | - label: Factually incorrect 35 | required: true 36 | - label: Not helpful 37 | required: true 38 | - label: Harmful, inappropriate or unsafe 39 | required: true 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # ignore downloaded files 132 | /data/OIG-moderation/files/ 133 | /data/OIG/files/ 134 | /data/wikipedia-3sentence-level-retrieval-index/files/ 135 | /pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/ 136 | /pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/ 137 | /pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 138 | 139 | # ignore training output 140 | /model_ckpts/ 141 | /huggingface_models/ 142 | /training/wandb/ 143 | 144 | # ignore trained low-rank adapters 145 | /outputs/ 146 | data/OIG-chip2/*.jsonl 147 | wandb/ -------------------------------------------------------------------------------- /data/OIG-chip2/prepare.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | wget https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl -O ${DIR}/unified_chip2.jsonl -------------------------------------------------------------------------------- /data/OIG-moderation/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/ontocord/OIG-moderation", dest_dir) 12 | -------------------------------------------------------------------------------- /data/OIG/prepare.py: -------------------------------------------------------------------------------- 1 | 
import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/laion/OIG", dest_dir) 12 | -------------------------------------------------------------------------------- /data/wikipedia-3sentence-level-retrieval-index/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index", dest_dir) 12 | -------------------------------------------------------------------------------- /docs/GPT-NeoXT-Chat-Base-20B.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoXT-Chat-Base-20B 2 | 3 | OpenChatKit includes an instruction-tuned 20 billion parameter language model called GPT-NeoXT-Chat-Base-20B, a 6 billion parameter moderation model, and an extensible retrieval system for including up-to-date responses from custom repositories. It was trained on the OIG-43M training dataset, which was a collaboration between [Together](https://www.together.xyz/), [LAION](https://laion.ai), and [Ontocord.ai](https://ontocord.ai). Much more than a model release, this is the beginning of an open source project. We are releasing a set of tools and processes for ongoing improvement with community contributions. 4 | 5 | In this doc, you'll find steps for: 6 | - Training an OpenChatKit model 7 | - Testing inference using the model 8 | - Augmenting the model with additional context from a retrieval index 9 | 10 | # Contents 11 | 12 | - [Requirements](#requirements) 13 | - [Pre-trained Weights](#pre-trained-weights) 14 | - [Datasets](#datasets) 15 | * [Data Contributions](#data-contributions) 16 | - [Pretrained Base Model](#pretrained-base-model) 17 | - [Training and Finetuning](#training-and-finetuning) 18 | * [(Optional) 8bit Adam](#optional-8bit-adam) 19 | * [Train GPT-NeoX-Chat-Base-20B](#train-gpt-neox-chat-base-20b) 20 | - [Converting Weights to Huggingface Format](#converting-weights-to-huggingface-format) 21 | - [Inference](#inference) 22 | - [Monitoring](#monitoring) 23 | * [Loguru](#loguru) 24 | * [Weights & Biases](#weights--biases) 25 | - [Experimental: Retrieval-Augmented Models](#experimental-retrieval-augmented-models) 26 | - [Acknowledgements](#acknowledgements) 27 | 28 | # Requirements 29 | 30 | Before you begin, you need to install PyTorch and other dependencies. 31 | 32 | 1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) from their website. 33 | 34 | 2. Install [Git LFS](https://git-lfs.com/) from their website. 35 | 36 | 3. Install the `git lfs` hooks. 37 | 38 | ```shell 39 | git lfs install 40 | ``` 41 | 42 | 4. Install mamba in the `base` environment so it's available in all environments. 43 | 44 | ```shell 45 | conda install mamba -n base -c conda-forge 46 | ``` 47 | 48 | 5. Create an environment called OpenChatKit using the `environment.yml` file at the root of this repo. 
49 | 50 | ```shell 51 | mamba env create -f environment.yml 52 | ``` 53 | 54 | 6. Activate the new conda environment. 55 | 56 | ```shell 57 | conda activate OpenChatKit 58 | ``` 59 | 60 | # Pre-trained Weights 61 | 62 | GPT-NeoXT-Chat-Base-20B is a 20B-parameter variant of GPT-NeoX, fine-tuned on conversational datasets. We are releasing pre-trained weights for this model as [togethercomputer/GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. 63 | 64 | More details can be found on the model card for [GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. 65 | 66 | # Datasets 67 | 68 | The chat model was trained on the [OIG](https://huggingface.co/datasets/laion/OIG) dataset built by [LAION](https://laion.ai/), [Together](https://www.together.xyz/), and [Ontocord.ai](https://www.ontocord.ai/). To download the dataset from Huggingface run the command below from the root of the repo. 69 | 70 | ```shell 71 | python data/OIG/prepare.py 72 | ``` 73 | 74 | Once the command completes, the data will be in the `data/OIG/files` directory. 75 | 76 | ## Data Contributions 77 | 78 | You can help make this chat model better by contributing data! See the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repo for more details. 79 | 80 | # Pretrained Base Model 81 | 82 | As mentioned above, the chat model is a fine-tuned variant of GPT-NeoX-20B from Eleuther AI. To download GPT-NeoX-20B and prepare it for fine tuning, run this command from the root of the repo. 83 | 84 | ```shell 85 | python pretrained/GPT-NeoX-20B/prepare.py 86 | ``` 87 | 88 | The weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b`. 89 | 90 | In case you want to fine-tune other gpt-neox models, e.g. [the Pythia model suite](https://huggingface.co/models?sort=downloads&search=pythia), you can specify the HF model name, for example: 91 | 92 | ```shell 93 | python pretrained/GPT-NeoX-20B/prepare.py --model-name EleutherAI/pythia-6.9b-deduped 94 | ``` 95 | 96 | And the weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped`. 97 | 98 | 99 | # Training and Finetuning 100 | 101 | ## (Optional) 8bit Adam 102 | 103 | To use 8bit-adam during training, install the `bitsandbytes` package. 104 | 105 | ```shell 106 | pip install bitsandbytes # optional, to use 8bit-adam 107 | ``` 108 | 109 | ## Train GPT-NeoX-Chat-Base-20B 110 | 111 | The `training/finetune_GPT-NeoXT-Chat-Base-20B.sh` script configures and runs the training loop. After downloading the dataset and the base model, run: 112 | 113 | ```shell 114 | bash training/finetune_GPT-NeoXT-Chat-Base-20B.sh 115 | ``` 116 | 117 | The script launches 8 processes with a pipeline-parallel degree of 8 and a data-parallel degree of 1. 118 | 119 | As the training loop runs, checkpoints are saved to the `model_ckpts` directory at the root of the repo. 120 | 121 | Please see [the training README](training/README.md) for more details about customizing the training run. 122 | 123 | The `training/finetune_Pythia-Chat-Base-7B.sh` script is another example to fine-tune a 7B pythia (gpt-neox) model. The script launches 8 processes with a pipeline-parallel degree of 4 and a data-parallel degree of 2. 124 | 125 | # Converting Weights to Huggingface Format 126 | 127 | Before you can use this model to perform inference, it must be converted to the Huggingface format. Run this command from the root of the repo to do so. 
128 | 129 | ```shell 130 | mkdir huggingface_models \ 131 | && python tools/convert_to_hf_gptneox.py \ 132 | --ckpt-path model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100 \ 133 | --save-path huggingface_models/GPT-NeoXT-Chat-Base-20B \ 134 | --n-stages 8 \ 135 | --n-layer-per-stage 6 \ 136 | --fp16 137 | ``` 138 | where the `--fp16` flag will load and store models in fp16. 139 | 140 | Make sure to replace `model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100` with the latest checkpoint in the `model_ckpts/GPT-Neo-XT-Chat-Base-20B` directory. 141 | 142 | If you need to convert checkpoints of other gpt-neox variants, make sure to specify the correct config name for your variant. 143 | For example, if you want to convert a checkpoint fine-tuned from `EleutherAI/pythia-6.9b-deduped`, you should pass it as the config name: 144 | ```shell 145 | python tools/convert_to_hf_gptneox.py \ 146 | --config-name EleutherAI/pythia-6.9b-deduped \ 147 | --ckpt-path model_ckpts/Pythia-Chat-Base-7B/checkpoint_100 \ 148 | --save-path huggingface_models/Pythia-Chat-Base-7B \ 149 | --n-stages 4 \ 150 | --n-layer-per-stage 8 \ 151 | --fp16 152 | ``` 153 | 154 | 155 | # Inference 156 | 157 | To help you test the model, we provide a simple command-line test harness to interact with the bot. 158 | 159 | ```shell 160 | python inference/bot.py 161 | ``` 162 | 163 | By default, the script will load the GPT-NeoXT-Chat-Base-20B model under the `huggingface_models` directory, but you can override that behavior by specifying `--model`. 164 | 165 | For example, if you want to load the base model from our Huggingface repo, you can run the following command, which downloads the weights from HuggingFace. 166 | 167 | ```shell 168 | python inference/bot.py --model togethercomputer/GPT-NeoXT-Chat-Base-20B 169 | ``` 170 | 171 | Once the model has loaded, enter text at the prompt and the model will reply. 172 | 173 | ```shell 174 | $ python inference/bot.py 175 | Loading /home/csris/src/github.com/togethercomputer/OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:1... 176 | Welcome to OpenChatKit shell. Type /help or /? to list commands. 177 | 178 | >>> Hello. 179 | Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. 180 | Hello human. 181 | 182 | >>> 183 | ``` 184 | 185 | Commands are prefixed with a `/`, and the `/quit` command exits. 186 | 187 | Please see [the inference README](inference/README.md) for more details about arguments, running on multiple/specific GPUs, and running on consumer hardware. 188 | 189 | # Monitoring 190 | 191 | By default, the training script simply prints the loss as training proceeds, but it can also output metrics to a file using [loguru](https://github.com/Delgan/loguru) or report them to Weights & Biases. 192 | 193 | ## Loguru 194 | 195 | Add the flag `--train-log-backend loguru` to your training script to log to `./logs/file_{time}.log`. 196 | 197 | ## Weights & Biases 198 | 199 | To use Weights & Biases, first log in with your Weights & Biases token. 200 | 201 | ```shell 202 | wandb login 203 | ``` 204 | 205 | Then set `--train-log-backend wandb` in the training script to enable logging to Weights & Biases. 206 | 207 | # Experimental: Retrieval-Augmented Models 208 | 209 | *Note: Retrieval is still experimental.* 210 | 211 | The code in `/retrieval` implements a Python package for querying a Faiss index of Wikipedia. The following steps explain how to use this index to augment queries in the test harness with context from the retriever.
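For reference, the test harness injects retrieved context by pushing the top search result onto the conversation as an extra human turn before your question (see `OpenChatKitShell.do_say` in `inference/bot.py` and `Conversation.push_context_turn` in `inference/conversation.py`). The sketch below is illustrative only; `build_prompt` is a hypothetical helper, and the tag strings are placeholders for the model's actual human/bot IDs.

```python
# Illustrative sketch of how a retrieved snippet ends up in the prompt.
# The real logic lives in inference/bot.py and inference/conversation.py;
# build_prompt and the tag defaults below are placeholders, not project APIs.

def build_prompt(context_snippets, query, human_id="<human>", bot_id="<bot>"):
    prompt = ""
    if context_snippets:
        # the retrieved passage is represented as an extra human turn
        prompt += f"{human_id}: {context_snippets[0]}\n"
    prompt += f"{human_id}: {query}\n{bot_id}:"
    return prompt

print(build_prompt(["Zurich is the largest city in Switzerland."], "Where is Zurich?"))
```

The steps below walk through downloading the index and enabling retrieval in the bot.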
212 | 213 | 1. Download the Wikipedia index. 214 | 215 | ```shell 216 | python data/wikipedia-3sentence-level-retrieval-index/prepare.py 217 | ``` 218 | 219 | 2. Run the bot with the `--retrieval` flag. 220 | 221 | ```shell 222 | python inference/bot.py --retrieval 223 | ``` 224 | 225 | After starting, the bot will load both the chat model and the retrieval index, which takes a long time. Once the model and the index are loaded, all queries will be augmented with extra context. 226 | 227 | 228 | ```shell 229 | $ python inference/bot.py --retrieval 230 | Loading /OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:0... 231 | Loading retrieval index... 232 | Welcome to OpenChatKit shell. Type /help or /? to list commands. 233 | 234 | >>> Where is Zurich? 235 | Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. 236 | Where is Zurich? 237 | Zurich is located in Switzerland. 238 | 239 | >>> 240 | ``` 241 | 242 | # Acknowledgements 243 | 244 | Our model is a fine-tuned version of [gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b), a large language model trained by [Eleuther AI](https://www.eleuther.ai). We evaluated our model on [HELM](https://crfm.stanford.edu/helm/latest/) provided by the [Center for Research on Foundation Models](https://crfm.stanford.edu), and we collaborated with both [CRFM](https://crfm.stanford.edu) and [HazyResearch](http://hazyresearch.stanford.edu) at Stanford to build this model. 245 | 246 | We collaborated with [LAION](https://laion.ai/) and [Ontocord.ai](https://www.ontocord.ai/) to build the training data used to fine-tune this model. 247 | -------------------------------------------------------------------------------- /docs/finetuning-RedPajama-3B.md: -------------------------------------------------------------------------------- 1 | # RedPajama-3B 2 | 3 | In this tutorial, you will learn how to fine-tune a base LLM on a sample of data. By the end of 4 | the tutorial, you will have fine-tuned the RedPajama-INCITE-Chat-3B model using a sample of 5 | chat data from the OIG dataset. You can adapt this tutorial to fine-tune with your own data. 6 | 7 | In order to fine-tune the RedPajama 3B models, please follow these steps: 8 | 9 | First clone the OpenChatKit repo: 10 | 11 | ```shell 12 | git clone git@github.com:togethercomputer/OpenChatKit.git 13 | ``` 14 | 15 | Next, install dependencies as instructed by the OpenChatKit repo. 16 | 17 | # Prepare Weights 18 | 19 | ```shell 20 | python pretrained/RedPajama-3B/prepare.py 21 | ``` 22 | 23 | This script will download the weights from HuggingFace and prepare them for finetuning. The prepared weights will be saved at 24 | 25 | ``` 26 | pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 27 | ``` 28 | 29 | # Prepare Fine Tuning Data 30 | 31 | We now need to prepare the training data. We provide an example script that downloads a small slice of data from OIG. 32 | To download this sample dataset, please run: 33 | 34 | ``` 35 | bash data/OIG-chip2/prepare.sh 36 | ``` 37 | 38 | The sample dataset will be saved at 39 | 40 | ``` 41 | data/OIG-chip2/unified_chip2.jsonl 42 | ``` 43 | 44 | # Run Fine Tuning Script 45 | 46 | We provide an example training script. Please configure the parameters (e.g., learning_rate, batch_size, dataset_path) according to your hardware configuration.
47 | Then to start training, simply run: 48 | 49 | ``` 50 | bash training/finetune_RedPajama-INCITE-Chat-3B-v1.sh 51 | ``` 52 | 53 | # Convert to Huggingface Format 54 | 55 | The fine-tuned model will be saved to 56 | 57 | ``` 58 | model_ckpts/rp-incite-chat-3b-finetuned/checkpoint_{steps} 59 | ``` 60 | 61 | In order to use it for inference, you will need to convert it to the HuggingFace format. To do so, run the following script 62 | (as an example; please change the checkpoint path, n-stages, and n-layer-per-stage according to the training script): 63 | 64 | The default for n-stages used in the training script is 10 and the n-layer-per-stage is 8. 65 | 66 | ``` 67 | python tools/convert_to_hf_gptneox.py --config-name togethercomputer/RedPajama-INCITE-Chat-3B-v1 --ckpt-path model_ckpts/redpajama-incite-chat-3b-sample/checkpoint_10/ --save-path model_ckpts/hf --n-stages 4 --n-layer-per-stage 8 68 | ``` 69 | 70 | Then you are ready to go! You can load the model with HuggingFace and use it for inference, for example: 71 | 72 | ```python 73 | import torch 74 | import transformers 75 | from transformers import AutoTokenizer, AutoModelForCausalLM 76 | 77 | tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") 78 | model = AutoModelForCausalLM.from_pretrained("./model_ckpts/hf", torch_dtype=torch.float16) 79 | model = model.to('cuda:0') 80 | 81 | prompt = "<human>: Who is Alan Turing?\n<bot>:" 82 | inputs = tokenizer(prompt, return_tensors='pt').to(model.device) 83 | input_length = inputs.input_ids.shape[1] 84 | outputs = model.generate( 85 | **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True 86 | ) 87 | token = outputs.sequences[0, input_length:] 88 | output_str = tokenizer.decode(token) 89 | print(output_str) 90 | 91 | ``` 92 | 93 | Please note that the above finetuning takes around 60GB of VRAM to fit everything into GPU memory, and may take even more to fit the training data. If you do not have such GPUs, we also provide a low-rank finetuning script that works with 14GB of VRAM. Here are the steps to get started. 94 | 95 | * Clone the OpenChatKit repo, install dependencies, and prepare the dataset. These steps are the same as for full fine-tuning. 96 | 97 | * The sample low-rank finetuning script is at `/training/lora/redpajama-incite-chat-3b.py`; please modify this script to accommodate your own training data and preferred configuration. 98 | 99 | * Then you can start low-rank finetuning by running this script. 100 | 101 | Once the finetuning is finished, the resulting low-rank adapter will be saved to `/outputs`, and you can do inference with the following script.
102 | 103 | ``` 104 | python training/lora/redpajama-incite-chat-3b_inference.py 105 | ``` -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: OpenChatKit 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - cudatoolkit=11.8.0 9 | - cupy=12.1.0 10 | - faiss-gpu=1.7.2 11 | - fastparquet=0.5.0 12 | - nccl=2.18.3.1 13 | - pip=23.2 14 | - pyarrow=12.0.1 15 | - python=3.10.9 16 | - python-snappy=0.6.1 17 | - pytorch=2.0.1 18 | - pytorch-cuda=11.8 19 | - snappy=1.1.9 20 | - torchaudio=2.0.2 21 | - torchvision=0.15.2 22 | - pip: 23 | - accelerate==0.21.0 24 | - boto3 25 | - datasets==2.13.1 26 | - loguru==0.6.0 27 | - netifaces==0.11.0 28 | - pandas==2.0.3 29 | - transformers==4.31.0 30 | - wandb==0.15.5 31 | - zstandard==0.21.0 32 | - sentencepiece 33 | -------------------------------------------------------------------------------- /inference/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Inference 2 | This directory contains code for OpenChatKit's inference. 3 | 4 | ## Arguments 5 | - `--gpu-id`: Primary GPU device to load inputs onto for inference. Default: `0` 6 | - `--model`: name/path of the model. Default = `../huggingface_models/GPT-NeoXT-Chat-Base-20B` 7 | - `--max-tokens`: the maximum number of tokens to generate. Default: `128` 8 | - `--sample`: indicates whether to sample. Default: `True` 9 | - `--temperature`: temperature for the LM. Default: `0.6` 10 | - `--top-k`: top-k for the LM. Default: `40` 11 | - `--retrieval`: augment queries with context from the retrieval index. Default `False` 12 | - `-g` `--gpu-vram`: GPU ID and VRAM to allocate to loading the model, separated by a `:` in the format `ID:RAM` where ID is the CUDA ID and RAM is in GiB. `gpu-id` must be present in this list to avoid errors. Accepts multiple values, for example, `-g ID_0:RAM_0 ID_1:RAM_1 ID_N:RAM_N` 13 | - `-r` `--cpu-ram`: CPU RAM overflow allocation for loading the model. Optional, and only used if the model does not fit onto the GPUs given. 14 | 15 | ## Hardware requirements for inference 16 | The GPT-NeoXT-Chat-Base-20B model requires at least 41GB of free VRAM. Used VRAM also goes up by ~100-200 MB per prompt. 17 | 18 | - A **minimum of 80 GB is recommended** 19 | 20 | - A **minimum of 48 GB in VRAM is recommended** for fast responses. 21 | 22 | If you'd like to run inference on a GPU with <48 GB VRAM, refer to this section on [running on consumer hardware](#running-on-consumer-hardware). 23 | 24 | By default, inference uses only CUDA Device 0. 25 | 26 | **NOTE: Inference currently requires at least 1x GPU.** 27 | 28 | ## Running on multiple GPUs 29 | Add the argument 30 | 31 | ```-g ID0:MAX_VRAM ID1:MAX_VRAM ID2:MAX_VRAM ...``` 32 | 33 | where IDx is the CUDA ID of the device and MAX_VRAM is the amount of VRAM you'd like to allocate to the device. 34 | 35 | For example, if you are running this on 4x 48 GB GPUs and want to distribute the model across all devices, add ```-g 0:10 1:12 2:12 3:12 4:12```. In this example, the first device gets loaded to a max of 10 GiB while the others are loaded with a max of 12 GiB. 36 | 37 | How it works: The model fills up the max available VRAM on the first device passed and then overflows into the next until the whole model is loaded. 38 | 39 | **IMPORTANT: This MAX_VRAM is only for loading the model. 
It does not account for the additional inputs that are added to the device. It is recommended to set the MAX_VRAM to be at least 1 or 2 GiB less than the max available VRAM on each device, and at least 3GiB less than the max available VRAM on the primary device (set by `gpu-id` default=0).** 40 | 41 | **Decrease MAX_VRAM if you run into CUDA OOM. This happens because each input takes up additional space on the device.** 42 | 43 | **NOTE: Total MAX_VRAM across all devices must be > size of the model in GB. If not, `bot.py` automatically offloads the rest of the model to RAM and disk. It will use up all available RAM. To allocate a specified amount of RAM: [refer to this section on running on consumer hardware](#running-on-consumer-hardware).** 44 | 45 | ## Running on specific GPUs 46 | If you have multiple GPUs but would only like to use a specific device(s), [use the same steps as in this section on running on multiple devices](#running-on-multiple-gpus) and only specify the devices you'd like to use. 47 | 48 | Also, if needed, add the argument `--gpu-id ID` where ID is the CUDA ID of the device you'd like to make the primary device. NOTE: The device specified in `--gpu-id` must be present as one of the ID in the argument `-g` to avoid errors. 49 | 50 | - **Example #1**: to run inference on devices 2 and 5 with a max of 25 GiB on each, and make device 5 the primary device, add: `--gpu-id 5 -g 2:25 5:25`. In this example, not adding `--gpu-id 5` will give you an error. 51 | - **Example #2**: to run inference on devices 0 and 3 with a max of 10GiB on 0 and 40GiB on 3, with device 0 as the primary device, add: `-g 0:10 3:40`. In this example, `--gpu-id` is not required because device 0 is specified in `-g`. 52 | - **Example #3**: to run inference only on device 1 with a max of 75 GiB, add: `--gpu-id 1 -g 1:75` 53 | 54 | 55 | ## Running on consumer hardware 56 | If you have multiple GPUs, each <48 GB VRAM, [the steps mentioned in this section on running on multiple GPUs](#running-on-multiple-gpus) still apply, unless, any of these apply: 57 | - Running on just 1x GPU with <48 GB VRAM, 58 | - <48 GB VRAM combined across multiple GPUs 59 | - Running into Out-Of-Memory (OOM) issues 60 | 61 | In which case, add the flag `-r CPU_RAM` where CPU_RAM is the maximum amount of RAM you'd like to allocate to loading model. Note: This significantly reduces inference speeds. 62 | 63 | The model will load without specifying `-r`, however, it is not recommended because it will allocate all available RAM to the model. To limit how much RAM the model can use, add `-r`. 64 | 65 | If the total VRAM + CPU_RAM < the size of the model in GiB, the rest of the model will be offloaded to a folder "offload" at the root of the directory. Note: This significantly reduces inference speeds. 66 | 67 | - Example: `-g 0:12 -r 20` will first load up to 12 GiB of the model into the CUDA device 0, then load up to 20 GiB into RAM, and load the rest into the "offload" directory. 68 | 69 | How it works: 70 | - https://github.com/huggingface/blog/blob/main/accelerate-large-models.md 71 | - https://www.youtube.com/embed/MWCSGj9jEAo 72 | -------------------------------------------------------------------------------- /inference/bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | INFERENCE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | # TODO: PYTHONPATH hacks are never a good idea. 
clean this up later 7 | sys.path.append(os.path.join(INFERENCE_DIR, '..')) 8 | 9 | import cmd 10 | import torch 11 | import argparse 12 | import conversation as convo 13 | import retrieval.wikipedia as wp 14 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, StoppingCriteria, StoppingCriteriaList 15 | from accelerate import infer_auto_device_map, init_empty_weights 16 | 17 | 18 | class StopWordsCriteria(StoppingCriteria): 19 | def __init__(self, tokenizer, stop_words, stream_callback): 20 | self._tokenizer = tokenizer 21 | self._stop_words = stop_words 22 | self._partial_result = '' 23 | self._stream_buffer = '' 24 | self._stream_callback = stream_callback 25 | 26 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 27 | first = not self._partial_result 28 | text = self._tokenizer.decode(input_ids[0, -1]) 29 | self._partial_result += text 30 | for stop_word in self._stop_words: 31 | if stop_word in self._partial_result: 32 | return True 33 | if self._stream_callback: 34 | if first: 35 | text = text.lstrip() 36 | # buffer tokens if the partial result ends with a prefix of a stop word, e.g. " 40 GB VRAM 55 | # load model onto one device 56 | if max_memory is None: 57 | self._model = AutoModelForCausalLM.from_pretrained( 58 | model_name, torch_dtype=torch.float16, device_map="auto") 59 | self._model.to(device) 60 | # load the model with the given max_memory config (for devices with insufficient VRAM or multi-gpu) 61 | else: 62 | config = AutoConfig.from_pretrained(model_name) 63 | # load empty weights 64 | with init_empty_weights(): 65 | model_from_conf = AutoModelForCausalLM.from_config(config) 66 | 67 | model_from_conf.tie_weights() 68 | 69 | # create a device_map from max_memory 70 | device_map = infer_auto_device_map( 71 | model_from_conf, 72 | max_memory=max_memory, 73 | no_split_module_classes=["GPTNeoXLayer"], 74 | dtype="float16" 75 | ) 76 | # load the model with the above device_map 77 | self._model = AutoModelForCausalLM.from_pretrained( 78 | model_name, 79 | device_map=device_map, 80 | offload_folder="offload", # optional offload-to-disk overflow directory (auto-created) 81 | offload_state_dict=True, 82 | torch_dtype=torch.float16 83 | ) 84 | self._tokenizer = AutoTokenizer.from_pretrained(model_name) 85 | 86 | def do_inference(self, prompt, max_new_tokens, do_sample, temperature, top_k, stream_callback=None): 87 | stop_criteria = StopWordsCriteria(self._tokenizer, [self.human_id], stream_callback) 88 | inputs = ( 89 | self._tokenizer(prompt, return_tensors='pt') 90 | .to(self._model.device) 91 | ) 92 | outputs = self._model.generate( 93 | **inputs, 94 | max_new_tokens=max_new_tokens, 95 | do_sample=do_sample, 96 | temperature=temperature, 97 | top_k=top_k, 98 | pad_token_id=self._tokenizer.eos_token_id, 99 | stopping_criteria=StoppingCriteriaList([stop_criteria]), 100 | ) 101 | output = self._tokenizer.batch_decode(outputs)[0] 102 | 103 | # remove the context from the output 104 | output = output[len(prompt):] 105 | 106 | return output 107 | 108 | 109 | class OpenChatKitShell(cmd.Cmd): 110 | intro = "Welcome to OpenChatKit shell. Type /help or /? 
to list commands.\n" 111 | prompt = ">>> " 112 | 113 | def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature, top_k, retrieval, max_memory, do_stream): 114 | super().__init__() 115 | self._gpu_id = gpu_id 116 | self._model_name_or_path = model_name_or_path 117 | self._max_tokens = max_tokens 118 | self._sample = sample 119 | self._temperature = temperature 120 | self._top_k = top_k 121 | self._retrieval = retrieval 122 | self._max_memory = max_memory 123 | self._do_stream = do_stream 124 | 125 | def preloop(self): 126 | print(f"Loading {self._model_name_or_path} to cuda:{self._gpu_id}...") 127 | self._model = ChatModel(self._model_name_or_path, self._gpu_id, self._max_memory) 128 | 129 | if self._retrieval: 130 | print(f"Loading retrieval index...") 131 | self._index = wp.WikipediaIndex() 132 | 133 | self._convo = convo.Conversation( 134 | self._model.human_id, self._model.bot_id) 135 | 136 | def precmd(self, line): 137 | if line.startswith('/'): 138 | return line[1:] 139 | else: 140 | return 'say ' + line 141 | 142 | def do_say(self, arg): 143 | if self._retrieval: 144 | results = self._index.search(arg) 145 | if len(results) > 0: 146 | self._convo.push_context_turn(results[0]) 147 | 148 | self._convo.push_human_turn(arg) 149 | 150 | output = self._model.do_inference( 151 | self._convo.get_raw_prompt(), 152 | self._max_tokens, 153 | self._sample, 154 | self._temperature, 155 | self._top_k, 156 | lambda x : print(x, end='', flush=True) if self._do_stream else None, 157 | ) 158 | 159 | self._convo.push_model_response(output) 160 | 161 | print("" if self._do_stream else self._convo.get_last_turn()) 162 | 163 | def do_raw_say(self, arg): 164 | output = self._model.do_inference( 165 | arg, 166 | self._max_tokens, 167 | self._sample, 168 | self._temperature, 169 | self._top_k 170 | ) 171 | 172 | print(output) 173 | 174 | def do_raw_prompt(self, arg): 175 | print(self._convo.get_raw_prompt()) 176 | 177 | def do_reset(self, arg): 178 | self._convo = convo.Conversation( 179 | self._model.human_id, self._model.bot_id) 180 | 181 | def do_hyperparameters(self, arg): 182 | print( 183 | f"Hyperparameters:\n" 184 | f" max_tokens: {self._max_tokens}\n" 185 | f" sample: {self._sample}\n" 186 | f" temperature: {self._temperature}\n" 187 | f" top_k: {self._top_k}" 188 | ) 189 | 190 | def do_quit(self, arg): 191 | return True 192 | 193 | 194 | def main(): 195 | parser = argparse.ArgumentParser( 196 | description='test harness for OpenChatKit') 197 | 198 | parser.add_argument( 199 | '--gpu-id', 200 | default=0, 201 | type=int, 202 | help='the ID of the GPU to run on' 203 | ) 204 | parser.add_argument( 205 | '--model', 206 | default=f"{INFERENCE_DIR}/../huggingface_models/Pythia-Chat-Base-7B", 207 | help='name/path of the model' 208 | ) 209 | parser.add_argument( 210 | '--max-tokens', 211 | default=128, 212 | type=int, 213 | help='the maximum number of tokens to generate' 214 | ) 215 | parser.add_argument( 216 | '--sample', 217 | default=True, 218 | action='store_true', 219 | help='indicates whether to sample' 220 | ) 221 | parser.add_argument( 222 | '--no-stream', 223 | action='store_true', 224 | help='indicates whether to stream tokens' 225 | ) 226 | parser.add_argument( 227 | '--temperature', 228 | default=0.6, 229 | type=float, 230 | help='temperature for the LM' 231 | ) 232 | parser.add_argument( 233 | '--top-k', 234 | default=40, 235 | type=int, 236 | help='top-k for the LM' 237 | ) 238 | parser.add_argument( 239 | '--retrieval', 240 | default=False, 241 | action='store_true', 
242 | help='augment queries with context from the retrieval index' 243 | ) 244 | parser.add_argument( 245 | '-g', 246 | '--gpu-vram', 247 | action='store', 248 | help='max VRAM to allocate per GPU', 249 | nargs='+', 250 | required=False, 251 | ) 252 | parser.add_argument( 253 | '-r', 254 | '--cpu-ram', 255 | default=None, 256 | type=int, 257 | help='max CPU RAM to allocate', 258 | required=False 259 | ) 260 | args = parser.parse_args() 261 | 262 | # set max_memory dictionary if given 263 | if args.gpu_vram is None: 264 | max_memory = None 265 | else: 266 | max_memory = {} 267 | for i in range(len(args.gpu_vram)): 268 | # assign CUDA ID as label and XGiB as value 269 | max_memory[int(args.gpu_vram[i].split(':')[0])] = f"{args.gpu_vram[i].split(':')[1]}GiB" 270 | 271 | if args.cpu_ram is not None: 272 | # add cpu to max-memory if given 273 | max_memory['cpu'] = f"{int(args.cpu_ram)}GiB" 274 | 275 | OpenChatKitShell( 276 | args.gpu_id, 277 | args.model, 278 | args.max_tokens, 279 | args.sample, 280 | args.temperature, 281 | args.top_k, 282 | args.retrieval, 283 | max_memory, 284 | not args.no_stream, 285 | ).cmdloop() 286 | 287 | 288 | if __name__ == '__main__': 289 | main() 290 | -------------------------------------------------------------------------------- /inference/conversation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | MEANINGLESS_WORDS = ['', '', '<|endoftext|>'] 5 | PRE_PROMPT = """\ 6 | Current Date: {} 7 | Current Time: {} 8 | 9 | """ 10 | 11 | def clean_response(response): 12 | for word in MEANINGLESS_WORDS: 13 | response = response.replace(word, "") 14 | response = response.strip("\n") 15 | return response 16 | 17 | class Conversation: 18 | def __init__(self, human_id, bot_id): 19 | cur_date = time.strftime('%Y-%m-%d') 20 | cur_time = time.strftime('%H:%M:%S %p %Z') 21 | 22 | self._human_id = human_id 23 | self._bot_id = bot_id 24 | self._prompt = PRE_PROMPT.format(cur_date, cur_time) 25 | 26 | def push_context_turn(self, context): 27 | # for now, context is represented as a human turn 28 | self._prompt += f"{self._human_id}: {context}\n" 29 | 30 | def push_human_turn(self, query): 31 | self._prompt += f"{self._human_id}: {query}\n" 32 | self._prompt += f"{self._bot_id}:" 33 | 34 | def push_model_response(self, response): 35 | has_finished = self._human_id in response 36 | bot_turn = response.split(f"{self._human_id}:")[0] 37 | bot_turn = clean_response(bot_turn) 38 | # if it is truncated, then append "..." to the end of the response 39 | if not has_finished: 40 | bot_turn += "..." 
41 | 42 | self._prompt += f"{bot_turn}\n" 43 | 44 | def get_last_turn(self): 45 | human_tag = f"{self._human_id}:" 46 | bot_tag = f"{self._bot_id}:" 47 | turns = re.split(f"({human_tag}|{bot_tag})\W?", self._prompt) 48 | return turns[-1] 49 | 50 | def get_raw_prompt(self): 51 | return self._prompt 52 | 53 | @classmethod 54 | def from_raw_prompt(cls, value): 55 | self._prompt = value 56 | -------------------------------------------------------------------------------- /pretrained/GPT-NeoX-20B/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "EleutherAI/gpt-neox-20b" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/Llama-2-7B-32K-beta/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 11 | parser.add_argument('--model-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', 12 | help='model-name') 13 | parser.add_argument('--save-dir', type=str, default=DIR, 14 | help='model-name') 15 | parser.add_argument('--offload-dir', type=str, default=None, 16 | help='directory to offload from memory') 17 | args = parser.parse_args() 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.mkdir(args.save_dir) 21 | save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) 22 | if not os.path.exists(save_path): 23 | os.mkdir(save_path) 24 | 25 | print('loading model from HF...') 26 | config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 27 | config.save_pretrained(save_path) 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 29 | tokenizer.save_pretrained(save_path) 30 | 31 | # offload model from memory to disk if offload-dir is specified 32 | if args.offload_dir is not None: 33 | if not os.path.exists(args.offload_dir): 34 | os.mkdir(args.offload_dir) 35 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) 36 | else: 37 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) 38 | print('loaded model from HF...') 39 | 40 | print('converting the embedding layer...') 41 | item = {} 42 | item['embed_tokens.weight'] = model.model.embed_tokens.weight 43 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 44 | print('converted the embedding layer.') 45 | 46 | for i in range(len(model.model.layers)): 47 | print(f'converting the {i}-th transformer layer...') 48 | torch.save(model.model.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 49 | print(f'converted the {i}-th transformer layer.') 50 | 51 | print('converting the lm_head layer...') 52 | item 
= {} 53 | item['lm_head.weight'] = model.lm_head.weight 54 | item['norm.weight'] = model.model.norm.weight 55 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 56 | print('converted the lm_head layer.') 57 | -------------------------------------------------------------------------------- /pretrained/Pythia-6.9B-deduped/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "EleutherAI/pythia-6.9b-deduped" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/RedPajama-3B/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/RedPajama-7B/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 11 | parser.add_argument('--model-name', type=str, default='togethercomputer/RedPajama-INCITE-7B-Chat', 12 | help='model-name') 13 | parser.add_argument('--save-dir', type=str, default=DIR, 14 | help='model-name') 15 | parser.add_argument('--offload-dir', type=str, default=None, 16 | help='directory to offload from memory') 17 | args = parser.parse_args() 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.mkdir(args.save_dir) 21 | save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) 22 | if not os.path.exists(save_path): 23 | os.mkdir(save_path) 24 | 25 | print('loading model from HF...') 26 | config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 27 | config.save_pretrained(save_path) 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 29 | tokenizer.save_pretrained(save_path) 30 | 31 | # offload model from memory to disk if offload-dir is specified 32 | if args.offload_dir is not None: 33 | if not os.path.exists(args.offload_dir): 34 | os.mkdir(args.offload_dir) 35 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) 36 | else: 37 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) 38 | print('loaded model from HF...') 39 | 40 | print('converting the embedding layer...') 41 | 
42 | item = {} 43 | item['embed_in.weight'] = model.gpt_neox.embed_in.weight 44 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 45 | print('converted the embedding layer.') 46 | 47 | for i in range(len(model.gpt_neox.layers)): 48 | print(f'converting the {i}-th transformer layer...') 49 | torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 50 | print(f'converted the {i}-th transformer layer.') 51 | 52 | print('converting the lm_head layer...') 53 | item = {} 54 | item['embed_out.weight'] = model.embed_out.weight 55 | item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight 56 | item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias 57 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 58 | print('converted the lm_head layer.') 59 | -------------------------------------------------------------------------------- /pretrained/prepare_pretrained.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | # Load pretrained model from HuggingFace and save it to disk 10 | def prepare_pretrained(save_path, model_name, offload_dir=None): 11 | os.makedirs(save_path, exist_ok=True) 12 | 13 | print('loading model from HF...') 14 | config = AutoConfig.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) 15 | config.save_pretrained(save_path) 16 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) 17 | tokenizer.save_pretrained(save_path) 18 | 19 | # offload model from memory to disk if offload-dir is specified 20 | if offload_dir is not None: 21 | os.makedirs(offload_dir, exist_ok=True) 22 | model = AutoModelForCausalLM.from_pretrained(model_name, 23 | torch_dtype=torch.float16, 24 | device_map="auto", 25 | offload_folder=offload_dir, 26 | use_auth_token=USE_AUTH_TOKEN) 27 | else: 28 | model = AutoModelForCausalLM.from_pretrained(model_name, 29 | torch_dtype=torch.float16, 30 | use_auth_token=USE_AUTH_TOKEN) 31 | print('loaded model from HF...') 32 | 33 | print('converting the embedding layer...') 34 | item = {} 35 | item['embed_in.weight'] = model.gpt_neox.embed_in.weight 36 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 37 | print('converted the embedding layer.') 38 | 39 | for i in range(len(model.gpt_neox.layers)): 40 | print(f'converting the {i}-th transformer layer...') 41 | torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 42 | print(f'converted the {i}-th transformer layer.') 43 | 44 | print('converting the lm_head layer...') 45 | item = {} 46 | item['embed_out.weight'] = model.embed_out.weight 47 | item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight 48 | item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias 49 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 50 | print('converted the lm_head layer.') 51 | 52 | # python pretrained/prepare_pretrained.py --model-name EleutherAI/gpt-neox-125M --save-dir pretrained/files --offload-dir pretrained/files/offload 53 | def main(): 54 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 55 | parser.add_argument('--model-name', type=str, required=True, 56 | help='model-name') 57 | parser.add_argument('--save-dir', type=str, required=True, 58 | 
help='directory to save the prepared model') 59 | parser.add_argument('--offload-dir', type=str, default=None, 60 | help='directory to offload from memory') 61 | args = parser.parse_args() 62 | 63 | prepare_pretrained(args.save_dir, args.model_name, args.offload_dir) 64 | 65 | if __name__ == '__main__': 66 | main() -------------------------------------------------------------------------------- /retrieval/README.md: -------------------------------------------------------------------------------- 1 | # Retrieval-Enhanced Chatbot 2 | 3 | This is a demonstration of how to enhance a chatbot using Wikipedia. We'll be using [ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index](https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index) for this demo. Thanks to Christoph for providing this resource! 4 | 5 | In this demo, we'll be extending the approach by comparing the adjacent `w` sentences to the matched sentence and adding them to the context if their cosine similarity is larger than `w_th`. By doing so, we can provide the chatbot with a longer context, which may improve its performance. 6 | 7 | This demo combines both the above index and the chat model into one system. 8 | 9 | ## Start the combined server 10 | 11 | To get started, we need to install some dependencies and download the Wikipedia index: 12 | 13 | 0. Install dependencies 14 | 15 | Install the necessary dependencies, including `torch`, `transformers`, `flask`, `faiss`, and `fastparquet`. 16 | 17 | 1. Open up `wiki-server.py` and set `model_name_or_path` to point to the path that contains the chat 18 | model. 19 | 20 | 21 | 2. Start the retrieval server 22 | 23 | ```shell 24 | python wiki-server.py 25 | ``` 26 | 27 | The server will listen on port 7003. It will download the datasets from ChristophSchuhmann. This 28 | may take a few minutes. 29 | 30 | 3. Test the full retrieval-enhanced chatbot 31 | 32 | We now demonstrate both the wiki index and the fine-tuned GPT-NeoX model. 33 | 34 | ```curl -X POST -H 'Content-Type: application/json' http://127.0.0.1:7003/inference -d '{ "prompt" : "where is zurich located?" }'``` 35 | 36 | Internally, we first query the wiki index and generate a response using the provided model. To do 37 | this, we concatenate the retrieved information and the user's query into a prompt, 38 | encode it with a tokenizer, and generate a response using the chatbot model. 39 | 40 | The response should indicate the location of Zurich city. 41 | 42 | 43 | 4. To test just the retrieval functionality of the system, you can do the following. Curl works 44 | as well. 45 | 46 | ```python 47 | import requests 48 | 49 | endpoint = 'http://127.0.0.1:7003/search' 50 | res = requests.post(endpoint, json={ 51 | 'query': 'Where is Zurich?', 52 | 'k': 1, 53 | 'w': 5, 54 | 'w_th': 0.7, 55 | }) 56 | print(res.json()) 57 | ``` 58 | 59 | This should print the most relevant sentences about Zurich from Wikipedia. By increasing `w` and 60 | decreasing `w_th`, we can retrieve a longer context.
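To make the `w` / `w_th` behavior concrete, here is a small self-contained sketch of the windowing idea (the actual implementation, which re-embeds the neighboring sentences with the contriever model, is in `retrieval/wikipedia.py`; `expand_window` below is just an illustrative name):

```python
import numpy as np

def cos_sim(a, b):
    # cosine similarity between two 1-D embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def expand_window(sentences, embeddings, match_idx, w=5, w_th=0.7):
    """Grow the context around sentences[match_idx] while neighbors stay similar."""
    text = sentences[match_idx]
    # walk left, prepending up to w neighbors that are similar enough to the match
    for j in range(1, w + 1):
        i = match_idx - j
        if i < 0 or cos_sim(embeddings[i], embeddings[match_idx]) <= w_th:
            break
        text = sentences[i] + text
    # walk right, appending up to w neighbors that are similar enough to the match
    for j in range(1, w + 1):
        i = match_idx + j
        if i >= len(sentences) or cos_sim(embeddings[i], embeddings[match_idx]) <= w_th:
            break
        text += sentences[i]
    return text

# toy example with 2-D embeddings: the left neighbor is kept, the right one is dropped
sents = ["Zurich is in Switzerland. ", "It is the largest Swiss city. ", "Bananas are yellow. "]
embs = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
print(expand_window(sents, embs, 1, w=1, w_th=0.7))
```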
61 | 62 | 63 | -------------------------------------------------------------------------------- /retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/retrieval/__init__.py -------------------------------------------------------------------------------- /retrieval/wikipedia.py: -------------------------------------------------------------------------------- 1 | # This file was adapted from ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index: 2 | # https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index/blob/main/wikiindexquery.py 3 | # 4 | # The original file was licensed under the Apache 2.0 license. 5 | 6 | import os 7 | 8 | from transformers import AutoTokenizer, AutoModel 9 | import faiss 10 | import numpy as np 11 | import pandas as pd 12 | 13 | DIR = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | def mean_pooling(token_embeddings, mask): 17 | token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.) 18 | sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None] 19 | return sentence_embeddings 20 | 21 | def cos_sim_2d(x, y): 22 | norm_x = x / np.linalg.norm(x, axis=1, keepdims=True) 23 | norm_y = y / np.linalg.norm(y, axis=1, keepdims=True) 24 | return np.matmul(norm_x, norm_y.T) 25 | 26 | 27 | class WikipediaIndex: 28 | def __init__(self): 29 | path = os.path.join(DIR, '..', 'data', 'wikipedia-3sentence-level-retrieval-index', 'files') 30 | indexpath = os.path.join(path, 'knn.index') 31 | wiki_sentence_path = os.path.join(path, 'wikipedia-en-sentences.parquet') 32 | 33 | self._device = 'cuda' 34 | self._tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco') 35 | self._contriever = AutoModel.from_pretrained('facebook/contriever-msmarco').to(self._device) 36 | 37 | self._df_sentences = pd.read_parquet(wiki_sentence_path, engine='fastparquet') 38 | 39 | self._wiki_index = faiss.read_index(indexpath, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY) 40 | 41 | 42 | def search(self, query, k=1, w=5, w_th=0.5): 43 | inputs = self._tokenizer(query, padding=True, truncation=True, return_tensors='pt').to(self._device) 44 | outputs = self._contriever(**inputs) 45 | embeddings = mean_pooling(outputs[0], inputs['attention_mask']) 46 | 47 | query_vector = embeddings.cpu().detach().numpy().reshape(1, -1) 48 | 49 | distances, indices = self._wiki_index.search(query_vector, k) 50 | 51 | texts = [] 52 | for i, (dist, indice) in enumerate(zip(distances[0], indices[0])): 53 | text = self._df_sentences.iloc[indice]['text_snippet'] 54 | 55 | try: 56 | input_texts = [self._df_sentences.iloc[indice]['text_snippet']] 57 | for j in range(1, w+1): 58 | input_texts = [self._df_sentences.iloc[indice-j]['text_snippet']] + input_texts 59 | for j in range(1, w+1): 60 | input_texts = input_texts + [self._df_sentences.iloc[indice+j]['text_snippet']] 61 | 62 | inputs = self._tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt').to(self._device) 63 | 64 | outputs = self._contriever(**inputs) 65 | embeddings = mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() 66 | 67 | for j in range(1, w+1): 68 | if cos_sim_2d(embeddings[w-j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: 69 | text = self._df_sentences.iloc[indice-j]['text_snippet'] + text 70 | else: 71 | break 72 | 73 | for j in range(1, w+1): 74 | if 
cos_sim_2d(embeddings[w+j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: 75 | text += self._df_sentences.iloc[indice+j]['text_snippet'] 76 | else: 77 | break 78 | 79 | except Exception as e: 80 | print(e) 81 | 82 | texts.append(text) 83 | 84 | return texts 85 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Tools 2 | 3 | ## convert_to_hf_gptneox.py 4 | Converts a sharded checkpoint produced by OpenChatKit training (one `prank_<i>_checkpoint.pt` file per pipeline stage) into a Hugging Face GPT-NeoX checkpoint. See the `--ckpt-path`, `--save-path`, `--n-stages`, and `--n-layer-per-stage` arguments in the script. 5 | ## model_load_benchmark.py 6 | 7 | The command to run the model load benchmark tool is: 8 | ```shell 9 | $ python3 model_load_benchmark.py -i benchmark_input.json -o benchmark_results.json -d cuda:0 10 | ``` 11 | 12 | ``` 13 | usage: model_load_benchmark.py [-h] -i INPUT -o OUTPUT [-d DEVICE] [-r REPEAT_INFER] 14 | 15 | Benchmark downloading, loading, and running an inference for a set of ML models. 16 | 17 | optional arguments: 18 | -h, --help show this help message and exit 19 | -i INPUT, --input INPUT 20 | Input JSON file containing models to be benchmarked 21 | -o OUTPUT, --output OUTPUT 22 | Output JSON file with model benchmark results 23 | -d DEVICE, --device DEVICE 24 | CUDA device name, e.g. "cuda:0" 25 | -r REPEAT_INFER, --repeat-infer REPEAT_INFER 26 | Repeat inference for warm timings 27 | ``` 28 | 29 | The input file is a JSON file with the names and paths of the models to be tested. For example: 30 | ```JSON 31 | { 32 | "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", 33 | "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", 34 | "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", 35 | "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", 36 | "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" 37 | } 38 | ``` 39 | 40 | The output is a JSON file with the timings for: 41 | 1. tokenizer download time in seconds -- `tokenizer_download_sec` 42 | 2. tokenizer load time in seconds -- `tokenizer_load_sec` 43 | 3. model download time -- `model_download_sec` 44 | 4. model load to RAM time -- `model_load_to_ram_sec` 45 | 5. model transfer to GPU time -- `model_transfer_to_gpu_sec` 46 | 6. inference time (input is "Hello, world!") -- `inference_sec` 47 | 7. total time (sum of all the above) -- `total_sec` 48 | 8. inference time from a warm start (the average of running inference `REPEAT_INFER` times) -- `inference_warm_sec` 49 | 9. model main memory footprint in MB -- `model_main_memory_MB` 50 | 10. 
model GPU memory footprint in MB -- `model_gpu_memory_MB` 51 | 52 | An example of the output is: 53 | ```JSON 54 | { 55 | "GPT-JT-6B-v1": { 56 | "tokenizer_download_sec": 1.52, 57 | "tokenizer_load_sec": 0.10, 58 | "model_download_sec": 124.70, 59 | "model_load_to_ram_sec": 127.81, 60 | "model_main_memory_MB": 12297.10, 61 | "model_transfer_to_gpu_sec": 3.29, 62 | "model_gpu_memory_MB": 12219.74, 63 | "inference_sec": 0.93, 64 | "inference_warm_sec": 0.047, 65 | "total_sec": 258.38 66 | } 67 | } 68 | ``` -------------------------------------------------------------------------------- /tools/benchmark_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", 3 | "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", 4 | "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", 5 | "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", 6 | "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" 7 | } -------------------------------------------------------------------------------- /tools/convert_to_hf_gptneox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import argparse 5 | 6 | from transformers import GPTNeoXForCausalLM 7 | 8 | from transformers import AutoConfig, AutoTokenizer 9 | 10 | from transformers.modeling_utils import no_init_weights 11 | import os 12 | 13 | 14 | def create_empty_gptneox(config): 15 | 16 | import torch 17 | import torch.nn as nn 18 | 19 | _reset_parameters_linear = nn.Linear.reset_parameters 20 | def dummy(*args, **kargs): 21 | pass 22 | nn.Linear.reset_parameters = dummy 23 | 24 | # 1. disable init for faster initialization 25 | # 2. avoid tie token embeddings with lm_head, as we train them separately. 
26 | with no_init_weights(_enable=True): 27 | model = GPTNeoXForCausalLM(config).eval() 28 | 29 | nn.Linear.reset_parameters = _reset_parameters_linear 30 | 31 | return model 32 | 33 | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=14): 34 | input_path = checkpoint_path 35 | 36 | assert n_stages * n_layer_per_stage >= len(model.gpt_neox.layers) 37 | # assert model.lm_head.weight.data is not model.transformer.wte.weight.data 38 | 39 | for i in range(n_stages): 40 | 41 | print(f'loading stage {i}') 42 | 43 | checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) 44 | 45 | if i == 0: 46 | _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} 47 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) 48 | model.gpt_neox.embed_in.weight.data[:] = _tmp['embed_in.weight'] 49 | 50 | for j in range(n_layer_per_stage): 51 | _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} 52 | if len(_tmp) == 0: 53 | break 54 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) 55 | model.gpt_neox.layers[j].load_state_dict(_tmp) 56 | 57 | elif i == n_stages - 1: 58 | for j in range(n_layer_per_stage): 59 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 60 | if len(_tmp) == 0: 61 | break 62 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 63 | model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) 64 | if i*n_layer_per_stage + j == len(model.gpt_neox.layers) - 1: 65 | j += 1 66 | break 67 | 68 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 69 | if len(_tmp) == 0: 70 | break 71 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) 72 | model.gpt_neox.final_layer_norm.weight.data[:] = _tmp['final_layer_norm.weight'] 73 | model.gpt_neox.final_layer_norm.bias.data[:] = _tmp['final_layer_norm.bias'] 74 | model.embed_out.weight.data[:] = _tmp['embed_out.weight'] 75 | if 'embed_out.bias' in _tmp: 76 | model.embed_out.bias.data[:] = _tmp['embed_out.bias'] 77 | 78 | else: 79 | for j in range(n_layer_per_stage): 80 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 81 | if len(_tmp) == 0: 82 | break 83 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 84 | model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) 85 | 86 | return model 87 | 88 | 89 | if __name__ == '__main__': 90 | 91 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 92 | parser.add_argument('--config-name', type=str, default='EleutherAI/gpt-neox-20b', 93 | help='config-name') 94 | parser.add_argument('--ckpt-path', type=str, default=None, 95 | help='ckpt-path') 96 | parser.add_argument('--save-path', type=str, default=None, 97 | help='save-path') 98 | parser.add_argument('--n-stages', type=int, default=8, 99 | help='pipeline group size') 100 | parser.add_argument('--n-layer-per-stage', type=int, default=6, 101 | help='n layers per GPU device') 102 | parser.add_argument('--fp16', default=False, action='store_true') 103 | args = parser.parse_args() 104 | 105 | assert args.ckpt_path is not None 106 | assert args.save_path is not None 107 | 108 | os.makedirs(args.save_path, exist_ok=True) 109 | 110 | print('loading config...') 111 | config = AutoConfig.from_pretrained(args.config_name) 112 | print('loaded config.') 113 | 
print('loading tokenizer...') 114 | tokenizer = AutoTokenizer.from_pretrained(args.config_name) 115 | print('loaded tokenizer.') 116 | print('creating empty model...') 117 | model = create_empty_gptneox(config) 118 | if args.fp16: 119 | model = model.half() 120 | print('created empty model.') 121 | print('loading model ckpt...') 122 | load_decentralized_checkpoint( 123 | model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, 124 | ) 125 | print('loaded model ckpt.') 126 | 127 | print('saving HF model...') 128 | model.save_pretrained(args.save_path) 129 | print(f'saved HF model to `{args.save_path}`') 130 | config.save_pretrained(args.save_path) 131 | tokenizer.save_pretrained(args.save_path) 132 | 133 | -------------------------------------------------------------------------------- /tools/convert_to_hf_llama.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from transformers import LlamaForCausalLM 9 | from transformers import AutoConfig, AutoTokenizer 10 | 11 | from transformers.modeling_utils import no_init_weights 12 | import os 13 | 14 | 15 | def create_emtpy_llama(config): 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | _reset_parameters_linear = nn.Linear.reset_parameters 21 | def dummy(*args, **kargs): 22 | pass 23 | nn.Linear.reset_parameters = dummy 24 | 25 | # 1. disable init for faster initialization 26 | # 2. avoid tie token embeddings with lm_head, as we train them separately. 27 | with no_init_weights(_enable=True): 28 | model = LlamaForCausalLM(config).eval() 29 | 30 | nn.Linear.reset_parameters = _reset_parameters_linear 31 | 32 | return model 33 | 34 | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=16, ): 35 | input_path = checkpoint_path 36 | 37 | n_layers = len(model.model.layers) 38 | assert n_stages * n_layer_per_stage >= len(model.model.layers) 39 | # assert model.lm_head.weight.data is not model.transformer.wte.weight.data 40 | 41 | for i in range(n_stages): 42 | 43 | print(f'loading stage {i}') 44 | 45 | checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) 46 | 47 | if i == 0: 48 | _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} 49 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) 50 | model.model.embed_tokens.weight.data[:] = _tmp['embed_tokens.weight'] 51 | 52 | for j in range(n_layer_per_stage): 53 | _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} 54 | if len(_tmp) == 0: 55 | break 56 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) 57 | ret = model.model.layers[j].load_state_dict(_tmp, strict=False) 58 | if len(ret.missing_keys): 59 | print('The following weight keys are missing:') 60 | print(ret.missing_keys) 61 | if len(ret.unexpected_keys): 62 | print('The following weight keys are unexpected:') 63 | print(ret.unexpected_keys) 64 | 65 | elif i == n_stages - 1: 66 | for j in range(n_layer_per_stage): 67 | if i*n_layer_per_stage + j == n_layers: 68 | break 69 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 70 | if len(_tmp) == 0: 71 | break 72 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 73 | ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) 74 | if 
len(ret.missing_keys): 75 | print('The following weight keys are missing:') 76 | print(ret.missing_keys) 77 | if len(ret.unexpected_keys): 78 | print('The following weight keys are unexpected:') 79 | print(ret.unexpected_keys) 80 | else: 81 | j += 1 82 | 83 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 84 | if len(_tmp) == 0: 85 | break 86 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) 87 | model.model.norm.weight.data[:] = _tmp['norm.weight'] 88 | if 'norm.bias' in _tmp: 89 | model.model.norm.bias.data[:] = _tmp['norm.bias'] 90 | model.lm_head.weight.data[:] = _tmp['lm_head.weight'] 91 | if 'lm_head.bias' in _tmp: 92 | model.lm_head.bias.data[:] = _tmp['lm_head.bias'] 93 | 94 | else: 95 | for j in range(n_layer_per_stage): 96 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 97 | if len(_tmp) == 0: 98 | break 99 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 100 | ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) 101 | if len(ret.missing_keys): 102 | print('The following weight keys are missing:') 103 | print(ret.missing_keys) 104 | if len(ret.unexpected_keys): 105 | print('The following weight keys are unexpected:') 106 | print(ret.unexpected_keys) 107 | 108 | return model 109 | 110 | 111 | if __name__ == '__main__': 112 | 113 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 114 | parser.add_argument('--config-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', 115 | help='config-name') 116 | parser.add_argument('--ckpt-path', type=str, default=None, 117 | help='ckpt-path') 118 | parser.add_argument('--save-path', type=str, default=None, 119 | help='save-path') 120 | parser.add_argument('--n-stages', type=int, default=8, 121 | help='pipeline group size') 122 | parser.add_argument('--n-layer-per-stage', type=int, default=4, 123 | help='n layers per GPU device') 124 | parser.add_argument('--fp16', default=False, action='store_true') 125 | args = parser.parse_args() 126 | 127 | assert args.ckpt_path is not None 128 | assert args.save_path is not None 129 | 130 | if not os.path.exists(args.save_path): 131 | os.mkdir(args.save_path) 132 | 133 | # LlamaForCausalLM LlamaConfig LlamaTokenizer 134 | print('loading config...') 135 | config = AutoConfig.from_pretrained(args.config_name) 136 | print('loaded config.') 137 | print('loading tokenizer...') 138 | tokenizer = AutoTokenizer.from_pretrained(args.config_name) 139 | print('loaded tokenizer.') 140 | print('creating empty model...') 141 | model = create_emtpy_llama(config) 142 | if args.fp16: 143 | model = model.half() 144 | print('created empty model.') 145 | print('loading model ckpt...') 146 | load_decentralized_checkpoint( 147 | model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, 148 | ) 149 | print('loaded model ckpt.') 150 | 151 | print('saving HF model...') 152 | model.save_pretrained(args.save_path) 153 | print(f'saved HF model to `{args.save_path}`') 154 | config.save_pretrained(args.save_path) 155 | tokenizer.save_pretrained(args.save_path) 156 | -------------------------------------------------------------------------------- /tools/model_load_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import time 4 | import torch 5 | import torchvision 6 | import os 7 | import re 8 | import psutil 9 | from 
transformers import AutoTokenizer, AutoModelForCausalLM 10 | 11 | # Benchmark download, tokenize, load, inference time. 12 | def benchmark(model_dict: dict, device_name: str, repeat_infer: int): 13 | 14 | # Initialize the benchmark results dictionary 15 | results_dict = {} 16 | 17 | # Check that we have CUDA GPUs available before running the benchmark 18 | if not torch.cuda.is_available(): 19 | print("ERROR: CUDA GPUs are not available, benchmark not run") 20 | return results_dict 21 | 22 | device = torch.device(device_name) 23 | 24 | process = psutil.Process() 25 | 26 | print(f'Using device {device}') 27 | 28 | # Loop through the models to test 29 | for model_name, model_path in model_dict.items(): 30 | # purge unused cached memory 31 | torch.cuda.empty_cache() 32 | 33 | print(f"Testing model: {model_name}") 34 | 35 | # Measure the time it takes to download the tokenizer data and load the tokenizer 36 | tokenizer_download_start_time = time.time() 37 | tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True) 38 | tokenizer_download_end_time = time.time() 39 | 40 | tokenizer = None 41 | 42 | # Measure the time it takes to load the tokenizer 43 | tokenizer_load_start_time = time.time() 44 | tokenizer = AutoTokenizer.from_pretrained(model_path) 45 | tokenizer_load_end_time = time.time() 46 | 47 | tokenizer_load_sec = tokenizer_load_end_time - tokenizer_load_start_time 48 | tokenizer_download_sec = tokenizer_download_end_time - tokenizer_download_start_time - tokenizer_load_sec 49 | 50 | print(f"Testing model: {model_name} --- tokenizer download time = {tokenizer_download_sec:.3} sec") 51 | print(f"Testing model: {model_name} --- tokenize load time = {tokenizer_load_sec:.3} sec") 52 | 53 | # Measure the time it takes to download and load the model into main memory 54 | model_download_start_time = time.time() 55 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True, force_download=True) 56 | model_download_end_time = time.time() 57 | 58 | model = None 59 | 60 | # Measure the time it takes to load the model into main memory 61 | memory_used_main_start = process.memory_info().rss 62 | model_load_to_ram_start_time = time.time() 63 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True) 64 | model_load_to_ram_end_time = time.time() 65 | memory_used_main_end = process.memory_info().rss 66 | 67 | model_load_to_ram_sec = model_load_to_ram_end_time - model_load_to_ram_start_time 68 | model_download_sec = model_download_end_time - model_download_start_time - model_load_to_ram_sec 69 | model_main_memory_bytes = memory_used_main_end - memory_used_main_start 70 | 71 | print(f"Testing model: {model_name} --- model download time = {model_download_sec:.3} sec") 72 | print(f"Testing model: {model_name} --- model load to RAM time = {model_load_to_ram_sec:.3} sec") 73 | print(f"Testing model: {model_name} --- model main memory size = {model_main_memory_bytes} bytes") 74 | 75 | # Measure the time it takes to load the model from main memory to the GPU 76 | gpu_memory_start = torch.cuda.memory_allocated(device) 77 | model_xfer_to_gpu_start_time = time.time() 78 | model = model.to(device) 79 | model_xfer_to_gpu_end_time = time.time() 80 | gpu_memory_end = torch.cuda.memory_allocated(device) 81 | 82 | model_xfer_to_gpu_sec = model_xfer_to_gpu_end_time - model_xfer_to_gpu_start_time 83 | model_gpu_memory_bytes = gpu_memory_end - gpu_memory_start 84 | 85 | print(f"Testing model: {model_name} --- model 
transfer to GPU time = {model_xfer_to_gpu_sec:.3} sec") 86 | print(f"Testing model: {model_name} --- model GPU memory size = {model_gpu_memory_bytes} bytes") 87 | 88 | # Measure the time it takes to run inference from a cold start 89 | inference_start_time = time.time() 90 | inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) 91 | outputs = model(**inputs) 92 | inference_end_time = time.time() 93 | inference_sec = inference_end_time - inference_start_time 94 | 95 | print(f"Testing model: {model_name} --- inference time = {inference_sec:.3} sec") 96 | 97 | # Measure the average time it takes to run inference from a warm start 98 | inference_warm_start_time = time.time() 99 | for i in range(0, repeat_infer): 100 | inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) 101 | outputs = model(**inputs) 102 | inference_warm_end_time = time.time() 103 | inference_warm_sec = (inference_warm_end_time - inference_warm_start_time) / float(repeat_infer) 104 | 105 | print(f"Testing model: {model_name} --- inference warm time = {inference_warm_sec:.3} sec") 106 | 107 | total_sec = tokenizer_download_sec + tokenizer_load_sec + model_download_sec + model_load_to_ram_sec + model_xfer_to_gpu_sec + inference_sec 108 | 109 | print(f"Testing model: {model_name} --- total time = {total_sec:.3} sec") 110 | 111 | # Add the results to the dictionary 112 | results_dict[model_name] = { 113 | "tokenizer_download_sec": tokenizer_download_sec, 114 | "tokenizer_load_sec": tokenizer_load_sec, 115 | "model_download_sec": model_download_sec, 116 | "model_load_to_ram_sec": model_load_to_ram_sec, 117 | "model_main_memory_MB": float(model_main_memory_bytes) / 1000000.0, 118 | "model_transfer_to_gpu_sec": model_xfer_to_gpu_sec, 119 | "model_gpu_memory_MB": float(model_gpu_memory_bytes) / 1000000.0, 120 | "inference_sec": inference_sec, 121 | "inference_warm_sec": inference_warm_sec, 122 | "total_sec": total_sec 123 | } 124 | 125 | # Unload the model 126 | model = None 127 | torch.cuda.empty_cache() 128 | 129 | return results_dict 130 | 131 | # Define the main function 132 | def main(input_file: str, output_file: str, device_name: str, repeat_infer: int): 133 | 134 | # Load the models to test from the input JSON file 135 | with open(input_file, "r") as f: 136 | model_dict = json.load(f) 137 | 138 | # Run the benchmark 139 | results_dict = benchmark(model_dict, device_name, repeat_infer) 140 | 141 | # Write the results to the JSON output file 142 | # use a regular expression to apply formatting to floating point values 143 | json_data = re.sub('"(.*?)":\s*(0\.0*\d{2}|\d+\.\d{2})\d*(,?\n)', '"\\1": \\2\\3', json.dumps(results_dict, indent=4)) 144 | with open(output_file, 'w') as f: 145 | f.write(json_data) 146 | 147 | if __name__ == "__main__": 148 | # Create an argument parser 149 | parser = argparse.ArgumentParser(description='Benchmark downloading, loading, and running an inference for a set of ML models.') 150 | parser.add_argument('-i', '--input', required=True, help='Input JSON file containing models to be benchmarked') 151 | parser.add_argument('-o', '--output', required=True, help='Output JSON file with model benchmark results') 152 | parser.add_argument('-d', '--device', required=False, default='cuda:0', help='CUDA device name, e.g. 
"cuda:0"') 153 | parser.add_argument('-r', '--repeat-infer', required=False, default=30, help='Repeat inferrence for warm timings') 154 | 155 | # Parse the command line arguments 156 | args = parser.parse_args() 157 | 158 | # Process the data 159 | main(args.input, args.output, args.device, max(args.repeat_infer, 1)) -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Training 2 | 3 | This directory contains code for training a chat model using OpenChatKit. The main training script is `finetune_GPT-NeoXT-Chat-Base-20B.sh`. 4 | 5 | To customize training, make a copy of the script and modify the arguments. 6 | 7 | ## Arguments 8 | 9 | Environment vars that should be set: 10 | ```bash 11 | export GLOO_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` 12 | export NCCL_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` 13 | export WANDB_NAME=gptj-test # wandb run name 14 | ``` 15 | 16 | The following arguments should be carefully set: 17 | - `--model-name`: The path of model ckpt sharded by layers. 18 | - `--tokenizer-name`: Usually the same to `--model-name`. You can also use HF's model name. 19 | - `--model-type`: Indicate the model type. {gptj}. More model types will be added soon. 20 | - `--num-layers`: Number of Transformer layers **for each GPU**. E.g. GPT-J has 28 layers, if we use two GPUs to form a pipeline, `--num-layers` should be 14. 21 | - `--embedding-dim`: The hidden size of the model. GPT-J-6B is 4096. This is used to create buffers. 22 | - `--dist-url`: URL of rank 0 worker (master). It is the same to all workers. And this URL should be accessible by all workers. For local training (single machine multiple GPUs), this can be like `--dist-url tcp://127.0.0.1:7033` 23 | - `--world-size`: The total number of workers. `world-size == pipeline-group-size * data-group-size` 24 | - `--pipeline-group-size`: Number of GPU workers for each pipeline 25 | - `--data-group-size`: Number of data parallel workers. Also the number of pipelines. 26 | - `--net-interface`: Network interface. Should be consistent with `GLOO_SOCKET_IFNAME` and `NCCL_SOCKET_IFNAME`. 27 | 28 | The following arguments can be tuned / changed: 29 | - `--train-log-backend `: How to log the training info. {print, loguru, wandb}. 30 | - `--optimizer`: Optimizer type. {adam, 8bit-adam} (8bit-adam requires `pip install bitsandbytes`) 31 | - `--load-pretrained-model`: Whether to load model weights. Usually `true`. 32 | - `--task-name`: The task name or the path of a `jsonl` file. For multi-task training separate task names by `,`. 33 | There is an optional sampling weight after each task name, separated by `:` (default is 1.0). Sampling weights will be normalized. 34 | E.g. it should be like `--task-name cot:0.1,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0`. 35 | The number after the colon indicates the sampling weight for the task during training. For example, `cot:0.1` means the `cot` task will be sampled with a weight of 0.1. 36 | - `--checkpoint-path`: Path to save fine-tuned checkpoints. 37 | - `--checkpoint-steps`: Save ckpt every `checkpoint-steps`. 38 | - `--total-steps`: Total number of steps for training. (This counts all `gradient-accumulate-step`s.) 39 | - `--warmup-steps`: LR warmup steps. 
40 | - `--lr`: Learning rate. 41 | - `--seq-length`: Sequence length. 42 | - `--batch-size`: Batch size for each GPU device (for each gradient accumulation step). 43 | - `--micro-batch-size`: Micro batch size for pipeline parallelism. 1 works fine. 44 | - `--gradient-accumulate-step`: Accumulate gradients for several steps before updating parameters. This is another way to achieve large batch sizes when GPU memory is insufficient. 45 | 46 | The following arguments usually do not change: 47 | - `--dp-backend`: {nccl, gloo}, default nccl. 48 | - `--dp-mode`: {allreduce}. 49 | - `--fp16`: Flag to enable FP16 mixed precision training. It should always be added in the current implementation. 50 | - `--pp-mode`: Always `gpipe`. 51 | - `--profiling`: {no-profiling, tidy_profiling}. `tidy_profiling` will generate profiling JSON files. 52 | 53 | ## Adding Your Own Data to the DATASETS 54 | 55 | To add your own data to the training process, you should create a `jsonl` file where each line is a JSON object representing a single training example. Once you have your `jsonl` file, you can include it in the `--task-name` argument with an appropriate sampling weight. For instance, if your file is located at `/path_to_your_data/your_data.jsonl` and you wish to give it a sampling weight of 0.5, you would add `/path_to_your_data/your_data.jsonl:0.5` to the `--task-name` argument. 56 | 57 | If you have any questions or need further assistance, please refer to the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repository or contact us through our [website](https://www.together.ai/contact). 58 | -------------------------------------------------------------------------------- /training/comm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/comm/__init__.py -------------------------------------------------------------------------------- /training/comm/comm_utils.py: -------------------------------------------------------------------------------- 1 | from .torch_backend import * 2 | from .nccl_backend import * 3 | 4 | _DATA_PARALLEL_COMM = None 5 | _DATA_PARALLEL_RANK = None 6 | _DATA_PARALLEL_WORLD_SIZE = None 7 | 8 | _PIPELINE_PARALLEL_COMM = None 9 | _PIPELINE_PARALLEL_RANK = None 10 | _PIPELINE_PARALLEL_WORLD_SIZE = None 11 | 12 | _TENSOR_PARALLEL_COMM = None 13 | _TENSOR_PARALLEL_RANK = None 14 | _TENSOR_PARALLEL_WORLD_SIZE = None 15 | 16 | import threading 17 | 18 | _LOCK = threading.RLock() 19 | 20 | def get_lock(): 21 | return _LOCK 22 | 23 | def get_data_parallel_comm() -> NCCLCommunicator: 24 | assert _DATA_PARALLEL_COMM is not None 25 | return _DATA_PARALLEL_COMM 26 | 27 | 28 | def get_data_parallel_rank() -> int: 29 | assert _DATA_PARALLEL_RANK is not None 30 | return _DATA_PARALLEL_RANK 31 | 32 | 33 | def get_data_parallel_world_size() -> int: 34 | assert _DATA_PARALLEL_WORLD_SIZE is not None 35 | return _DATA_PARALLEL_WORLD_SIZE 36 | 37 | 38 | def get_pipeline_parallel_comm() -> NCCLCommunicator: 39 | assert _PIPELINE_PARALLEL_COMM is not None 40 | return _PIPELINE_PARALLEL_COMM 41 | 42 | 43 | def get_pipeline_parallel_rank() -> int: 44 | assert _PIPELINE_PARALLEL_RANK is not None 45 | return _PIPELINE_PARALLEL_RANK 46 | 47 | 48 | def get_pipeline_parallel_world_size() -> int: 49 | assert _PIPELINE_PARALLEL_WORLD_SIZE is not None 50 | return _PIPELINE_PARALLEL_WORLD_SIZE 51 | 52 | 53 | def get_megatron_tensor_parallel_comm() -> NCCLCommunicator: 54 | 
assert _TENSOR_PARALLEL_COMM is not None 55 | return _TENSOR_PARALLEL_COMM 56 | 57 | 58 | def get_megatron_tensor_parallel_rank() -> int: 59 | assert _TENSOR_PARALLEL_RANK is not None 60 | return _TENSOR_PARALLEL_RANK 61 | 62 | 63 | def get_megatron_tensor_parallel_world_size() -> int: 64 | assert _TENSOR_PARALLEL_WORLD_SIZE is not None 65 | return _TENSOR_PARALLEL_WORLD_SIZE 66 | 67 | 68 | def default_init(args): 69 | import datetime 70 | import time 71 | try: 72 | dist.destroy_process_group() 73 | # the first time will raise exception, so the following code is skipped. 74 | print('destroy comm, increase port for 1. (this could cause problem)') 75 | url = ':'.join(args.dist_url.split(':')[:-1]) 76 | port = int(args.dist_url.split(':')[-1]) + 1 77 | args.dist_url = f"{url}:{port}" 78 | print(f"new master url: {args.dist_url}") 79 | except: 80 | pass 81 | dist.init_process_group(backend='gloo', timeout=datetime.timedelta(seconds=5*60), init_method=args.dist_url, world_size=args.world_size, rank=args.rank) 82 | 83 | 84 | def init_communicators(args): 85 | default_init(args) 86 | assert args.world_size == args.data_group_size * args.pipeline_group_size 87 | if args.world_size == args.data_group_size * args.pipeline_group_size: 88 | # We do the following hard code alignment of communication groups: 89 | # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), 90 | # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: 91 | # pipeline parallel: , 92 | # data parallel: , , , 93 | # assert args.world_size == args.data_group_size * args.pipeline_group_size 94 | global _DATA_PARALLEL_COMM 95 | global _PIPELINE_PARALLEL_COMM 96 | global _DATA_PARALLEL_RANK 97 | global _PIPELINE_PARALLEL_RANK 98 | global _DATA_PARALLEL_WORLD_SIZE 99 | global _PIPELINE_PARALLEL_WORLD_SIZE 100 | # We use pipeline parallel by default. 
101 | _PIPELINE_PARALLEL_WORLD_SIZE = args.pipeline_group_size 102 | _PIPELINE_PARALLEL_RANK = args.rank % args.pipeline_group_size 103 | _PIPELINE_PARALLEL_COMM = NCCLCommunicator(_PIPELINE_PARALLEL_RANK, args.cuda_id, args.pipeline_group_size, 104 | "pipeline_group_"+str(args.rank // args.pipeline_group_size)) 105 | if args.data_group_size != 1: 106 | _DATA_PARALLEL_WORLD_SIZE = args.data_group_size 107 | _DATA_PARALLEL_RANK = args.rank // args.pipeline_group_size 108 | 109 | dp_backend = getattr(args, 'dp_backend', 'gloo') 110 | if dp_backend == 'nccl': 111 | 112 | _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, 113 | "data_group_"+str(args.rank % args.pipeline_group_size)) 114 | 115 | elif dp_backend == 'gloo': 116 | 117 | for i in range(args.pipeline_group_size): 118 | ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] 119 | print(args.rank, ranks) 120 | data_group = torch.distributed.new_group(ranks, backend='gloo') 121 | if args.rank in ranks: 122 | def to_global_rank(dp_rank): 123 | rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size 124 | # print(f"{dp_rank} --> {rank}") 125 | return rank 126 | _DATA_PARALLEL_COMM = TorchCommunicator( 127 | data_group, to_global_rank=to_global_rank, 128 | dp_rank=_DATA_PARALLEL_RANK, 129 | comm_group_size=args.data_group_size,) 130 | 131 | else: 132 | assert False 133 | 134 | print('comm init done!!') 135 | 136 | # elif args.world_size == args.data_group_size * args.tensor_group_size: 137 | # global _DATA_PARALLEL_COMM 138 | # global _TENSOR_PARALLEL_COMM 139 | # global _DATA_PARALLEL_RANK 140 | # global _TENSOR_PARALLEL_RANK 141 | # global _DATA_PARALLEL_WORLD_SIZE 142 | # global _TENSOR_PARALLEL_WORLD_SIZE 143 | # We use megatron tensor parallel by default. 
144 | # _TENSOR_PARALLEL_WORLD_SIZE = args.tensor_group_size 145 | # _TENSOR_PARALLEL_RANK = args.rank % args.tensor_group_size 146 | # _TENSOR_PARALLEL_COMM = NCCLCommunicator(_TENSOR_PARALLEL_RANK, args.cuda_id, args.tensor_group_size, 147 | # "tensor_group_" + str(args.rank // args.tensor_group_size)) 148 | # if args.data_group_size != 1: 149 | # _DATA_PARALLEL_WORLD_SIZE = args.data_group_size 150 | # _DATA_PARALLEL_RANK = args.rank // args.tensor_group_size 151 | # _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, 152 | # "data_group_" + str(args.rank % args.tensor_group_size)) 153 | else: 154 | print("Not supported yet") 155 | assert False 156 | 157 | 158 | 159 | def reinit_dp_communicator(args): 160 | 161 | print('###### reinit start #######') 162 | 163 | default_init(args) 164 | assert args.world_size == args.data_group_size * args.pipeline_group_size 165 | if args.world_size == args.data_group_size * args.pipeline_group_size: 166 | # We do the following hard code alignment of communication groups: 167 | # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), 168 | # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: 169 | # pipeline parallel: , 170 | # data parallel: , , , 171 | # assert args.world_size == args.data_group_size * args.pipeline_group_size 172 | global _DATA_PARALLEL_COMM 173 | global _PIPELINE_PARALLEL_COMM 174 | global _DATA_PARALLEL_RANK 175 | global _PIPELINE_PARALLEL_RANK 176 | global _DATA_PARALLEL_WORLD_SIZE 177 | global _PIPELINE_PARALLEL_WORLD_SIZE 178 | 179 | if args.data_group_size != 1: 180 | 181 | dp_backend = getattr(args, 'dp_backend', 'gloo') 182 | if dp_backend == 'nccl': 183 | 184 | raise Exception('NCCL cannot reinit.') 185 | 186 | elif dp_backend == 'gloo': 187 | 188 | for i in range(args.pipeline_group_size): 189 | ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] 190 | print(args.rank, ranks) 191 | data_group = torch.distributed.new_group(ranks, backend='gloo') 192 | if args.rank in ranks: 193 | def to_global_rank(dp_rank): 194 | rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size 195 | # print(f"{dp_rank} --> {rank}") 196 | return rank 197 | _DATA_PARALLEL_COMM = TorchCommunicator( 198 | data_group, to_global_rank=to_global_rank, 199 | dp_rank=_DATA_PARALLEL_RANK, 200 | comm_group_size=args.data_group_size,) 201 | 202 | else: 203 | assert False 204 | 205 | print('######## dp comm reinit done!! 
########') -------------------------------------------------------------------------------- /training/comm/nccl_backend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cupy 4 | import cupy.cuda.nccl 5 | import torch.distributed as dist 6 | from typing import List 7 | 8 | 9 | def _type_torch_to_cupy(torch_type: torch.dtype): 10 | # print(torch_type) 11 | mappings = { 12 | torch.uint8: cupy.cuda.nccl.NCCL_UINT8, 13 | torch.int32: cupy.cuda.nccl.NCCL_INT32, 14 | torch.int64: cupy.cuda.nccl.NCCL_INT64, 15 | torch.int: cupy.cuda.nccl.NCCL_INT, 16 | torch.float16: cupy.cuda.nccl.NCCL_FLOAT16, 17 | torch.float32: cupy.cuda.nccl.NCCL_FLOAT32, 18 | torch.float64: cupy.cuda.nccl.NCCL_FLOAT64, 19 | torch.float: cupy.cuda.nccl.NCCL_FLOAT 20 | } 21 | return mappings[torch_type] 22 | 23 | 24 | class NCCLCommunicator: 25 | def __init__(self, 26 | comm_rank: int, 27 | cuda_id: int, 28 | comm_group_size: int, 29 | comm_name: str): 30 | self.comm_rank = comm_rank 31 | cupy.cuda.Device(cuda_id).use() 32 | self.comm_group_size = comm_group_size 33 | print("Initialize NCCLCommunicator: <", comm_name, ">; rank:", comm_rank) 34 | self.dist_store = dist.distributed_c10d._get_default_store() 35 | 36 | if self.comm_rank == 0: 37 | cuda_id = cupy.cuda.nccl.get_unique_id() 38 | # print(cuda_id) 39 | cuda_id_str = np.array(cuda_id).tobytes() 40 | self.dist_store.set('group-'+comm_name+'-unique-id', cuda_id_str) 41 | # print("Master put .") 42 | else: 43 | cuda_id_str = self.dist_store.get('group-'+comm_name+'-unique-id') 44 | 45 | comm_id = tuple(np.frombuffer(cuda_id_str, dtype=int)) 46 | # comm_id = cupy.cuda.nccl.get_unique_id() 47 | # print(comm_id) 48 | self.comm = cupy.cuda.nccl.NcclCommunicator(comm_group_size, comm_id, comm_rank) 49 | 50 | @staticmethod 51 | def barrier(): 52 | dist.barrier() 53 | 54 | def store_set(self, key, value): 55 | self.dist_store.set(key, value) 56 | 57 | def store_get(self, key): 58 | return self.dist_store.get(key) 59 | 60 | def send(self, 61 | tensor: torch.Tensor, 62 | dst: int, 63 | stream=cupy.cuda.Stream.null): 64 | # print("Send tensor of size:", torch.numel(tensor)) 65 | self.comm.send( 66 | tensor.data_ptr(), 67 | torch.numel(tensor), 68 | _type_torch_to_cupy(tensor.dtype), 69 | dst, 70 | stream.ptr 71 | ) 72 | 73 | def recv(self, 74 | tensor: torch.Tensor, 75 | src: int, 76 | stream=cupy.cuda.Stream.null): 77 | # print("Recv tensor of size:", torch.numel(tensor)) 78 | # print("mean:", torch.mean(tensor).item(), " std:", torch.std(tensor).item()) 79 | self.comm.recv( 80 | tensor.data_ptr(), 81 | torch.numel(tensor), 82 | _type_torch_to_cupy(tensor.dtype), 83 | src, 84 | stream.ptr 85 | ) 86 | 87 | def broadcast(self, 88 | tensor: torch.Tensor, 89 | src: int, 90 | stream=cupy.cuda.Stream.null): 91 | self.comm.bcast( 92 | tensor.data_ptr(), 93 | torch.numel(tensor), 94 | _type_torch_to_cupy(tensor.dtype), 95 | src, 96 | stream.ptr 97 | ) 98 | 99 | def reduce(self, 100 | tensor: torch.Tensor, 101 | dst: int, 102 | stream=cupy.cuda.Stream.null, 103 | op=cupy.cuda.nccl.NCCL_SUM): 104 | self.comm.reduce( 105 | tensor.data_ptr(), # force it to be in-place. 
106 | tensor.data_ptr(), 107 | torch.numel(tensor), 108 | _type_torch_to_cupy(tensor.dtype), 109 | op, 110 | dst, 111 | stream.ptr 112 | ) 113 | 114 | def all_reduce(self, 115 | tensor: torch.Tensor, 116 | stream=cupy.cuda.Stream.null, 117 | op=cupy.cuda.nccl.NCCL_SUM): 118 | self.comm.allReduce( 119 | tensor.data_ptr(), 120 | tensor.data_ptr(), 121 | torch.numel(tensor), 122 | _type_torch_to_cupy(tensor.dtype), 123 | op, 124 | stream.ptr 125 | ) 126 | 127 | def scatter(self, 128 | tensor: torch.Tensor, 129 | scatter_list: List[torch.Tensor], 130 | src: int, 131 | stream=cupy.cuda.Stream.null): 132 | cupy.cuda.nccl.groupStart() 133 | if self.comm_rank == src: 134 | for i in range(self.comm_group_size): 135 | self.send( 136 | scatter_list[i], 137 | i, 138 | stream 139 | ) 140 | self.recv( 141 | tensor, 142 | src, 143 | stream 144 | ) 145 | cupy.cuda.nccl.groupEnd() 146 | 147 | def gather(self, 148 | tensor: torch.Tensor, 149 | gather_list: List[torch.Tensor], 150 | dst: int, 151 | stream=cupy.cuda.Stream.null): 152 | cupy.cuda.nccl.groupStart() 153 | if self.comm_rank == dst: 154 | for i in range(self.comm_group_size): 155 | self.recv( 156 | gather_list[i], 157 | i, 158 | stream 159 | ) 160 | self.send( 161 | tensor, 162 | dst, 163 | stream 164 | ) 165 | cupy.cuda.nccl.groupEnd() 166 | 167 | def all_to_all(self, 168 | output_tensor_list: List[torch.Tensor], 169 | input_tensor_list: List[torch.Tensor], 170 | stream=cupy.cuda.Stream.null): 171 | assert len(output_tensor_list) == self.comm_group_size and len(input_tensor_list) == self.comm_group_size 172 | cupy.cuda.nccl.groupStart() 173 | for i in range(self.comm_group_size): 174 | self.send(input_tensor_list[i], i, stream) 175 | self.recv(output_tensor_list[i], i, stream) 176 | cupy.cuda.nccl.groupEnd() 177 | 178 | def all_gather(self, 179 | tensor: torch.Tensor, 180 | output_tensor_list: List[torch.Tensor], 181 | stream=cupy.cuda.Stream.null 182 | ): 183 | assert len(output_tensor_list) == self.comm_group_size 184 | cupy.cuda.nccl.groupStart() 185 | for i in range(self.comm_group_size): 186 | self.send(tensor, i, stream) 187 | self.recv(output_tensor_list[i], i, stream) 188 | cupy.cuda.nccl.groupEnd() 189 | 190 | def all_reduce_opt(self, 191 | tensor: torch.Tensor, 192 | buffer: List[torch.Tensor], 193 | stream=cupy.cuda.Stream.null, 194 | caller=None): 195 | # First do all-to-all 196 | assert torch.numel(tensor.data) % self.comm_group_size == 0 197 | chunk_size = torch.numel(tensor.data) // self.comm_group_size 198 | t_type = _type_torch_to_cupy(tensor.dtype) 199 | element_size = tensor.data.element_size() 200 | 201 | cupy.cuda.nccl.groupStart() 202 | for i in range(self.comm_group_size): 203 | self.comm.send(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) 204 | self.comm.recv(buffer[i].data_ptr(), chunk_size, t_type, i, stream.ptr) 205 | cupy.cuda.nccl.groupEnd() 206 | 207 | for i in range(1, self.comm_group_size): 208 | buffer[0] += buffer[i] 209 | 210 | cupy.cuda.nccl.groupStart() 211 | for i in range(self.comm_group_size): 212 | self.comm.send(buffer[0].data_ptr(), chunk_size, t_type, i, stream.ptr) 213 | self.comm.recv(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) 214 | cupy.cuda.nccl.groupEnd() 215 | 216 | -------------------------------------------------------------------------------- /training/comm/torch_backend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from typing 
import List 4 | 5 | class TorchCommunicator: 6 | 7 | def __init__(self, 8 | process_group, 9 | to_global_rank=lambda rank: rank, 10 | dp_rank=None, 11 | comm_group_size=None,): 12 | self.process_group = process_group 13 | self.to_global_rank = to_global_rank 14 | self.dp_rank = dp_rank 15 | self.comm_group_size = comm_group_size 16 | 17 | # @staticmethod 18 | def barrier(self): 19 | dist.barrier(group=self.process_group) 20 | 21 | def send(self, 22 | tensor: torch.Tensor, 23 | dst: int, 24 | stream=None): 25 | # print("Send tensor of size:", torch.numel(tensor)) 26 | if tensor.device == torch.device('cpu'): 27 | dist.send(tensor, self.to_global_rank(dst), group=self.process_group) 28 | else: 29 | dist.send(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) 30 | 31 | def recv(self, 32 | tensor: torch.Tensor, 33 | src: int, 34 | stream=None): 35 | 36 | if tensor.device == torch.device('cpu'): 37 | dist.recv(tensor, self.to_global_rank(src), group=self.process_group) 38 | else: 39 | buffer = tensor.cpu() 40 | dist.recv(buffer, self.to_global_rank(src), group=self.process_group) 41 | tensor[:] = buffer.to(tensor.device) 42 | 43 | def isend(self, 44 | tensor: torch.Tensor, 45 | dst: int, 46 | stream=None): 47 | # print("Send tensor of size:", torch.numel(tensor)) 48 | if tensor.device == torch.device('cpu'): 49 | handler = dist.isend(tensor, self.to_global_rank(dst), group=self.process_group) 50 | else: 51 | handler = dist.isend(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) 52 | return handler 53 | 54 | def irecv(self, 55 | tensor: torch.Tensor, 56 | src: int, 57 | stream=None): 58 | if tensor.device == torch.device('cpu'): 59 | handler = dist.irecv(tensor, self.to_global_rank(src), group=self.process_group) 60 | else: 61 | assert False 62 | buffer = tensor.cpu() 63 | handler = dist.irecv(buffer, self.to_global_rank(src), group=self.process_group) 64 | tensor[:] = buffer.to(tensor.device) 65 | return handler 66 | 67 | def broadcast(self, 68 | tensor: torch.Tensor, 69 | src: int, 70 | stream=None): 71 | if tensor.device == torch.device('cpu'): 72 | dist.broadcast(tensor, self.to_global_rank(src), group=self.process_group) 73 | else: 74 | buffer = tensor.cpu() 75 | dist.broadcast(buffer, self.to_global_rank(src), group=self.process_group) 76 | tensor[:] = buffer.to(tensor.device) 77 | 78 | def reduce(self, 79 | tensor: torch.Tensor, 80 | dst: int, 81 | stream=None, 82 | op=dist.ReduceOp.SUM): 83 | dist.reduce(tensor, self.to_global_rank(dst), group=self.process_group, op=op) 84 | 85 | def all_reduce(self, 86 | tensor: torch.Tensor, 87 | stream = None, 88 | op=dist.ReduceOp.SUM): 89 | buffer = tensor.cpu() 90 | dist.all_reduce(buffer, group=self.process_group, op=op) 91 | tensor[:] = buffer.to(tensor.device) 92 | 93 | def gather(self, 94 | tensor: torch.Tensor, 95 | gather_list: List[torch.Tensor], 96 | dst: int, 97 | stream=None): 98 | dist.gather(tensor, gather_list, self.to_global_rank(dst), group=self.process_group) 99 | 100 | def all_to_all(self, 101 | output_tensor_list: List[torch.Tensor], 102 | input_tensor_list: List[torch.Tensor], 103 | stream=None): 104 | dist.all_to_all(output_tensor_list, input_tensor_list, group=self.process_group) 105 | 106 | def all_gather(self, 107 | tensor: torch.Tensor, 108 | output_tensor_list: List[torch.Tensor], 109 | stream=None): 110 | dist.all_gather(output_tensor_list, tensor, group=self.process_group) 111 | 112 | -------------------------------------------------------------------------------- 
/training/data_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/data_parallel/__init__.py -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | from comm.comm_utils import * 3 | from .flatten_utils import flatten_params 4 | 5 | 6 | class AllReduceDP: 7 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 8 | self.flatten = flatten 9 | self.global_rank = args.rank 10 | self.dp_group_size = args.data_group_size 11 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 12 | self.dp_comm = get_data_parallel_comm() 13 | self.dp_rank = get_data_parallel_rank() 14 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 15 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 16 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 17 | self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 18 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 19 | 20 | self.module = module 21 | num_paras, element_size = self._compute_total_para_num() 22 | print("Total number of parameters: {}, element size: {}, total size {} MB." 23 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 24 | 25 | if self.flatten: 26 | self.flatten_para = flatten_params(self.module.parameters()) 27 | print("Flattened parameter number: {}, element size: {}." 28 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 29 | print("Flattened parameter grad number: {}, element size: {}." 
30 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 31 | 32 | assert optimizer is not None 33 | self.optimizer = optimizer 34 | 35 | if self.enable_tidy_profiling: 36 | self.global_rank = args.rank 37 | self.init_event = None 38 | self.init_time_stamp = None 39 | if self.flatten: 40 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 41 | else: 42 | self.allreduce_gradients_start_events = dict() 43 | self.allreduce_gradients_end_events = dict() 44 | for name, _ in self.module.named_parameters(): 45 | self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 46 | self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 47 | 48 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 49 | blocking=False) 50 | 51 | def _compute_total_para_num(self): 52 | total_count = 0 53 | element_size = 0 54 | for para in self.module.parameters(): 55 | # print("Parameter: ", para.data.shape) 56 | total_count += torch.numel(para.data) 57 | element_size = para.element_size() 58 | return total_count, element_size 59 | 60 | def profile_mark_allreduce_start(self, name=None): 61 | if self.enable_tidy_profiling: 62 | if name is None: 63 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) 64 | else: 65 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) 66 | 67 | def profile_mark_allreduce_end(self, name=None): 68 | if self.enable_tidy_profiling: 69 | if name: 70 | self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) 71 | 72 | def profile_mark_optimizer_step_start(self): 73 | if self.enable_tidy_profiling: 74 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 75 | 76 | def _allreduce_gradients(self): 77 | with torch.cuda.stream(self.dp_comm_stream): 78 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 79 | self.dp_comm_stream.wait_event(self.backward_ready_event) 80 | if self.flatten: 81 | self.profile_mark_allreduce_start() 82 | self.dp_comm.all_reduce(self.flatten_para.grad, stream=cupy_dp_stream) 83 | self.profile_mark_allreduce_end() 84 | else: 85 | for name, para in self.module.named_parameters(): 86 | if para.grad is None: 87 | continue 88 | self.profile_mark_allreduce_start(name) 89 | self.dp_comm.all_reduce(para.grad, stream=cupy_dp_stream) 90 | self.profile_mark_allreduce_end(name) 91 | self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) 92 | 93 | def optimizer_step(self): 94 | self._allreduce_gradients() 95 | with torch.cuda.stream(self.torch_optim_comp_stream): 96 | self.torch_optim_comp_stream.wait_event(self.allreduce_grad_ready_event) 97 | self.profile_mark_optimizer_step_start() 98 | self.optimizer.step() 99 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 100 | 101 | def set_time_stamp(self, init_time_stamp, init_event): 102 | self.init_event = init_event 103 | self.init_time_stamp = init_time_stamp 104 | 105 | def get_ts(self, event): 106 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 107 | 108 | def profiling_data_parallel(self, init_time_stamp, init_event): 109 | self.set_time_stamp(init_time_stamp, init_event) 110 | profiling_log = [] 111 | 112 | if self.flatten: 113 | allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 114 | allreduce_log = {"name": 
"opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 115 | "ts": self.get_ts(self.allreduce_gradients_start_event), 116 | "dur": allreduce_slot, "cname": "cq_build_passed", 117 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 118 | # print(allreduce_log) 119 | profiling_log.append(allreduce_log) 120 | else: 121 | for name, para in self.module.named_parameters(): 122 | allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( 123 | self.allreduce_gradients_end_events[name]) * 1e+3 124 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 125 | "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, 126 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 127 | # print(allreduce_log) 128 | profiling_log.append(allreduce_log) 129 | 130 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 131 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", 132 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 133 | # print(optimizer_log) 134 | profiling_log.append(optimizer_log) 135 | return profiling_log 136 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_central_ps.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | from comm.comm_utils import * 3 | from .flatten_utils import flatten_params 4 | 5 | 6 | class CentralPSDP: 7 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 8 | self.flatten = flatten 9 | self.global_rank = args.rank 10 | self.dp_group_size = args.data_group_size 11 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 12 | self.dp_comm = get_data_parallel_comm() 13 | self.dp_rank = get_data_parallel_rank() 14 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 15 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 16 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 17 | self.broadcast_reduced_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 18 | blocking=False) 19 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | 21 | self.module = module 22 | num_paras, element_size = self._compute_total_para_num() 23 | print("Total number of parameters: {}, element size: {}, total size {} MB." 24 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 25 | 26 | if self.flatten: 27 | self.flatten_para = flatten_params(self.module.parameters()) 28 | print("Flattened parameter number: {}, element size: {}." 29 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 30 | print("Flattened parameter grad number: {}, element size: {}." 
31 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 32 | 33 | assert optimizer is not None 34 | self.optimizer = optimizer 35 | 36 | if self.enable_tidy_profiling: 37 | self.global_rank = args.rank 38 | self.init_event = None 39 | self.init_time_stamp = None 40 | if self.flatten: 41 | self.reduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 42 | self.reduce_gradients_end_event = torch.cuda.Event(enable_timing=True, blocking=False) 43 | self.broadcast_reduced_grad_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 44 | else: 45 | self.reduce_gradients_start_events = dict() 46 | self.reduce_gradients_end_events = dict() 47 | self.broadcast_reduced_grad_start_events = dict() 48 | self.broadcast_reduced_grad_end_events = dict() 49 | 50 | for name, _ in self.module.named_parameters(): 51 | self.reduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 52 | self.reduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 53 | self.broadcast_reduced_grad_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 54 | self.broadcast_reduced_grad_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 55 | 56 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 57 | 58 | def _compute_total_para_num(self): 59 | total_count = 0 60 | element_size = 0 61 | for para in self.module.parameters(): 62 | # print("Parameter: ", para.data.shape) 63 | total_count += torch.numel(para.data) 64 | element_size = para.element_size() 65 | return total_count, element_size 66 | 67 | def profile_mark_reduce_start(self, name=None): 68 | if self.enable_tidy_profiling: 69 | if name is None: 70 | self.dp_comm_stream.record_event(self.reduce_gradients_start_event) 71 | else: 72 | self.dp_comm_stream.record_event(self.reduce_gradients_start_events[name]) 73 | 74 | def profile_mark_reduce_end(self, name=None): 75 | if self.enable_tidy_profiling: 76 | if name is None: 77 | self.dp_comm_stream.record_event(self.reduce_gradients_end_event) 78 | else: 79 | self.dp_comm_stream.record_event(self.reduce_gradients_end_events[name]) 80 | 81 | def profile_mark_optimizer_step_start(self): 82 | if self.enable_tidy_profiling: 83 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 84 | 85 | def profile_mark_broadcast_start(self, name=None): 86 | if self.enable_tidy_profiling: 87 | if name is None: 88 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_event) 89 | else: 90 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_events[name]) 91 | 92 | def profile_mark_broadcast_end(self, name=None): 93 | if self.enable_tidy_profiling: 94 | if name: 95 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_end_events[name]) 96 | 97 | def _reduce_gradients(self): 98 | with torch.cuda.stream(self.dp_comm_stream): 99 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 100 | self.dp_comm_stream.wait_event(self.backward_ready_event) 101 | if self.flatten: 102 | self.profile_mark_reduce_start() 103 | self.dp_comm.reduce(self.flatten_para.grad, dst=0, stream=cupy_dp_stream) 104 | self.profile_mark_reduce_end() 105 | else: 106 | for name, para in self.module.named_parameters(): 107 | self.profile_mark_reduce_start(name) 108 | self.dp_comm.reduce(para.grad, dst=0, stream=cupy_dp_stream) 109 | self.profile_mark_reduce_end(name) 110 | 111 | def 
_broadcast_reduced_gradients(self): 112 | with torch.cuda.stream(self.dp_comm_stream): 113 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 114 | if self.flatten: 115 | self.profile_mark_broadcast_start() 116 | self.dp_comm.broadcast(self.flatten_para.grad, src=0, stream=cupy_dp_stream) 117 | self.profile_mark_broadcast_end() 118 | else: 119 | for name, para in self.module.named_parameters(): 120 | self.profile_mark_broadcast_start(name) 121 | self.dp_comm.broadcast(para.grad, src=0, stream=cupy_dp_stream) 122 | self.profile_mark_broadcast_end(name) 123 | self.dp_comm_stream.record_event(self.broadcast_reduced_gradients_ready_event) 124 | 125 | def optimizer_step(self): 126 | self._reduce_gradients() 127 | self._broadcast_reduced_gradients() 128 | with torch.cuda.stream(self.torch_optim_comp_stream): 129 | self.torch_optim_comp_stream.wait_event(self.broadcast_reduced_gradients_ready_event) 130 | self.profile_mark_optimizer_step_start() 131 | self.optimizer.step() 132 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 133 | 134 | def set_time_stamp(self, init_time_stamp, init_event): 135 | self.init_event = init_event 136 | self.init_time_stamp = init_time_stamp 137 | 138 | def get_ts(self, event): 139 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 140 | 141 | def profiling_data_parallel(self, init_time_stamp, init_event): 142 | self.set_time_stamp(init_time_stamp, init_event) 143 | profiling_log = [] 144 | if self.flatten: 145 | reduce_slot = self.reduce_gradients_start_event.elapsed_time(self.reduce_gradients_end_event) * 1e+3 146 | reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 147 | "ts": self.get_ts(self.reduce_gradients_start_event), 148 | "dur": reduce_slot, "cname": "cq_build_passed", 149 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 150 | # print(reduce_log) 151 | profiling_log.append(reduce_log) 152 | else: 153 | for name, para in self.module.named_parameters(): 154 | reduce_slot = self.reduce_gradients_start_events[name].elapsed_time( 155 | self.reduce_gradients_end_events[name]) * 1e+3 156 | reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 157 | "ts": self.get_ts(self.reduce_gradients_start_events[name]), "dur": reduce_slot, 158 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 159 | # print(reduce_log) 160 | profiling_log.append(reduce_log) 161 | 162 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 163 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", 164 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 165 | # print(optimizer_log) 166 | profiling_log.append(optimizer_log) 167 | 168 | if self.flatten: 169 | broadcast_slot = self.broadcast_reduced_grad_start_event.elapsed_time( 170 | self.broadcast_reduced_gradients_ready_event) * 1e+3 171 | broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. 
optimizer-comm", 172 | "ts": self.get_ts(self.broadcast_reduced_grad_start_event), 173 | "dur": broadcast_slot, "cname": "cq_build_passed", 174 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 175 | profiling_log.append(broadcast_log) 176 | else: 177 | for name, para in self.module.named_parameters(): 178 | broadcast_slot = self.broadcast_reduced_grad_start_events[name].elapsed_time( 179 | self.broadcast_reduced_grad_end_events[name]) * 1e+3 180 | broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 181 | "ts": self.get_ts(self.broadcast_reduced_grad_start_events[name]), "dur": broadcast_slot, 182 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 183 | # print(broadcast_log) 184 | profiling_log.append(broadcast_log) 185 | return profiling_log 186 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_local.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | import cupy 3 | from comm.comm_utils import * 4 | from .flatten_utils import flatten_params 5 | 6 | 7 | class LocalDP: 8 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 9 | flatten = True 10 | self.flatten = flatten 11 | self.global_rank = args.rank 12 | self.dp_group_size = args.data_group_size 13 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 14 | self.dp_comm = get_data_parallel_comm() 15 | self.dp_rank = get_data_parallel_rank() 16 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 17 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 18 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 19 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 21 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 22 | 23 | self.module = module 24 | num_paras, element_size = self._compute_total_para_num() 25 | print("Total number of parameters: {}, element size: {}, total size {} MB." 26 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 27 | 28 | if self.flatten: 29 | self.flatten_para = flatten_params(self.module.parameters()) 30 | print("Flattened parameter number: {}, element size: {}." 31 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 32 | print("Flattened parameter grad number: {}, element size: {}." 
33 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 34 | 35 | assert optimizer is not None 36 | self.optimizer = optimizer 37 | 38 | if self.enable_tidy_profiling: 39 | self.global_rank = args.rank 40 | self.init_event = None 41 | self.init_time_stamp = None 42 | if self.flatten: 43 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 44 | else: 45 | self.allreduce_gradients_start_events = dict() 46 | self.allreduce_gradients_end_events = dict() 47 | for name, _ in self.module.named_parameters(): 48 | self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 49 | self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 50 | 51 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 52 | blocking=False) 53 | 54 | def _compute_total_para_num(self): 55 | total_count = 0 56 | element_size = 0 57 | for para in self.module.parameters(): 58 | # print("Parameter: ", para.data.shape) 59 | total_count += torch.numel(para.data) 60 | element_size = para.element_size() 61 | return total_count, element_size 62 | 63 | def profile_mark_allreduce_start(self, name=None): 64 | if self.enable_tidy_profiling: 65 | if name is None: 66 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) 67 | else: 68 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) 69 | 70 | def profile_mark_allreduce_end(self, name=None): 71 | if self.enable_tidy_profiling: 72 | if name: 73 | self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) 74 | 75 | def profile_mark_optimizer_step_start(self): 76 | if self.enable_tidy_profiling: 77 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 78 | 79 | def allreduce_parameters(self): 80 | self._local_parameters_backup = [ 81 | p.data.clone() for p in self.module.parameters() 82 | ] 83 | torch.cuda.synchronize() 84 | self.dp_comm.barrier() 85 | with torch.cuda.stream(self.dp_comm_stream): 86 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 87 | self.dp_comm_stream.wait_event(self.backward_ready_event) 88 | if self.flatten: 89 | self.profile_mark_allreduce_start() 90 | self.dp_comm.all_reduce(self.flatten_para.data, stream=cupy_dp_stream) 91 | self.flatten_para.data /= self.dp_group_size 92 | self.profile_mark_allreduce_end() 93 | else: 94 | for name, para in self.module.named_parameters(): 95 | self.profile_mark_allreduce_start(name) 96 | self.dp_comm.all_reduce(para.data, stream=cupy_dp_stream) 97 | para.data /= self.dp_group_size 98 | self.profile_mark_allreduce_end(name) 99 | self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) 100 | torch.cuda.synchronize() 101 | self.dp_comm.barrier() 102 | 103 | def rollback_parameters(self): 104 | if not hasattr(self, '_local_parameters_backup'): 105 | return 106 | 107 | for p, p_local in zip(self.module.parameters(), self._local_parameters_backup): 108 | p.data[:] = p_local.data 109 | 110 | del self._local_parameters_backup 111 | 112 | 113 | def optimizer_step(self): 114 | # torch.cuda.synchronize() 115 | with torch.cuda.stream(self.torch_optim_comp_stream): 116 | self.torch_optim_comp_stream.record_event(self.allreduce_gradients_start_event) 117 | self.torch_optim_comp_stream.record_event(self.allreduce_grad_ready_event) 118 | self.torch_optim_comp_stream.wait_event(self.backward_ready_event) 119 | 
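            # Note: LocalDP's optimizer_step() applies a purely local update -- no
            # gradient communication happens here. Parameters are only averaged across
            # data-parallel ranks when allreduce_parameters() is called explicitly, and
            # rollback_parameters() can restore the pre-averaging weights afterwards.
            # The two record_event calls above appear to exist only so the tidy-profiling
            # timeline stays well-defined when no gradient all-reduce is performed.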
self.profile_mark_optimizer_step_start() 120 | self.optimizer.step() 121 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 122 | 123 | def set_time_stamp(self, init_time_stamp, init_event): 124 | self.init_event = init_event 125 | self.init_time_stamp = init_time_stamp 126 | 127 | def get_ts(self, event): 128 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 129 | 130 | def profiling_data_parallel(self, init_time_stamp, init_event): 131 | self.set_time_stamp(init_time_stamp, init_event) 132 | profiling_log = [] 133 | 134 | if self.flatten: 135 | allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 136 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 137 | "ts": self.get_ts(self.allreduce_gradients_start_event), 138 | "dur": allreduce_slot, "cname": "cq_build_passed", 139 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 140 | # print(allreduce_log) 141 | profiling_log.append(allreduce_log) 142 | else: 143 | for name, para in self.module.named_parameters(): 144 | allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( 145 | self.allreduce_gradients_end_events[name]) * 1e+3 146 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 147 | "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, 148 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 149 | # print(allreduce_log) 150 | profiling_log.append(allreduce_log) 151 | 152 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 153 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. 
optimizer-comp", 154 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 155 | # print(optimizer_log) 156 | profiling_log.append(optimizer_log) 157 | return profiling_log 158 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_sharded_ps.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.cuda 5 | from comm.comm_utils import * 6 | from .flatten_utils import flatten_params 7 | 8 | 9 | class ShardedPSDP: 10 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 11 | self.flatten = flatten 12 | self.global_rank = args.rank 13 | self.dp_group_size = args.data_group_size 14 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 15 | self.dp_comm = get_data_parallel_comm() 16 | self.dp_rank = get_data_parallel_rank() 17 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 18 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 19 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | self.sync_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 21 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 22 | 23 | self.module = module 24 | assert optimizer is not None 25 | self.optimizer = optimizer 26 | num_paras, element_size = self._compute_total_para_num() 27 | print("Total number of parameters: {}, element size: {}, total size {} MB." 28 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 29 | 30 | assert self.flatten 31 | # self.para = list(self.module.parameters()) 32 | self.flatten_para = flatten_params(self.module.parameters(), self.dp_group_size) 33 | print("Flattened parameter number: {}, element size: {}." 34 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 35 | print("Flattened parameter grad number: {}, element size: {}." 
36 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 37 | 38 | self.grad_buffer = self._declare_grad_buffer() 39 | 40 | if self.enable_tidy_profiling: 41 | self.global_rank = args.rank 42 | self.init_event = None 43 | self.init_time_stamp = None 44 | 45 | assert self.flatten 46 | self.sync_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 47 | 48 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 49 | 50 | def _compute_total_para_num(self): 51 | total_count = 0 52 | element_size = 0 53 | for para in self.module.parameters(): 54 | # print("Parameter: ", para.data.shape) 55 | total_count += torch.numel(para.data) 56 | element_size = para.element_size() 57 | return total_count, element_size 58 | 59 | def _declare_grad_buffer(self): 60 | assert self.flatten_para.data.numel() % self.dp_group_size == 0 61 | chunk_size = self.flatten_para.data.numel() // self.dp_group_size 62 | grad_buffer = [torch.zeros(chunk_size, device=self.flatten_para.device, dtype=self.flatten_para.dtype) 63 | for _ in range(self.dp_group_size)] 64 | return grad_buffer 65 | 66 | def profile_mark_sync_grad_start(self): 67 | if self.enable_tidy_profiling: 68 | self.dp_comm_stream.record_event(self.sync_gradients_start_event) 69 | 70 | def profile_mark_allreduce_end(self): 71 | pass 72 | 73 | def profile_mark_optimizer_step_start(self): 74 | if self.enable_tidy_profiling: 75 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 76 | 77 | def _sync_gradients(self): 78 | with torch.cuda.stream(self.dp_comm_stream): 79 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 80 | self.dp_comm_stream.wait_event(self.backward_ready_event) 81 | assert self.flatten 82 | self.profile_mark_sync_grad_start() 83 | self.dp_comm.all_reduce_opt(self.flatten_para.grad, self.grad_buffer, stream=cupy_dp_stream) 84 | self.profile_mark_allreduce_end() 85 | self.dp_comm_stream.record_event(self.sync_gradients_ready_event) 86 | 87 | def optimizer_step(self): 88 | self._sync_gradients() 89 | with torch.cuda.stream(self.torch_optim_comp_stream): 90 | self.torch_optim_comp_stream.wait_event(self.sync_gradients_ready_event) 91 | self.profile_mark_optimizer_step_start() 92 | self.optimizer.step() 93 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 94 | 95 | def set_time_stamp(self, init_time_stamp, init_event): 96 | self.init_event = init_event 97 | self.init_time_stamp = init_time_stamp 98 | 99 | def get_ts(self, event): 100 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 101 | 102 | def profiling_data_parallel(self, init_time_stamp, init_event): 103 | self.set_time_stamp(init_time_stamp, init_event) 104 | profiling_log = [] 105 | 106 | assert self.flatten 107 | allreduce_slot = self.sync_gradients_start_event.elapsed_time(self.sync_gradients_ready_event)*1e+3 108 | allreduce_log = {"name": "opt_shardedPS_sync", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 109 | "ts": self.get_ts(self.sync_gradients_start_event), 110 | "dur": allreduce_slot, "cname": "cq_build_passed", 111 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 112 | # print(allreduce_log) 113 | profiling_log.append(allreduce_log) 114 | 115 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 116 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. 
optimizer-comp", 117 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 118 | # print(optimizer_log) 119 | profiling_log.append(optimizer_log) 120 | return profiling_log 121 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_utils.py: -------------------------------------------------------------------------------- 1 | from .dist_dp_allreduce import AllReduceDP 2 | from .dist_dp_sharded_ps import ShardedPSDP 3 | from .dist_dp_local import LocalDP 4 | 5 | 6 | def get_dp_module(args, device, module, optimizer): 7 | print("Data parallel implementation: ", args.dp_mode) 8 | if args.dp_mode == 'allreduce': 9 | return AllReduceDP(args, device, module, optimizer, flatten=False) 10 | # flatten gradient is not compatible with fp16 now 11 | elif args.dp_mode == 'local': 12 | return LocalDP(args, device, module, optimizer, flatten=False) 13 | elif args.dp_mode == 'sharded_ps': 14 | return ShardedPSDP(args, device, module, optimizer, flatten=False) 15 | else: 16 | print("Not recognize this data parallel mode.") 17 | assert False 18 | -------------------------------------------------------------------------------- /training/data_parallel/flatten_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _assert_contiguous(tensors): 5 | data_ptr = None 6 | for t in tensors: 7 | if data_ptr is not None: 8 | assert t.data_ptr() == data_ptr 9 | data_ptr = t.data_ptr() + t.numel() * t.element_size() 10 | 11 | 12 | def flatten_params(param_set, chunk=None): 13 | params = [p for p in param_set] 14 | weights = [p.data for p in params] 15 | grads = [p.grad.data if p.grad is not None else torch.zeros_like(p.data) for p in params] 16 | sizes = [p.numel() for p in params] 17 | total_size = sum(sizes) 18 | if chunk: 19 | total_size = ((total_size+chunk-1)//chunk)*chunk 20 | 21 | flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 22 | flatten_grads_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 23 | flatten_weights_storage = flatten_weights_tensor.storage() 24 | flatten_grads_storage = flatten_grads_tensor.storage() 25 | 26 | def set_storage(param, weight_storage, grad_storage, storage_offset): 27 | with torch.no_grad(): 28 | z = torch.zeros_like(param.data) 29 | z.set_(weight_storage, storage_offset, param.shape) 30 | param.data = z 31 | 32 | t = torch.zeros_like(param.data) 33 | t.set_(grad_storage, storage_offset, param.shape) 34 | param.grad = t 35 | 36 | offset = 0 37 | for i in range(len(params)): 38 | flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) 39 | flatten_grads_tensor[offset: offset + grads[i].numel()] = grads[i].reshape(-1) 40 | set_storage(params[i], flatten_weights_storage, flatten_grads_storage, offset) 41 | offset += sizes[i] 42 | 43 | weight_tensors = [p.data for p in params] 44 | grad_tensors = [p.grad.data for p in params] 45 | 46 | _assert_contiguous(weight_tensors) 47 | _assert_contiguous(grad_tensors) 48 | 49 | with torch.no_grad(): 50 | flatten_para = torch.nn.Parameter(flatten_weights_tensor, requires_grad=False) 51 | flatten_para.grad = flatten_grads_tensor 52 | return flatten_para 53 | 54 | 55 | def flatten_tensors(tensor_set, chunk=None): 56 | tensors = [p for p in tensor_set] 57 | weights = [p.data for p in tensors] 58 | sizes = [p.numel() for p in tensors] 59 | total_size = sum(sizes) 60 | if chunk: 61 | 
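        # Round the flattened size up to a multiple of `chunk` (ceil division) so the
        # buffer can be split into equal shards -- e.g. total_size=10, chunk=4 pads to 12.
        # flatten_params above applies the same padding; ShardedPSDP relies on it by
        # passing chunk=dp_group_size and asserting numel() % dp_group_size == 0.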
total_size = ((total_size+chunk-1)//chunk)*chunk 62 | 63 | flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 64 | flatten_weights_storage = flatten_weights_tensor.storage() 65 | 66 | def set_storage(param, weight_storage, storage_offset): 67 | with torch.no_grad(): 68 | z = torch.zeros_like(param.data) 69 | z.set_(weight_storage, storage_offset, param.shape) 70 | param.data = z 71 | 72 | offset = 0 73 | for i in range(len(tensors)): 74 | flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) 75 | set_storage(tensors[i], flatten_weights_storage, offset) 76 | offset += sizes[i] 77 | 78 | return flatten_weights_tensor 79 | -------------------------------------------------------------------------------- /training/finetune_GPT-NeoXT-Chat-Base-20B.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=GPT-Neo-XT-Chat-Base-20B 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="\ 17 | ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ 18 | ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ 19 | ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ 20 | ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ 21 | ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ 22 | ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ 23 | ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ 24 | ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ 25 | ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ 26 | ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ 27 | ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ 28 | ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ 29 | ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ 30 | ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ 31 | ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ 32 | ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ 33 | ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ 34 | ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ 35 | ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ 36 | ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ 37 | ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ 38 | ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ 39 | ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ 40 | ${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ 41 | ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ 42 | ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ 43 | " 44 | 45 | ARGS="--model-name ${BASE_MODEL} \ 46 | --tokenizer-name ${BASE_MODEL} \ 47 | --project-name together \ 48 | --model-type gptneox \ 49 | --optimizer adam \ 50 | --seed 42 \ 51 | --load-pretrained-model true \ 52 | --task-name \ 53 | "${DATASETS}" \ 54 | --checkpoint-path ${CHECKPOINT_PATH} \ 55 | --total-steps ${TOTAL_STEPS} --warmup-steps 10 
--train-warmup-steps 0 \ 56 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 57 | --lr 1e-6 --seq-length 2048 --batch-size 64 --micro-batch-size 1 --gradient-accumulate-step 1 \ 58 | --dist-url tcp://127.0.0.1:7033 \ 59 | --num-layers 6 --embedding-dim 6144 \ 60 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 61 | --job-id 0 --net-interface ${netif} \ 62 | --fp16 \ 63 | --dp-backend nccl \ 64 | --dp-mode allreduce \ 65 | --pp-mode gpipe --profiling no-profiling" 66 | 67 | 68 | (trap 'kill 0' SIGINT; \ 69 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 70 | & \ 71 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 72 | & \ 73 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 74 | & \ 75 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 76 | & \ 77 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 78 | & \ 79 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 80 | & \ 81 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 82 | & \ 83 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 84 | & \ 85 | wait) 86 | -------------------------------------------------------------------------------- /training/finetune_Pythia-Chat-Base-7B.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=Pythia-Chat-Base-7B 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="\ 17 | ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ 18 | ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ 19 | ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ 20 | ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ 21 | ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ 22 | ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ 23 | ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ 24 | ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ 25 | ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ 26 | ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ 27 | ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ 28 | ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ 29 | ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ 30 | ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ 31 | ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ 32 | ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ 33 | ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ 34 | ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ 35 | ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ 36 | ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ 37 | ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ 38 | ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ 39 | ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ 40 | 
${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ 41 | ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ 42 | ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ 43 | " 44 | 45 | ARGS="--model-name ${BASE_MODEL} \ 46 | --tokenizer-name ${BASE_MODEL} \ 47 | --project-name together \ 48 | --model-type gptneox \ 49 | --optimizer adam \ 50 | --seed 42 \ 51 | --load-pretrained-model true \ 52 | --task-name \ 53 | "${DATASETS}" \ 54 | --checkpoint-path ${CHECKPOINT_PATH} \ 55 | --total-steps ${TOTAL_STEPS} --warmup-steps 10 --train-warmup-steps 0 \ 56 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 57 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 58 | --dist-url tcp://127.0.0.1:7033 \ 59 | --num-layers 8 --embedding-dim 4096 \ 60 | --world-size 8 --pipeline-group-size 4 --data-group-size 2 \ 61 | --job-id 0 --net-interface ${netif} \ 62 | --fp16 \ 63 | --dp-backend nccl \ 64 | --dp-mode allreduce \ 65 | --pp-mode gpipe --profiling no-profiling" 66 | 67 | 68 | (trap 'kill 0' SIGINT; \ 69 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 70 | & \ 71 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 72 | & \ 73 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 74 | & \ 75 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 76 | & \ 77 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 78 | & \ 79 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 80 | & \ 81 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 82 | & \ 83 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 84 | & \ 85 | wait) 86 | -------------------------------------------------------------------------------- /training/finetune_RedPajama-INCITE-7B-Chat.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=redpajama-incite-chat-3b-sample 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/RedPajama-7B/togethercomputer_RedPajama-INCITE-7B-Chat" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type gptneox \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 2560 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | 41 | (trap 'kill 0' SIGINT; \ 42 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 43 | & \ 44 | 
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 45 | & \ 46 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 47 | & \ 48 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 49 | & \ 50 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 51 | & \ 52 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 53 | & \ 54 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 55 | & \ 56 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 57 | & \ 58 | wait) 59 | -------------------------------------------------------------------------------- /training/finetune_RedPajama-INCITE-Chat-3B-v1.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=redpajama-incite-chat-3b-sample 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type gptneox \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 2560 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | 41 | (trap 'kill 0' SIGINT; \ 42 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 43 | & \ 44 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 45 | & \ 46 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 47 | & \ 48 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 49 | & \ 50 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 51 | & \ 52 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 53 | & \ 54 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 55 | & \ 56 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 57 | & \ 58 | wait) 59 | -------------------------------------------------------------------------------- /training/finetune_llama-2-7b-32k-booksum.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=llama-2-7b-32k-booksum 7 | 8 | export SHOW_DATA=1 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" 11 | 
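# The step/checkpoint schedule below can be overridden from the environment when
# launching this script -- illustrative invocation (hypothetical values):
#   FINETUNE_TOTAL_STEPS=100 FINETUNE_CHECKPOINT_STEPS=50 bash training/finetune_llama-2-7b-32k-booksum.sh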
12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/booksum.jsonl.zst:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type llama \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 4096 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | (trap 'kill 0' SIGINT; \ 41 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 42 | & \ 43 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 44 | & \ 45 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 46 | & \ 47 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 48 | & \ 49 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 50 | & \ 51 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 52 | & \ 53 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 54 | & \ 55 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 56 | & \ 57 | wait) 58 | -------------------------------------------------------------------------------- /training/finetune_llama-2-7b-32k-mqa.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=llama-2-7b-32k-mqa 7 | 8 | export SHOW_DATA=1 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/natural_questions_10_200_docs.jsonl.zst:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type llama \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 4096 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 
38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | (trap 'kill 0' SIGINT; \ 41 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 42 | & \ 43 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 44 | & \ 45 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 46 | & \ 47 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 48 | & \ 49 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 50 | & \ 51 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 52 | & \ 53 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 54 | & \ 55 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 56 | & \ 57 | wait) 58 | -------------------------------------------------------------------------------- /training/lora/example/redpajama-incite-chat-3b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | import torch 5 | import transformers 6 | import torch.nn as nn 7 | import bitsandbytes as bnb 8 | from datasets import Dataset 9 | from peft import LoraConfig, get_peft_model 10 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 11 | 12 | # this script should take around 14GB VRAM 13 | 14 | MODEL_NAME='redpajama-incite-chat-3b-sample-lowrank' 15 | 16 | # read datasets 17 | with open('data/OIG-chip2/unified_chip2.jsonl', 'r') as fp: 18 | data = [json.loads(x) for x in fp.readlines()] 19 | 20 | model = AutoModelForCausalLM.from_pretrained( 21 | "togethercomputer/RedPajama-INCITE-Chat-3B-v1", 22 | device_map='auto', 23 | ) 24 | 25 | tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") 26 | tokenizer.pad_token = tokenizer.eos_token 27 | 28 | for param in model.parameters(): 29 | param.requires_grad = False # freeze the model - train adapters later 30 | if param.ndim == 1: 31 | # cast the small parameters (e.g. layernorm) to fp32 for stability 32 | param.data = param.data.to(torch.float32) 33 | 34 | model.gradient_checkpointing_enable() # reduce number of stored activations 35 | model.enable_input_require_grads() 36 | 37 | def print_trainable_parameters(model): 38 | """ 39 | Prints the number of trainable parameters in the model. 
40 | """ 41 | trainable_params = 0 42 | all_param = 0 43 | for _, param in model.named_parameters(): 44 | all_param += param.numel() 45 | if param.requires_grad: 46 | trainable_params += param.numel() 47 | print( 48 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 49 | ) 50 | 51 | config = LoraConfig( 52 | r=16, 53 | lora_alpha=32, 54 | target_modules=["query_key_value", "xxx"], 55 | lora_dropout=0.05, 56 | bias="none", 57 | task_type="CAUSAL_LM" 58 | ) 59 | 60 | model = get_peft_model(model, config) 61 | print_trainable_parameters(model) 62 | 63 | ## Training 64 | 65 | data = Dataset.from_list(data) 66 | data = data.map(lambda samples: tokenizer(samples['text']), batched=True) 67 | 68 | trainer = transformers.Trainer( 69 | model=model, 70 | train_dataset=data, 71 | args=transformers.TrainingArguments( 72 | per_device_train_batch_size=4, 73 | gradient_accumulation_steps=4, 74 | warmup_steps=100, 75 | max_steps=200, 76 | learning_rate=2e-4, 77 | fp16=True, 78 | logging_steps=1, 79 | output_dir='outputs' 80 | ), 81 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) 82 | ) 83 | model.config.use_cache = False # silence the warnings. Please re-enable for inference! 84 | trainer.train() 85 | 86 | # save the trained adapter to disk 87 | model.save_pretrained(f"outputs/{MODEL_NAME}") 88 | -------------------------------------------------------------------------------- /training/lora/example/redpajama-incite-chat-3b_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from peft import PeftModel, PeftConfig 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | 5 | peft_model_path ='outputs/redpajama-incite-chat-3b-sample-lowrank' 6 | 7 | config = PeftConfig.from_pretrained(peft_model_path) 8 | model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto') 9 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) 10 | 11 | # Load the Lora model 12 | model = PeftModel.from_pretrained(model, peft_model_path) 13 | 14 | batch = tokenizer(": Hello!\n:", return_tensors='pt') 15 | 16 | with torch.cuda.amp.autocast(): 17 | output_tokens = model.generate(**batch, max_new_tokens=50) 18 | 19 | print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) 20 | -------------------------------------------------------------------------------- /training/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/modules/__init__.py -------------------------------------------------------------------------------- /training/modules/dist_deberta_pp_module.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .deberta_modules import DebertaV2Embeddings, DebertaV2Layers, DebertaClassificationHead 3 | 4 | 5 | class DebertaStageBase(nn.Module): 6 | def __init__(self, args, config): 7 | super().__init__() 8 | self._to_cpu = False # (args.dist_backend == "gloo") 9 | self.config = config 10 | 11 | def _create_first_layer(self): 12 | return DebertaV2Embeddings(self.config) 13 | 14 | def _create_last_layer(self): 15 | return DebertaClassificationHead(self.config) 16 | 17 | def _create_transformer_layers(self, first_block=False): 18 
| return DebertaV2Layers(self.config, first_block=first_block) # TODO: checkpoint 19 | 20 | 21 | class DebertaStageFirst(DebertaStageBase): 22 | def __init__(self, args, config, device): 23 | super().__init__(args, config) 24 | self.device = device 25 | self.embeddings = self._create_first_layer().to(device) 26 | self.encoder = self._create_transformer_layers(first_block=True).to(device) 27 | 28 | def forward(self, x, token_type_ids=None, attention_mask=None): 29 | if self._to_cpu: 30 | x = x.to(self.device) 31 | if token_type_ids is not None: 32 | token_type_ids = token_type_ids.to(self.device) 33 | if attention_mask is not None: 34 | attention_mask = attention_mask.to(self.device) 35 | x = self.embeddings(x, token_type_ids=token_type_ids) 36 | out = self.encoder(x, attention_mask=attention_mask) 37 | return out.cpu() if self._to_cpu else out 38 | 39 | 40 | class DebertaStageMiddle(DebertaStageBase): 41 | def __init__(self, args, config, device): 42 | super().__init__(args, config) 43 | self.device = device 44 | self.encoder = self._create_transformer_layers(first_block=False).to(device) 45 | 46 | def forward(self, x, attention_mask=None): 47 | if self._to_cpu: 48 | x = x.to(self.device) 49 | if attention_mask is not None: 50 | attention_mask = attention_mask.to(self.device) 51 | out = self.encoder(x, attention_mask=attention_mask) 52 | return out.cpu() if self._to_cpu else out 53 | 54 | 55 | class DebertaStageLast(DebertaStageBase): 56 | def __init__(self, args, config, device): 57 | super().__init__(args, config) 58 | self.device = device 59 | self.encoder = self._create_transformer_layers(first_block=False).to(device) 60 | self.output_head = self._create_last_layer().to(device) 61 | 62 | def forward(self, x, attention_mask=None, input_ids=None): 63 | if self._to_cpu: 64 | x = x.to(self.device) 65 | if attention_mask is not None: 66 | attention_mask = attention_mask.to(self.device) 67 | x = self.encoder(x, attention_mask=attention_mask) 68 | out = self.output_head(x) 69 | return out.cpu() if self._to_cpu else out -------------------------------------------------------------------------------- /training/modules/dist_gpt_fsdp_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP 3 | from .task_modules import GlueClassification 4 | from .gpt_modules import MultiHeadAttention, TwoLayerMLP, GPTEmbedding 5 | from fairscale.nn.checkpoint import checkpoint_wrapper 6 | 7 | 8 | # This is only implemented to support checkpoint in FSDP 9 | 10 | class GPTTransformerFsdpLayer(torch.nn.Module): 11 | def __init__(self, model_dim, head_num, feedforward_dim=2048, layer_norm_eps=1e-5, use_checkpoint=True, 12 | explicit_fsdp=False) -> None: 13 | super(GPTTransformerFsdpLayer, self).__init__() 14 | self.attn = MultiHeadAttention(model_dim, head_num) 15 | if use_checkpoint: 16 | self.attn = checkpoint_wrapper(self.attn) 17 | if explicit_fsdp: 18 | self.attn = FSDP(self.attn, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 19 | flatten_parameters=False) 20 | # Implementation of Feedforward model 21 | self.mlp = TwoLayerMLP(model_dim, feedforward_dim) 22 | if use_checkpoint: 23 | self.mlp = checkpoint_wrapper(self.mlp) 24 | if explicit_fsdp: 25 | self.mlp = FSDP(self.mlp, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 26 | flatten_parameters=False) 27 | self.norm1 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) 28 |
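        # (Both LayerNorms are kept outside the checkpoint_wrapper / FSDP wrapping above;
        # only the attention and MLP sub-modules are optionally checkpointed and sharded.)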
self.norm2 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) 29 | # self.dropout1 = nn.Dropout(dropout) 30 | # self.dropout2 = nn.Dropout(dropout) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | x = self.norm1(x) 34 | # x = x + self.dropout_1(self.attn(x2, x2, x2)) 35 | x.requires_grad_(True) 36 | x = self.attn(x) 37 | x = self.norm2(x) 38 | # x = x + self.dropout_2(self.ff(x2)) 39 | x.requires_grad_(True) 40 | x = self.mlp(x) 41 | return x 42 | 43 | 44 | class GPTGlueFsdpModel(torch.nn.Module): 45 | def __init__(self, args, vocab_size, num_classes, use_checkpoint=True): 46 | super(GPTGlueFsdpModel, self).__init__() 47 | self.embedding = GPTEmbedding(vocab_size, args.embedding_dim, args.seq_length) 48 | 49 | module_list = [] 50 | for _ in range(args.num_layers): 51 | module_list.append(GPTTransformerFsdpLayer(args.embedding_dim, args.num_heads, 52 | args.embedding_dim * 4, use_checkpoint, explicit_fsdp=False)) 53 | self.transformers = torch.nn.Sequential(*module_list) 54 | self.classifier = GlueClassification(args.embedding_dim, num_classes) 55 | 56 | def forward(self, input_ids, position_ids): 57 | input_emb = self.embedding(input_ids, position_ids) 58 | output_emb = self.transformers(input_emb) 59 | return self.classifier(output_emb) 60 | 61 | 62 | class GPTFsdpStageBase(torch.nn.Module): 63 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, use_checkpoint=True, explicit_fsdp=True): 64 | super(GPTFsdpStageBase, self).__init__() 65 | self._vocab_size = vocab_size 66 | self._explicit_fsdp = explicit_fsdp 67 | self._use_checkpoint = use_checkpoint 68 | self._embedding_dim = args.embedding_dim # embedding dimension 69 | self._seq_length = args.seq_length 70 | self._num_classes = num_classes 71 | # the dimension of the feedforward aws_network model in nn.TransformerEncoder 72 | self._feedforward_dim = args.embedding_dim * 4 73 | self._num_heads = args.num_heads # the number of heads in the multi-head attention models 74 | self._num_layers = num_stage_layers 75 | 76 | def _create_first_layer(self): 77 | emb = GPTEmbedding(self._vocab_size, self._embedding_dim, self._seq_length) 78 | if self._explicit_fsdp: 79 | return FSDP(emb, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 80 | flatten_parameters=False) 81 | else: 82 | return emb 83 | 84 | def _create_last_layer(self): 85 | classifier = GlueClassification(self._embedding_dim, self._num_classes) 86 | if self._explicit_fsdp: 87 | return FSDP(classifier, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 88 | flatten_parameters=False) 89 | else: 90 | return classifier 91 | 92 | def _create_fsdp_transformer_layer(self): 93 | return GPTTransformerFsdpLayer(self._embedding_dim, self._num_heads, self._feedforward_dim, 94 | use_checkpoint=self._use_checkpoint, explicit_fsdp=self._explicit_fsdp) 95 | 96 | 97 | class GPTFsdpStageFirst(GPTFsdpStageBase): 98 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 99 | super(GPTFsdpStageFirst, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 100 | explicit_fsdp) 101 | self.device = device 102 | module_list = [self._create_first_layer()] 103 | for _ in range(self._num_layers): 104 | module_list.append(self._create_fsdp_transformer_layer()) 105 | self.model = torch.nn.Sequential(*module_list).to(device) 106 | 107 | def forward(self, x): 108 | out = self.model(x) 109 | return out 110 | 111 | 112 | class 
GPTFsdpStageMiddle(GPTFsdpStageBase): 113 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 114 | super(GPTFsdpStageMiddle, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 115 | explicit_fsdp) 116 | self.device = device 117 | module_list = [] 118 | for _ in range(self._num_layers): 119 | module_list.append(self._create_fsdp_transformer_layer()) 120 | self.model = torch.nn.Sequential(*module_list).to(device) 121 | 122 | def forward(self, x): 123 | out = self.model(x) 124 | return out 125 | 126 | 127 | class GPTFsdpStageLast(GPTFsdpStageBase): 128 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 129 | super(GPTFsdpStageLast, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 130 | explicit_fsdp) 131 | self.device = device 132 | module_list = [] 133 | for _ in range(self._num_layers): 134 | module_list.append(self._create_fsdp_transformer_layer()) 135 | module_list.append(self._create_last_layer()) 136 | self.model = torch.nn.Sequential(*module_list).to(device) 137 | 138 | def forward(self, x): 139 | out = self.model(x) 140 | return out 141 | -------------------------------------------------------------------------------- /training/modules/dist_gpt_pp_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from comm.comm_utils import * 4 | 5 | from copy import deepcopy 6 | 7 | 8 | class GPTStageBase(nn.Module): 9 | def __init__(self, args, config): 10 | super(GPTStageBase, self).__init__() 11 | self._to_cpu = (args.dist_backend == "gloo") 12 | self._embedding_dim = args.embedding_dim # embedding dimension 13 | self._seq_length = args.seq_length 14 | # the dimension of the feedforward aws_network model in nn.TransformerEncoder 15 | self._feedforward_dim = args.embedding_dim * 4 16 | self._num_heads = args.num_heads # the number of heads in the multi-head attention models 17 | self._num_layers = args.num_layers 18 | self._layer_begin = get_pipeline_parallel_rank() * args.num_layers 19 | self._layer_end = min(self._layer_begin + args.num_layers, args.max_layers) 20 | 21 | self._task_type = getattr(args, 'task_type', 'language_model') 22 | 23 | self.load_pretrained_model = args.load_pretrained_model 24 | self.model_name = args.model_name 25 | self.config = config 26 | 27 | if hasattr(args, 'model_type'): 28 | if args.model_type == "gpt2": 29 | from .hf_gpt2_modules import GPTEmbeddings, GPTBlock, GPTLMHead 30 | elif args.model_type == "gptj": 31 | from .hf_gptj_modules import GPTEmbeddings, GPTBlock, GPTLMHead 32 | elif args.model_type == "gptneox": 33 | from .hf_gptneox_modules import GPTEmbeddings, GPTBlock, GPTLMHead 34 | elif args.model_type == 'llama': 35 | from .llama_modules import GPTEmbeddings, GPTBlock, GPTLMHead 36 | else: 37 | raise Exception("unknown") 38 | else: 39 | raise Exception("!!!! 
model type not defined") 40 | 41 | self._GPTEmbeddings = GPTEmbeddings 42 | self._GPTBlock = GPTBlock 43 | self._GPTLMHead = GPTLMHead 44 | 45 | def _create_first_layer(self): 46 | layer = self._GPTEmbeddings(deepcopy(self.config)) 47 | if self.load_pretrained_model: 48 | print('loading embs') 49 | ret = layer.load_state_dict( 50 | torch.load(f'{self.model_name}/pytorch_embs.pt'), strict=False 51 | ) 52 | if len(ret.missing_keys): 53 | print('The following weight keys are missing:') 54 | print(ret.missing_keys) 55 | if len(ret.unexpected_keys): 56 | print('The following weight keys are unexpected:') 57 | print(ret.unexpected_keys) 58 | return layer 59 | 60 | def _create_last_layer(self): 61 | layer = self._GPTLMHead(deepcopy(self.config)) 62 | if self.load_pretrained_model: 63 | print('loading lm_head') 64 | ret = layer.load_state_dict( 65 | torch.load(f'{self.model_name}/pytorch_lm_head.pt'), strict=False 66 | ) 67 | if len(ret.missing_keys): 68 | print('The following weight keys are missing:') 69 | print(ret.missing_keys) 70 | if len(ret.unexpected_keys): 71 | print('The following weight keys are unexpected:') 72 | print(ret.unexpected_keys) 73 | return layer 74 | 75 | def _create_transformer_layer(self, layer_idx=0): 76 | config = deepcopy(self.config) 77 | layer = self._GPTBlock(config, layer_id=layer_idx) # TODO: checkpoint 78 | if self.load_pretrained_model: 79 | print(f'loading layer {layer_idx}') 80 | ret = layer.load_state_dict( 81 | torch.load(f'{self.model_name}/pytorch_{layer_idx}.pt'), strict=False 82 | ) 83 | if len(ret.missing_keys): 84 | print('The following weight keys are missing:') 85 | print(ret.missing_keys) 86 | if len(ret.unexpected_keys): 87 | print('The following weight keys are unexpected:') 88 | print(ret.unexpected_keys) 89 | return layer 90 | 91 | 92 | class GPTStageFull(GPTStageBase): 93 | def __init__(self, args, config, device): 94 | super(GPTStageFull, self).__init__(args, config) 95 | self.device = device 96 | module_list = [self._create_first_layer()] 97 | for layer_idx in range(self._layer_begin, self._layer_end): 98 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 99 | if hasattr(args, 'skip_lm_head') and args.skip_lm_head: 100 | pass 101 | else: 102 | module_list.append(self._create_last_layer()) 103 | self.model = nn.Sequential(*module_list).to(device) 104 | 105 | def forward(self, x, **kargs): 106 | for module in self.model: 107 | x = module(x, **kargs) 108 | return x 109 | 110 | 111 | class GPTStageFirst(GPTStageBase): 112 | def __init__(self, args, config, device): 113 | super(GPTStageFirst, self).__init__(args, config) 114 | self.device = device 115 | module_list = [self._create_first_layer()] 116 | for layer_idx in range(self._layer_begin, self._layer_end): 117 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 118 | self.model = nn.Sequential(*module_list).to(device) 119 | 120 | def forward(self, x, **kargs): 121 | for module in self.model: 122 | x = module(x, **kargs) 123 | return x 124 | # out = self.model(x.to(self.device), **kargs) 125 | # return out.cpu() if self._to_cpu else out 126 | 127 | 128 | class GPTStageMiddle(GPTStageBase): 129 | def __init__(self, args, config, device): 130 | super(GPTStageMiddle, self).__init__(args, config) 131 | self.device = device 132 | module_list = [] 133 | for layer_idx in range(self._layer_begin, self._layer_end): 134 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 135 | self.model = nn.Sequential(*module_list).to(device) 136 | 137 
| def forward(self, x, **kargs): 138 | for module in self.model: 139 | x = module(x, **kargs) 140 | return x 141 | # out = self.model(x.to(self.device), **kargs) if self._to_cpu else self.model(x) 142 | # return out.cpu() if self._to_cpu else out 143 | 144 | 145 | class GPTStageLast(GPTStageBase): 146 | def __init__(self, args, config, device): 147 | super(GPTStageLast, self).__init__(args, config) 148 | self.device = device 149 | module_list = [] 150 | for layer_idx in range(self._layer_begin, self._layer_end): 151 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 152 | 153 | if hasattr(args, 'skip_lm_head') and args.skip_lm_head: 154 | pass 155 | else: 156 | module_list.append(self._create_last_layer()) 157 | 158 | self.model = nn.Sequential(*module_list).to(device) 159 | 160 | # self.upscale_last = nn.Linear(args.embedding_dim, 9216).to(device) 161 | 162 | def forward(self, x, **kargs): 163 | for module in self.model: 164 | x = module(x, **kargs) 165 | 166 | return x 167 | 168 | # def forward(self, x, **kargs): 169 | # for module in self.model[:-1]: 170 | # x = module(x, **kargs) 171 | # hid = x 172 | # x = self.model[-1](x, **kargs) 173 | 174 | # hid = self.upscale_last(hid) 175 | # loss = torch.nn.functional.mse_loss(hid, kargs['teacher_hidden_states']) 176 | # print(loss.item()) 177 | # return x, loss 178 | -------------------------------------------------------------------------------- /training/modules/task_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GlueClassification(torch.nn.Module): 5 | def __init__(self, model_dim, num_classes): 6 | super(GlueClassification, self).__init__() 7 | self.model_dim = model_dim 8 | self.num_classes = num_classes 9 | self.pooler_layer = torch.nn.Linear(model_dim, model_dim) 10 | self.fc_layer = torch.nn.Linear(model_dim, num_classes) 11 | 12 | def forward(self, hidden_states, pooler_index=0): 13 | pooled = hidden_states[:, pooler_index, :] 14 | pooled = self.pooler_layer(pooled) 15 | pooled = torch.tanh(pooled) 16 | return self.fc_layer(pooled) 17 | -------------------------------------------------------------------------------- /training/modules/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import AutoTokenizer, GPT2TokenizerFast, DebertaV2Tokenizer 3 | 4 | def build_tokenizer(args): 5 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) 6 | if tokenizer.pad_token is None: 7 | tokenizer.pad_token = tokenizer.eos_token 8 | return tokenizer 9 | 10 | def build_gpt2_tokenizer(args): 11 | tokenizer = GPT2TokenizerFast.from_pretrained(args.tokenizer_name) 12 | tokenizer.pad_token = tokenizer.eos_token 13 | return tokenizer 14 | 15 | def build_deberta_tokenizer(args): 16 | tokenizer = DebertaV2Tokenizer.from_pretrained(args.tokenizer_name) 17 | return tokenizer 18 | -------------------------------------------------------------------------------- /training/modules/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn import functional 6 | from typing import Optional, Tuple, Union 7 | 8 | 9 | # @torch.jit.script 10 | def gpt_loss_func(input, target): 11 | lm_logits, labels = input, target 12 | shift_logits = lm_logits[..., :-1, :].contiguous() 13 | shift_labels = labels[..., 1:].contiguous() 14 | loss = 
functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) 15 | return loss -------------------------------------------------------------------------------- /training/optimizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/optimizer/__init__.py -------------------------------------------------------------------------------- /training/optimizer/grad_scalar.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | 4 | import torch 5 | 6 | 7 | class GradScaler(ABC): 8 | def __init__(self, initial_scale, device=None): 9 | """Initialize scale value with the input initial scale.""" 10 | assert initial_scale > 0.0 11 | self.device = device 12 | self._scale = torch.cuda.FloatTensor([initial_scale], device=device) 13 | 14 | @property 15 | def scale(self): 16 | return self._scale 17 | 18 | @property 19 | def inv_scale(self): 20 | return self._scale.double().reciprocal().float() 21 | 22 | @abstractmethod 23 | def update(self, found_inf): 24 | pass 25 | 26 | @abstractmethod 27 | def state_dict(self): 28 | pass 29 | 30 | @abstractmethod 31 | def load_state_dict(self, state_dict): 32 | pass 33 | 34 | 35 | class ConstantGradScaler(GradScaler): 36 | 37 | def update(self, found_inf): 38 | pass 39 | 40 | def state_dict(self): 41 | return dict() 42 | 43 | def load_state_dict(self, state_dict): 44 | pass 45 | 46 | 47 | class DynamicGradScaler(GradScaler): 48 | 49 | def __init__(self, initial_scale, min_scale, 50 | growth_factor, backoff_factor, 51 | growth_interval, hysteresis, device=None): 52 | """"Grad scaler with dynamic scale that gets adjusted 53 | during training.""" 54 | super(DynamicGradScaler, self).__init__(initial_scale, device=device) 55 | 56 | # Lower bound on the scale. 57 | assert min_scale > 0.0 58 | assert min_scale <= initial_scale 59 | self.min_scale = torch.cuda.FloatTensor([min_scale], device=device) 60 | # Growth and backoff factors for the scale. 61 | assert growth_factor > 1.0 62 | self.growth_factor = torch.cuda.FloatTensor([growth_factor], device=device) 63 | assert backoff_factor < 1.0 64 | assert backoff_factor > 0.0 65 | self.backoff_factor = torch.cuda.FloatTensor([backoff_factor], device=device) 66 | # Interval over which if we don't see any inf/nan, 67 | # we will scale the grad scale by the growth factor. 68 | assert growth_interval > 0 69 | self.growth_interval = growth_interval 70 | # Number of inf/nans we should see before scaling down 71 | # the grad scale by the backoff factor. 72 | assert hysteresis > 0 73 | self.hysteresis = hysteresis 74 | 75 | # Trackers. 76 | self._growth_tracker = 0 77 | self._hysteresis_tracker = self.hysteresis 78 | 79 | def update(self, found_inf): 80 | # If we have an inf/nan, growth tracker is set to 0 81 | # and hysterisis tracker is reduced by 1. 82 | if found_inf: 83 | self._growth_tracker = 0 84 | self._hysteresis_tracker -= 1 85 | # Now if we are out of hysteresis count, scale down the loss. 86 | if self._hysteresis_tracker <= 0: 87 | self._scale = torch.max(self._scale * self.backoff_factor, 88 | self.min_scale) 89 | print('##### scale backoff to', self._scale) 90 | else: 91 | # If there is no nan/inf, increment the growth tracker. 
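            # Concretely (illustrative numbers): with growth_factor=2.0, backoff_factor=0.5
            # and the argparse defaults growth_interval=1000, hysteresis=2, the scale doubles
            # only after 1000 consecutive overflow-free updates, while the second overflow
            # since the last growth event halves it (never dropping below min_scale).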
92 | self._growth_tracker += 1 93 | # If we have had enough consequitive intervals with no nan/inf: 94 | if self._growth_tracker == self.growth_interval: 95 | # Reset the tracker and hysteresis trackers, 96 | self._growth_tracker = 0 97 | self._hysteresis_tracker = self.hysteresis 98 | # and scale up the loss scale. 99 | self._scale = self._scale * self.growth_factor 100 | print('##### scale grow to', self._scale) 101 | 102 | def state_dict(self): 103 | state_dict = {} 104 | state_dict['scale'] = self._scale 105 | state_dict['growth_tracker'] = self._growth_tracker 106 | state_dict['hysteresis_tracker'] = self._hysteresis_tracker 107 | return state_dict 108 | 109 | def load_state_dict(self, state_dict): 110 | self._scale = state_dict['scale'].to(self.device) 111 | self._growth_tracker = state_dict['growth_tracker'] 112 | self._hysteresis_tracker = state_dict['hysteresis_tracker'] -------------------------------------------------------------------------------- /training/optimizer/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .grad_scalar import * 3 | 4 | # This follows some implementation from Megatron 5 | 6 | 7 | def _has_overflow_serial(grads): 8 | 9 | def _has_inf_or_nan(x): 10 | try: 11 | # if x is half, the .float() incurs an additional deep copy, but it's necessary if 12 | # Pytorch's .sum() creates a one-element tensor of the same type as x 13 | # (which is true for some recent version of pytorch). 14 | cpu_sum = float(x.float().sum()) 15 | # More efficient version that can be used if .sum() returns a Python scalar 16 | # cpu_sum = float(x.sum()) 17 | except RuntimeError as instance: 18 | # We want to check if inst is actually an overflow exception. 19 | # RuntimeError could come from a different error. 20 | # If so, we still want the exception to propagate. 21 | if "value cannot be converted" not in instance.args[0]: 22 | raise 23 | return True 24 | else: 25 | if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum: 26 | return True 27 | return False 28 | 29 | for p in grads: 30 | if _has_inf_or_nan(p): 31 | return torch.FloatTensor([1.0]) 32 | 33 | return torch.FloatTensor([0.0]) 34 | 35 | 36 | # `x` is a torch.Tensor 37 | 38 | 39 | 40 | def _zero_grad_group(group, set_to_none): 41 | """Zero out the gradient for a group of parameters. 42 | Note: copied from torch.optim.optimizer.""" 43 | for param in group: 44 | if param.grad is not None: 45 | if set_to_none: 46 | param.grad = None 47 | else: 48 | if param.grad.grad_fn is not None: 49 | param.grad.detach_() 50 | else: 51 | param.grad.requires_grad_(False) 52 | param.grad.zero_() 53 | 54 | 55 | ''' 56 | def _multi_tensor_copy_this_to_that(this, that): 57 | for this_, that_ in zip(this, that): 58 | that_.copy_(this_) 59 | ''' 60 | 61 | 62 | class Fp16Optimizer: 63 | # If offload is set to true, the fp32 copy is stored on CPU. 
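    # Typical construction (see get_fp16_optimizer below). The inner optimizer must already
    # hold the fp16 model parameters; an illustrative sketch, assuming an AdamW base optimizer:
    #
    #     base_optimizer = torch.optim.AdamW(fp16_model.parameters(), lr=args.lr)
    #     optimizer = Fp16Optimizer(base_optimizer, DynamicGradScaler(...), device, offload=False)
    #
    # The wrapper keeps an fp32 master copy of every trainable fp16 parameter (on CPU when
    # offload=True), copies the model grads onto the masters before each step(), and copies
    # the updated master weights back into the fp16 model afterwards.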
64 | def __init__(self, optimizer, grad_scaler, device, offload=False): 65 | self.offload = offload 66 | if self.offload: 67 | self.cpu_to_gpu_stream = torch.cuda.Stream(device=device, priority=-1) 68 | self.gpu_to_cpu_stream = torch.cuda.Stream(device=device, priority=-1) 69 | self.optimizer = optimizer 70 | self.grad_scaler = grad_scaler 71 | 72 | if self.grad_scaler: 73 | self.found_inf = torch.cuda.FloatTensor([0.0], device=device) if not self.offload else torch.FloatTensor([0.0]) 74 | 75 | self._dummy_overflow_buf = torch.cuda.IntTensor([0], device=device) if not self.offload else torch.IntTensor([0]) 76 | 77 | # Note that the model should first be cast to fp16 before passing to the optimizer. 78 | self.float16_groups = [] 79 | self.fp32_from_float16_groups = [] 80 | 81 | # For all the groups in the original optimizer: 82 | for param_group in self.optimizer.param_groups: 83 | float16_params_this_group = [] 84 | fp32_from_float16_params_this_group = [] 85 | # For all the parameters in this group: 86 | for i, param in enumerate(param_group['params']): 87 | if param.requires_grad: 88 | # float16 params: 89 | assert param.type() == 'torch.cuda.HalfTensor' 90 | float16_params_this_group.append(param) 91 | # Create a copy 92 | if self.offload: 93 | optimizer_param = param.detach().clone().float().to(device='cpu') 94 | assert optimizer_param.device == torch.device('cpu') 95 | if optimizer_param.grad is None: 96 | optimizer_param.grad = torch.zeros_like(optimizer_param.data) 97 | else: 98 | optimizer_param = param.detach().clone().float() 99 | # Replace the optimizer params with the new fp32 copy. 100 | param_group['params'][i] = optimizer_param 101 | fp32_from_float16_params_this_group.append(optimizer_param) 102 | # Reset existing state dict key to the new optimizer param. 103 | if param in self.optimizer.state: 104 | self.optimizer.state[optimizer_param] = self.optimizer.state.pop(param) 105 | 106 | self.float16_groups.append(float16_params_this_group) 107 | self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) 108 | 109 | # Leverage state_dict() and load_state_dict() to 110 | # recast preexisting per-param state tensors 111 | self.optimizer.load_state_dict(self.optimizer.state_dict()) 112 | 113 | def zero_grad(self, set_to_none=True): 114 | for group in self.float16_groups: 115 | _zero_grad_group(group, set_to_none) 116 | if not self.offload: 117 | for group in self.fp32_from_float16_groups: 118 | _zero_grad_group(group, set_to_none) 119 | 120 | def get_loss_scale(self): 121 | return self.grad_scaler.scale 122 | 123 | def _copy_model_grads_to_optimizer_grads(self): 124 | # This only needs to be done for the float16 group. 125 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 126 | for model_param, optimizer_param in zip(model_group, optimizer_group): 127 | if model_param.grad is not None: 128 | if self.offload: 129 | with torch.cuda.stream(self.gpu_to_cpu_stream): 130 | optimizer_param.grad.copy_(model_param.grad, non_blocking=False) 131 | else: 132 | optimizer_param.grad = model_param.grad.float() 133 | # Safe to deallocate model's grad/optimizer_grad after copying. 134 | # (If using contiguous buffers, optimizer_grad's memory should 135 | # persist and therefore should not be deallocated.) 136 | model_param.grad = None 137 | 138 | def _unscale_optimizer_grads_and_check_for_nan(self): 139 | optimizer_grads = [] 140 | # fp32 params fromm float16 ones. 
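        # Collect the fp32 master grads; torch._amp_foreach_non_finite_check_and_unscale_
        # then (roughly) multiplies each grad by inv_scale in place and sets found_inf to 1.0
        # if any element is inf/NaN. In offload mode the grads live on CPU, so the serial
        # Python check below is used instead (it only flags overflow, it does not unscale).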
141 | for optimizer_group in self.fp32_from_float16_groups: 142 | for optimizer_param in optimizer_group: 143 | if optimizer_param.grad is not None: 144 | optimizer_grads.append(optimizer_param.grad.data) 145 | # Reset found inf. 146 | self.found_inf.fill_(0.0) 147 | # Unscale and set found inf/nan 148 | print(optimizer_grads[0].device, self.found_inf.device, self.grad_scaler.inv_scale.device) 149 | if self.offload: 150 | self.found_inf = _has_overflow_serial(optimizer_grads) 151 | else: 152 | torch._amp_foreach_non_finite_check_and_unscale_(optimizer_grads, self.found_inf, self.grad_scaler.inv_scale) 153 | # Check for nan. 154 | found_inf_flag = (self.found_inf.item() > 0) 155 | return found_inf_flag 156 | 157 | def _get_model_and_optimizer_params_data_float16_deprecated(self): 158 | model_data = [] 159 | optimizer_data = [] 160 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 161 | for model_param, optimizer_param in zip(model_group, optimizer_group): 162 | model_data.append(model_param.data) 163 | optimizer_data.append(optimizer_param.data) 164 | return model_data, optimizer_data 165 | 166 | def _copy_optimizer_params_to_model_params(self): 167 | # Only needed for the float16 params. 168 | # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() 169 | # _multi_tensor_copy_this_to_that(this=optimizer_data, that=model_data) 170 | 171 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 172 | for model_param, optimizer_param in zip(model_group, optimizer_group): 173 | if self.offload: 174 | with torch.cuda.stream(self.cpu_to_gpu_stream): 175 | model_param.data.copy_(optimizer_param.data, non_blocking=False) 176 | else: 177 | model_param.data.copy_(optimizer_param.data) 178 | 179 | def _copy_model_params_to_optimizer_params(self): 180 | # Only needed for the float16 params. 181 | # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() 182 | # _multi_tensor_copy_this_to_that(this=model_data, that=optimizer_data) 183 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 184 | for model_param, optimizer_param in zip(model_group, optimizer_group): 185 | if self.offload: 186 | with torch.cuda.stream(self.gpu_to_cpu_stream): 187 | optimizer_param.data.copy_(model_param.data, non_blocking=False) 188 | else: 189 | optimizer_param.data.copy_(model_param.data) 190 | 191 | def reload_model_params(self): 192 | self._copy_model_params_to_optimizer_params() 193 | 194 | @torch.no_grad() 195 | def step(self): 196 | self._copy_model_grads_to_optimizer_grads() 197 | 198 | found_inf_flag = self._unscale_optimizer_grads_and_check_for_nan() 199 | self.grad_scaler.update(found_inf_flag) 200 | 201 | # If we found inf/nan, skip the update. 202 | if found_inf_flag: 203 | print("!!! Warning: find inf in fp16 optimizer-step() !!!") 204 | return False 205 | 206 | for params in self.fp32_from_float16_groups: 207 | torch.nn.utils.clip_grad_norm_(params, 1.0) 208 | 209 | # Step the optimizer. 210 | self.optimizer.step() 211 | 212 | self._copy_optimizer_params_to_model_params() 213 | # Successful update. 
214 | return True 215 | 216 | def scale(self, z): 217 | return z * self.grad_scaler.scale 218 | 219 | def unscale(self, z): 220 | return z * self.grad_scaler.inv_scale 221 | 222 | def state_dict(self): 223 | return self.optimizer.state_dict() 224 | 225 | def load_state_dict(self, state_dict): 226 | self.optimizer.load_state_dict(state_dict) 227 | 228 | 229 | def get_fp16_optimizer(args, optimizer, device): 230 | assert args.fp16 is not None 231 | if args.loss_scale: 232 | print("fp16 uses ConstantGradScaler.") 233 | grad_scaler = ConstantGradScaler(args.loss_scale) 234 | else: 235 | print("fp16 uses DynamicGradScaler.") 236 | grad_scaler = DynamicGradScaler( 237 | initial_scale=args.initial_loss_scale, 238 | min_scale=args.min_loss_scale, 239 | growth_factor=2.0, 240 | backoff_factor=0.5, 241 | growth_interval=args.loss_scale_window, 242 | hysteresis=args.hysteresis) 243 | return Fp16Optimizer(optimizer, grad_scaler, device, getattr(args, 'use_offload', False)) 244 | 245 | -------------------------------------------------------------------------------- /training/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /training/pipeline_parallel/dist_pp_utils.py: -------------------------------------------------------------------------------- 1 | from .dist_gpipe_pipeline_async import GpipeAsync 2 | 3 | 4 | def get_pp_module(args, config, device, use_dp): 5 | 6 | if args.pp_mode == 'gpipe': 7 | return GpipeAsync(args, config, device, use_dp) 8 | else: 9 | print("Not recognize this pipeline parallel mode.") 10 | assert False 11 | 12 | -------------------------------------------------------------------------------- /training/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/tasks/__init__.py -------------------------------------------------------------------------------- /training/tasks/data_loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/tasks/data_loaders/__init__.py -------------------------------------------------------------------------------- /training/tasks/data_loaders/prosocial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import torch 4 | import json 5 | from torch.utils.data import IterableDataset, DataLoader 6 | from itertools import cycle, islice 7 | import random 8 | from datasets import Dataset 9 | from datasets import load_dataset, load_from_disk 10 | from comm.comm_utils import * 11 | 12 | 13 | 14 | class StreamDataset(IterableDataset): 15 | def __init__(self, dataset, tokenizer, seq_length=1024): 16 | 17 | self.dataset = dataset 18 | 19 | self.tokenizer = tokenizer 20 | self.seq_length = seq_length 21 | 22 | self.it = None 23 | self.iter_count = 0 24 | 25 | def state_dict(self): 26 | return { 27 | 'iter_count': self.iter_count, 28 | } 29 | 30 | def load_state_dict(self, state_dict): 31 | self.iter_count = state_dict['iter_count'] 32 | self.dataset = self.dataset.skip(self.iter_count) 33 | 34 | def 
get_sequence(self): 35 | 36 | it = cycle(iter(self.dataset)) 37 | 38 | while True: 39 | 40 | text_context = '''Possible labels: 41 | 1. casual 42 | 2. needs caution 43 | 3. needs intervention 44 | 4. possibly needs caution 45 | 5. probably needs caution''' 46 | 47 | while True: 48 | 49 | instance = next(it) 50 | 51 | text = instance['text'] 52 | text_context += '\n\n' + text 53 | 54 | input_ids = self.tokenizer(text_context.strip())['input_ids'] 55 | if len(input_ids) > self.seq_length: 56 | break 57 | 58 | input_ids = input_ids[:self.seq_length] 59 | input_ids = torch.tensor(input_ids).long() 60 | 61 | yield { 62 | 'input_ids': input_ids, 63 | } 64 | 65 | 66 | def get_stream(self): 67 | return cycle(self.get_sequence()) 68 | 69 | def __iter__(self): 70 | if self.it is None: 71 | self.it = self.get_stream() 72 | return self.it 73 | -------------------------------------------------------------------------------- /training/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/utils/__init__.py -------------------------------------------------------------------------------- /training/utils/dist_args_utils.py: -------------------------------------------------------------------------------- 1 | def add_device_arguments(parser): 2 | parser.add_argument('--use-cuda', default=True, type=lambda x: (str(x).lower() == 'true'), 3 | help='if this is set to True, will use cuda to train') 4 | parser.add_argument('--cuda-id', type=int, default=0, metavar='N', 5 | help='cuda index, if the instance has multiple GPUs.') 6 | parser.add_argument('--cuda-num', type=int, default=1, metavar='N', 7 | help='number of GPUs, if the instance has multiple GPUs.') 8 | parser.add_argument('--debug-mem', default=True, type=lambda x: (str(x).lower() == 'true'), 9 | help='if this is set to True, we will print some memory stats.') 10 | 11 | 12 | def add_torch_distributed_arguments(parser): 13 | parser.add_argument('--dist-backend', type=str, default='cupy_nccl', metavar='S', 14 | help='backend type for distributed PyTorch (default: cupy_nccl)') 15 | parser.add_argument('--dp-backend', type=str, default='nccl', metavar='S', 16 | help='backend type for data parallel') 17 | parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:9000', metavar='S', 18 | help='master ip for distributed PyTorch') 19 | parser.add_argument('--world-size', type=int, default=4, metavar='D', 20 | help='world-size (default: 4)') 21 | parser.add_argument('--pipeline-group-size', type=int, default=4, metavar='D', 22 | help='world-size (default: 2)') 23 | parser.add_argument('--data-group-size', type=int, default=1, metavar='D', 24 | help='world-size (default: 1)') 25 | parser.add_argument('--rank', type=int, default=0, metavar='N', 26 | help='rank of the node') 27 | 28 | 29 | def add_task_arguments(parser): 30 | parser.add_argument('--train-data', nargs='+', default=['./glue_dataset/data/QQP/train.tsv'], metavar='S', 31 | help='path to the training data') 32 | parser.add_argument('--valid-data', nargs='+', default=['./glue_dataset/data/QQP/test.tsv'], metavar='S', 33 | help='path to the training data') 34 | parser.add_argument('--tokenizer-type', type=str, default='BertWordPieceLowerCase', metavar='S', 35 | help='which tokenizer to use.') 36 | parser.add_argument('--vocab-file', type=str, default='./glue_dataset/data/bert-large-cased-vocab.txt', metavar='S', 37 | help='which tokenizer 
to use.') 38 | parser.add_argument('--vocab-extra-ids', type=int, default=0, metavar='N', 39 | help='-') 40 | parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128, metavar='N', 41 | help='-') 42 | parser.add_argument('--optimizer', type=str, default='adamw', metavar='N', 43 | help='-') 44 | 45 | 46 | def add_model_arguments(parser): 47 | parser.add_argument('--seq-length', type=int, default=1024, metavar='N', 48 | help='-') 49 | parser.add_argument('--embedding-dim', type=int, default=768, metavar='N', 50 | help='-') 51 | parser.add_argument('--num-layers', type=int, default=4, metavar='N', 52 | help='-') 53 | parser.add_argument('--num-heads', type=int, default=12, metavar='N', 54 | help='-') 55 | 56 | 57 | def add_training_hyper_parameter_arguments(parser): 58 | parser.add_argument('--train-log-backend', type=str, default='print', metavar='N', 59 | help='-') 60 | parser.add_argument('--project-name', type=str, default='test', metavar='N', 61 | help='-') 62 | parser.add_argument('--batch-size', type=int, default=32, metavar='N', 63 | help='input batch size for training (default: 100)') 64 | parser.add_argument('--micro-batch-size', type=int, default=8, metavar='N', 65 | help='input micro batch size for training (default: 100)') 66 | parser.add_argument('--lr', type=float, default=0.01, metavar='N', 67 | help='-') 68 | parser.add_argument('--num-iters', type=int, default=10, metavar='N', 69 | help='-') 70 | 71 | 72 | def add_mixed_precision_arguments(parser): 73 | parser.add_argument('--fp16', action='store_true', 74 | help='Run model in fp16 mode.') 75 | parser.add_argument('--loss-scale', type=float, default=0, 76 | help='Static loss scaling, positive power of 2 values can improve fp16 convergence. ') 77 | parser.add_argument('--initial-loss-scale', type=float, default=32768, 78 | help='Initial loss-scale for dynamic loss scaling.') 79 | parser.add_argument('--min-loss-scale', type=float, default=1.0, 80 | help='Minimum loss scale for dynamic loss scale.') 81 | parser.add_argument('--loss-scale-window', type=float, default=1000, 82 | help='Window over which to raise/lower dynamic scale.') 83 | parser.add_argument('--hysteresis', type=int, default=2, 84 | help='hysteresis for dynamic loss scaling') 85 | parser.add_argument('--use-offload', action='store_true', 86 | help='Offload optim states to CPU') 87 | 88 | 89 | 90 | def add_parallel_schema_arguments(parser): 91 | parser.add_argument('--pp-mode', type=str, default='gpipe', metavar='S', 92 | help='use which pipeline parallel mode: gpipe or 1f1b.') 93 | parser.add_argument('--dp-mode', type=str, default='allreduce', metavar='S', 94 | help='use which data parallel mode: allreduce.') 95 | parser.add_argument('--gradient-accumulate-step', type=int, default=1, 96 | help='Number of gradient computation in Pipeline without data parallel sync.') 97 | 98 | 99 | def get_model_arguments_str(args): 100 | return '_l' + str(args.seq_length) + '_m' + str(args.embedding_dim) 101 | 102 | 103 | def get_dist_arguments_str(args, add_rank=True): 104 | dist_str = '_w' + str(args.world_size) + '_p' + str(args.pipeline_group_size) + "_" + \ 105 | str(args.gradient_accumulate_step) + '_d' + str(args.data_group_size) 106 | if add_rank: 107 | dist_str = dist_str + '_' + str(args.rank) 108 | return dist_str 109 | 110 | 111 | def get_learning_arguments_str(args): 112 | return '_b' + str(args.batch_size) + '_' + str(args.micro_batch_size) 113 | 114 | 115 | def get_mixed_precision_arguments_str(args): 116 | if args.fp16: 117 | return '_fp16' 
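        # Illustrative composition (assumed values): with seq_length=2048, embedding_dim=4096,
        # world_size=8, pipeline_group_size=8, gradient_accumulate_step=1, data_group_size=1,
        # rank=0, batch_size=64, micro_batch_size=8 and --fp16, the *_arguments_str helpers
        # in this module yield '_l2048_m4096', '_w8_p8_1_d1_0', '_b64_8' and '_fp16'.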
118 | else: 119 | return '' 120 | -------------------------------------------------------------------------------- /training/utils/dist_checkpoint_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import json 5 | import numpy as np 6 | import torch 7 | 8 | from comm.comm_utils import * 9 | 10 | 11 | def load_checkpoint(pipe, args): 12 | 13 | if os.path.isfile(os.path.join(args.checkpoint_path, 'latest')): 14 | with open(os.path.join(args.checkpoint_path, 'latest')) as f: 15 | latest_step = int(f.read()) 16 | else: 17 | print('no checkpoint available, skipping') 18 | return 19 | 20 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 21 | 22 | try: 23 | with open(os.path.join(checkpoint_step_path, 'meta.json')) as f: 24 | meta = json.load(f) 25 | except: 26 | print('failed to load meta.') 27 | 28 | pipe.global_step = latest_step 29 | 30 | try: 31 | pipe.model.model.load_state_dict( 32 | torch.load( 33 | os.path.join( 34 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' 35 | ), map_location=torch.device('cpu') 36 | ) 37 | ) 38 | except: 39 | print('failed to load model params.') 40 | 41 | try: 42 | pipe.optimizer.load_state_dict( 43 | torch.load( 44 | os.path.join( 45 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' 46 | ), map_location=torch.device('cpu') 47 | ) 48 | ) 49 | except: 50 | print('failed to load optim states.') 51 | 52 | try: 53 | pipe.scheduler.load_state_dict( 54 | torch.load( 55 | os.path.join( 56 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' 57 | ) 58 | ) 59 | ) 60 | except: 61 | print('failed to load scheduler states.') 62 | 63 | 64 | def save_checkpoint(pipe, args) -> str: 65 | 66 | latest_step = pipe.global_step 67 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 68 | 69 | os.makedirs(checkpoint_step_path, exist_ok=True) 70 | 71 | print(f"Saving checkpoint to {checkpoint_step_path} ...") 72 | 73 | torch.save( 74 | pipe.model.model.state_dict(), 75 | os.path.join( 76 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' 77 | ) 78 | ) 79 | 80 | torch.save( 81 | pipe.optimizer.state_dict(), 82 | os.path.join( 83 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' 84 | ) 85 | ) 86 | 87 | torch.save( 88 | pipe.scheduler.state_dict(), 89 | os.path.join( 90 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' 91 | ) 92 | ) 93 | 94 | with open(os.path.join(checkpoint_step_path, 'meta.json'), 'w') as f: 95 | json.dump({ 96 | 'step': latest_step, 97 | }, f) 98 | 99 | with open(os.path.join(args.checkpoint_path, 'latest'), 'w') as f: 100 | f.write(f"{latest_step}") 101 | 102 | print(f"Checkpoint saved to {checkpoint_step_path} ... 
Done") 103 | 104 | return checkpoint_step_path 105 | 106 | 107 | def save_stream_dataloader_state_dict(dataloader, pipe, args): 108 | 109 | latest_step = pipe.global_step 110 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 111 | 112 | os.system(f"mkdir -p {checkpoint_step_path}") 113 | 114 | torch.save( 115 | dataloader.dataset.state_dict(), 116 | os.path.join( 117 | checkpoint_step_path, f'dataset_state_dict.pt' 118 | ) 119 | ) 120 | 121 | def load_stream_dataloader_state_dict(dataloader, pipe, args): 122 | 123 | latest_step = pipe.global_step 124 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 125 | 126 | try: 127 | state_dict = torch.load( 128 | os.path.join( 129 | checkpoint_step_path, f'dataset_state_dict.pt' 130 | ) 131 | ) 132 | 133 | dataloader.data.load_state_dict(state_dict) 134 | 135 | except Exception as e: 136 | 137 | print('failed to load dataset state_dict.') -------------------------------------------------------------------------------- /training/utils/dist_debug_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def print_cuda_memory(args, info: str, device=None): 5 | if args.debug_mem: 6 | if device is None: 7 | device = torch.device('cuda', args.cuda_id) 8 | print("<{}>: current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format( 9 | info, torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) 10 | 11 | 12 | def print_multi_cuda_memory(args, info: str): 13 | if args.debug_mem: 14 | for local_gpu_rank in range(args.cuda_num): 15 | device = torch.device('cuda', local_gpu_rank) 16 | print("<{}>({}): current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format(info, local_gpu_rank, 17 | torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) 18 | -------------------------------------------------------------------------------- /training/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | try: 4 | import wandb 5 | _has_wandb = True 6 | except: 7 | _has_wandb = False 8 | print("wandb is not installed.") 9 | 10 | try: 11 | import loguru 12 | _has_loguru = True 13 | except: 14 | _has_loguru = False 15 | print("loguru is not installed.") 16 | 17 | train_log_backend = None 18 | 19 | def init_train_logger(args): 20 | 21 | global train_log_backend 22 | train_log_backend = getattr(args, 'train_log_backend', 'print') 23 | 24 | if train_log_backend == 'print': 25 | pass 26 | elif train_log_backend == 'loguru': 27 | os.system("mkdir -p logs") 28 | loguru.logger.add("logs/file_{time}.log") 29 | elif train_log_backend == 'wandb': 30 | 31 | assert _has_wandb 32 | 33 | if not hasattr(args, 'project_name'): 34 | import re 35 | args.project_name = "test-" + \ 36 | re.sub('[^a-zA-Z0-9 \n\.]', '_', args.task_name) 37 | 38 | wandb.init( 39 | project=args.project_name, 40 | config=args, 41 | ) 42 | 43 | else: 44 | raise Exception('Unknown logging backend.') 45 | 46 | def train_log(x, *args, **kargs): 47 | 48 | if train_log_backend == 'print': 49 | print(x) 50 | elif train_log_backend == 'loguru': 51 | loguru.logger.info(x) 52 | elif train_log_backend == 'wandb': 53 | wandb.log(x, *args, **kargs) 54 | else: 55 | raise Exception('Unknown logging backend.') 56 | 57 | --------------------------------------------------------------------------------