├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── openchatkit-feedback-report.yaml ├── .gitignore ├── LICENSE ├── README.md ├── data ├── OIG-chip2 │ └── prepare.sh ├── OIG-moderation │ └── prepare.py ├── OIG │ └── prepare.py ├── prepare_data.py └── wikipedia-3sentence-level-retrieval-index │ └── prepare.py ├── docs ├── GPT-NeoXT-Chat-Base-20B.md └── finetuning-RedPajama-3B.md ├── environment.yml ├── inference ├── README.md ├── bot.py └── conversation.py ├── pretrained ├── GPT-NeoX-20B │ └── prepare.py ├── Llama-2-7B-32K-beta │ └── prepare.py ├── Pythia-6.9B-deduped │ └── prepare.py ├── RedPajama-3B │ └── prepare.py ├── RedPajama-7B │ └── prepare.py └── prepare_pretrained.py ├── retrieval ├── README.md ├── __init__.py └── wikipedia.py ├── tools ├── README.md ├── benchmark_input.json ├── convert_to_hf_gptneox.py ├── convert_to_hf_llama.py └── model_load_benchmark.py └── training ├── README.md ├── comm ├── __init__.py ├── comm_utils.py ├── nccl_backend.py └── torch_backend.py ├── data_parallel ├── __init__.py ├── dist_dp_allreduce.py ├── dist_dp_central_ps.py ├── dist_dp_local.py ├── dist_dp_sharded_ps.py ├── dist_dp_utils.py └── flatten_utils.py ├── dist_clm_train.py ├── dist_prefixlm_train.py ├── finetune_GPT-NeoXT-Chat-Base-20B.sh ├── finetune_Pythia-Chat-Base-7B.sh ├── finetune_RedPajama-INCITE-7B-Chat.sh ├── finetune_RedPajama-INCITE-Chat-3B-v1.sh ├── finetune_llama-2-7b-32k-booksum.sh ├── finetune_llama-2-7b-32k-mqa.sh ├── lora └── example │ ├── redpajama-incite-chat-3b.py │ └── redpajama-incite-chat-3b_inference.py ├── modules ├── __init__.py ├── deberta_modules.py ├── dist_deberta_pp_module.py ├── dist_gpt_fsdp_module.py ├── dist_gpt_pp_module.py ├── hf_gpt2_modules.py ├── hf_gptj_modules.py ├── hf_gptneox_modules.py ├── hf_opt_modules.py ├── llama_modules.py ├── task_modules.py ├── tokenizer.py └── utils.py ├── optimizer ├── __init__.py ├── grad_scalar.py └── optimizer.py ├── pipeline_parallel ├── __init__.py ├── dist_gpipe_pipeline_async.py └── dist_pp_utils.py ├── tasks ├── __init__.py └── data_loaders │ ├── __init__.py │ ├── data_utils.py │ └── prosocial.py └── utils ├── __init__.py ├── dist_args_utils.py ├── dist_checkpoint_utils.py ├── dist_debug_utils.py ├── event_report.py ├── logging_utils.py └── upload_manager.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/openchatkit-feedback-report.yaml: -------------------------------------------------------------------------------- 1 | name: OpenChatKit Feedback Report 2 | description: Details of feedback from using OpenChatKit test app 3 | title: OpenChatKit Feedback Report 4 | labels: "feedback report" 5 | assignees: [] 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this feedback report! 11 | - type: textarea 12 | id: my-question 13 | attributes: 14 | label: "My question:" 15 | validations: 16 | required: true 17 | - type: textarea 18 | id: bot-response 19 | attributes: 20 | label: "Bot response:" 21 | validations: 22 | required: true 23 | - type: textarea 24 | id: ideal-bot-response 25 | attributes: 26 | label: "Ideal bot response:" 27 | validations: 28 | required: true 29 | - type: checkboxes 30 | id: response-issues 31 | attributes: 32 | label: "Bot response was:" 33 | options: 34 | - label: Factually incorrect 35 | required: true 36 | - label: Not helpful 37 | required: true 38 | - label: Harmful, inappropriate or unsafe 39 | required: true 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # ignore downloaded files 132 | /data/OIG-moderation/files/ 133 | /data/OIG/files/ 134 | /data/wikipedia-3sentence-level-retrieval-index/files/ 135 | /pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/ 136 | /pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/ 137 | /pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 138 | 139 | # ignore training output 140 | /model_ckpts/ 141 | /huggingface_models/ 142 | /training/wandb/ 143 | 144 | # ignore trained low-rank adapters 145 | /outputs/ 146 | data/OIG-chip2/*.jsonl 147 | wandb/ -------------------------------------------------------------------------------- /data/OIG-chip2/prepare.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | wget https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl -O ${DIR}/unified_chip2.jsonl -------------------------------------------------------------------------------- /data/OIG-moderation/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/ontocord/OIG-moderation", dest_dir) 12 | -------------------------------------------------------------------------------- /data/OIG/prepare.py: -------------------------------------------------------------------------------- 1 | 
import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/laion/OIG", dest_dir) 12 | -------------------------------------------------------------------------------- /data/wikipedia-3sentence-level-retrieval-index/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_data import prepare_data 8 | 9 | if __name__ == "__main__": 10 | dest_dir = os.path.join(current_dir, "files") 11 | prepare_data("https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index", dest_dir) 12 | -------------------------------------------------------------------------------- /docs/GPT-NeoXT-Chat-Base-20B.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoXT-Chat-Base-20B 2 | 3 | OpenChatKit includes an instruction-tuned 20 billion parameter language model called GPT-NeoXT-Chat-Base-20B, a 6 billion parameter moderation model, and an extensible retrieval system for including up-to-date responses from custom repositories. It was trained on the OIG-43M training dataset, which was a collaboration between [Together](https://www.together.xyz/), [LAION](https://laion.ai), and [Ontocord.ai](https://ontocord.ai). Much more than a model release, this is the beginning of an open source project. We are releasing a set of tools and processes for ongoing improvement with community contributions. 4 | 5 | In this doc, you'll find steps for: 6 | - Training an OpenChatKit model 7 | - Testing inference using the model 8 | - Augmenting the model with additional context from a retrieval index 9 | 10 | # Contents 11 | 12 | - [Requirements](#requirements) 13 | - [Pre-trained Weights](#pre-trained-weights) 14 | - [Datasets](#datasets) 15 | * [Data Contributions](#data-contributions) 16 | - [Pretrained Base Model](#pretrained-base-model) 17 | - [Training and Finetuning](#training-and-finetuning) 18 | * [(Optional) 8bit Adam](#optional-8bit-adam) 19 | * [Train GPT-NeoX-Chat-Base-20B](#train-gpt-neox-chat-base-20b) 20 | - [Converting Weights to Huggingface Format](#converting-weights-to-huggingface-format) 21 | - [Inference](#inference) 22 | - [Monitoring](#monitoring) 23 | * [Loguru](#loguru) 24 | * [Weights & Biases](#weights--biases) 25 | - [Experimental: Retrieval-Augmented Models](#experimental-retrieval-augmented-models) 26 | - [Acknowledgements](#acknowledgements) 27 | 28 | # Requirements 29 | 30 | Before you begin, you need to install PyTorch and other dependencies. 31 | 32 | 1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) from their website. 33 | 34 | 2. Install [Git LFS](https://git-lfs.com/) from their website. 35 | 36 | 3. Install the `git lfs` hooks. 37 | 38 | ```shell 39 | git lfs install 40 | ``` 41 | 42 | 4. Install mamba in the `base` environment so it's available in all environments. 43 | 44 | ```shell 45 | conda install mamba -n base -c conda-forge 46 | ``` 47 | 48 | 5. Create an environment called OpenChatKit using the `environment.yml` file at the root of this repo. 
49 | 50 | ```shell 51 | mamba env create -f environment.yml 52 | ``` 53 | 54 | 6. Activate the new conda environment. 55 | 56 | ```shell 57 | conda activate OpenChatKit 58 | ``` 59 | 60 | # Pre-trained Weights 61 | 62 | GPT-NeoXT-Chat-Base-20B is a 20B-parameter variant of GPT-NeoX, fine-tuned on conversational datasets. We are releasing pre-trained weights for this model as [togethercomputer/GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. 63 | 64 | More details can be found on the model card for [GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) on Huggingface. 65 | 66 | # Datasets 67 | 68 | The chat model was trained on the [OIG](https://huggingface.co/datasets/laion/OIG) dataset built by [LAION](https://laion.ai/), [Together](https://www.together.xyz/), and [Ontocord.ai](https://www.ontocord.ai/). To download the dataset from Huggingface run the command below from the root of the repo. 69 | 70 | ```shell 71 | python data/OIG/prepare.py 72 | ``` 73 | 74 | Once the command completes, the data will be in the `data/OIG/files` directory. 75 | 76 | ## Data Contributions 77 | 78 | You can help make this chat model better by contributing data! See the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repo for more details. 79 | 80 | # Pretrained Base Model 81 | 82 | As mentioned above, the chat model is a fine-tuned variant of GPT-NeoX-20B from Eleuther AI. To download GPT-NeoX-20B and prepare it for fine tuning, run this command from the root of the repo. 83 | 84 | ```shell 85 | python pretrained/GPT-NeoX-20B/prepare.py 86 | ``` 87 | 88 | The weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b`. 89 | 90 | In case you want to fine-tune other gpt-neox models, e.g. [the Pythia model suite](https://huggingface.co/models?sort=downloads&search=pythia), you can specify the HF model name, for example: 91 | 92 | ```shell 93 | python pretrained/GPT-NeoX-20B/prepare.py --model-name EleutherAI/pythia-6.9b-deduped 94 | ``` 95 | 96 | And the weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped`. 97 | 98 | 99 | # Training and Finetuning 100 | 101 | ## (Optional) 8bit Adam 102 | 103 | To use 8bit-adam during training, install the `bitsandbytes` package. 104 | 105 | ```shell 106 | pip install bitsandbytes # optional, to use 8bit-adam 107 | ``` 108 | 109 | ## Train GPT-NeoX-Chat-Base-20B 110 | 111 | The `training/finetune_GPT-NeoXT-Chat-Base-20B.sh` script configures and runs the training loop. After downloading the dataset and the base model, run: 112 | 113 | ```shell 114 | bash training/finetune_GPT-NeoXT-Chat-Base-20B.sh 115 | ``` 116 | 117 | The script launches 8 processes with a pipeline-parallel degree of 8 and a data-parallel degree of 1. 118 | 119 | As the training loop runs, checkpoints are saved to the `model_ckpts` directory at the root of the repo. 120 | 121 | Please see [the training README](training/README.md) for more details about customizing the training run. 122 | 123 | The `training/finetune_Pythia-Chat-Base-7B.sh` script is another example to fine-tune a 7B pythia (gpt-neox) model. The script launches 8 processes with a pipeline-parallel degree of 4 and a data-parallel degree of 2. 124 | 125 | # Converting Weights to Huggingface Format 126 | 127 | Before you can use this model to perform inference, it must be converted to the Huggingface format. Run this command from the root of the repo to do so. 
128 | 129 | ```shell 130 | mkdir huggingface_models \ 131 | && python tools/convert_to_hf_gptneox.py \ 132 | --ckpt-path model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100 \ 133 | --save-path huggingface_models/GPT-NeoXT-Chat-Base-20B \ 134 | --n-stages 8 \ 135 | --n-layer-per-stage 6 \ 136 | --fp16 137 | ``` 138 | where the `--fp16` flag will load and store models in fp16. 139 | 140 | Make sure to replace `model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100` with the latest checkpoint in the `model_ckpts/GPT-Neo-XT-Chat-Base-20B` directory. 141 | 142 | If you need to convert checkpoints of other gpt-neox variants, make sure to specify the correct config name for your variant. 143 | For example, if you want to convert a checkpoint fine-tuned from `EleutherAI/pythia-6.9b-deduped`, you should pass it as the config name: 144 | ```shell 145 | python tools/convert_to_hf_gptneox.py \ 146 | --config-name EleutherAI/pythia-6.9b-deduped \ 147 | --ckpt-path model_ckpts/Pythia-Chat-Base-7B/checkpoint_100 \ 148 | --save-path huggingface_models/Pythia-Chat-Base-7B \ 149 | --n-stages 4 \ 150 | --n-layer-per-stage 8 \ 151 | --fp16 152 | ``` 153 | 154 | 155 | # Inference 156 | 157 | To help you test the model, we provide a simple command-line test harness to interact with the bot. 158 | 159 | ```shell 160 | python inference/bot.py 161 | ``` 162 | 163 | By default, the script will load the GPT-NeoXT-Chat-Base-20B model under the `huggingface_models` directory, but you can override that behavior by specifying `--model`. 164 | 165 | For example, if you want to load the base model from our Huggingface repo, you can run the following command, which downloads the weights from HuggingFace. 166 | 167 | ```shell 168 | python inference/bot.py --model togethercomputer/GPT-NeoXT-Chat-Base-20B 169 | ``` 170 | 171 | Once the model has loaded, enter text at the prompt and the model will reply. 172 | 173 | ```shell 174 | $ python inference/bot.py 175 | Loading /home/csris/src/github.com/togethercomputer/OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:1... 176 | Welcome to OpenChatKit shell. Type /help or /? to list commands. 177 | 178 | >>> Hello. 179 | Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. 180 | Hello human. 181 | 182 | >>> 183 | ``` 184 | 185 | Commands are prefixed with a `/`, and the `/quit` command exits. 186 | 187 | Please see [the inference README](inference/README.md) for more details about arguments, running on multiple/specific GPUs, and running on consumer hardware. 188 | 189 | # Monitoring 190 | 191 | By default, the training script simply prints the loss as training proceeds, but it can also output metrics to a file using [loguru](https://github.com/Delgan/loguru) or report them to Weights & Biases. 192 | 193 | ## Loguru 194 | 195 | Add the flag `--train-log-backend loguru` to your training script to log to `./logs/file_{time}.log`. 196 | 197 | ## Weights & Biases 198 | 199 | To use Weights & Biases, first log in with your Weights & Biases token. 200 | 201 | ```shell 202 | wandb login 203 | ``` 204 | 205 | Then set `--train-log-backend wandb` in the training script to enable logging to Weights & Biases. 206 | 207 | # Experimental: Retrieval-Augmented Models 208 | 209 | *Note: Retrieval is still experimental.* 210 | 211 | The code in `/retrieval` implements a Python package for querying a Faiss index of Wikipedia. The following steps explain how to use this index to augment queries in the test harness with context from the retriever.
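For reference, the test harness injects retrieved context by pushing the top search result onto the conversation as an extra human turn before your question (see `OpenChatKitShell.do_say` in `inference/bot.py` and `Conversation.push_context_turn` in `inference/conversation.py`). The sketch below is illustrative only; `build_prompt` is a hypothetical helper, and the tag strings are placeholders for the model's actual human/bot IDs.

```python
# Illustrative sketch of how a retrieved snippet ends up in the prompt.
# The real logic lives in inference/bot.py and inference/conversation.py;
# build_prompt and the tag defaults below are placeholders, not project APIs.

def build_prompt(context_snippets, query, human_id="<human>", bot_id="<bot>"):
    prompt = ""
    if context_snippets:
        # the retrieved passage is represented as an extra human turn
        prompt += f"{human_id}: {context_snippets[0]}\n"
    prompt += f"{human_id}: {query}\n{bot_id}:"
    return prompt

print(build_prompt(["Zurich is the largest city in Switzerland."], "Where is Zurich?"))
```

The steps below walk through downloading the index and enabling retrieval in the bot.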
212 | 213 | 1. Download the Wikipedia index. 214 | 215 | ```shell 216 | python data/wikipedia-3sentence-level-retrieval-index/prepare.py 217 | ``` 218 | 219 | 2. Run the bot with the `--retrieval` flag. 220 | 221 | ```shell 222 | python inference/bot.py --retrieval 223 | ``` 224 | 225 | After starting, the bot will load both the chat model and the retrieval index, which takes a long time. Once the model and the index are loaded, all queries will be augmented with extra context. 226 | 227 | 228 | ```shell 229 | $ python inference/bot.py --retrieval 230 | Loading /OpenChatKit/inference/../huggingface_models/GPT-NeoXT-Chat-Base-20B to cuda:0... 231 | Loading retrieval index... 232 | Welcome to OpenChatKit shell. Type /help or /? to list commands. 233 | 234 | >>> Where is Zurich? 235 | Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. 236 | Where is Zurich? 237 | Zurich is located in Switzerland. 238 | 239 | >>> 240 | ``` 241 | 242 | # Acknowledgements 243 | 244 | Our model is a fine-tuned version of [gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b), a large language model trained by [Eleuther AI](https://www.eleuther.ai). We evaluated our model on [HELM](https://crfm.stanford.edu/helm/latest/) provided by the [Center for Research on Foundation Models](https://crfm.stanford.edu), and we collaborated with both [CRFM](https://crfm.stanford.edu) and [HazyResearch](http://hazyresearch.stanford.edu) at Stanford to build this model. 245 | 246 | We collaborated with [LAION](https://laion.ai/) and [Ontocord.ai](https://www.ontocord.ai/) to build the training data used to fine-tune this model. 247 | -------------------------------------------------------------------------------- /docs/finetuning-RedPajama-3B.md: -------------------------------------------------------------------------------- 1 | # RedPajama-3B 2 | 3 | In this tutorial, you will learn how to fine-tune a base LLM on a sample of data. By the end of 4 | the tutorial, you will have fine-tuned the RedPajama-INCITE-Chat-3B model using a sample of 5 | chat data from the OIG dataset. You can adapt this tutorial to fine-tune with your own data. 6 | 7 | In order to fine-tune the RedPajama 3B models, please follow these steps: 8 | 9 | First clone the OpenChatKit repo: 10 | 11 | ```shell 12 | git clone git@github.com:togethercomputer/OpenChatKit.git 13 | ``` 14 | 15 | Next, install dependencies as instructed by the OpenChatKit repo. 16 | 17 | # Prepare Weights 18 | 19 | ```shell 20 | python pretrained/RedPajama-3B/prepare.py 21 | ``` 22 | 23 | This script will download the weights from HuggingFace and prepare them for finetuning. The prepared weights will be saved at 24 | 25 | ``` 26 | pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1 27 | ``` 28 | 29 | # Prepare Fine Tuning Data 30 | 31 | We now need to prepare the training data. We provide an example script that downloads a small slice of data from OIG. 32 | To download this sample dataset, please run: 33 | 34 | ``` 35 | bash data/OIG-chip2/prepare.sh 36 | ``` 37 | 38 | The sample dataset will be saved at 39 | 40 | ``` 41 | data/OIG-chip2/unified_chip2.jsonl 42 | ``` 43 | 44 | # Run Fine Tuning Script 45 | 46 | We provide an example training script. Please configure the parameters (e.g., learning_rate, batch_size, dataset_path) according to your hardware configuration.
47 | Then to start training, simply run: 48 | 49 | ``` 50 | bash training/finetune_RedPajama-INCITE-Chat-3B-v1.sh 51 | ``` 52 | 53 | # Convert to Huggingface Format 54 | 55 | The fine-tuned model will be saved to 56 | 57 | ``` 58 | model_ckpts/rp-incite-chat-3b-finetuned/checkpoint_{steps} 59 | ``` 60 | 61 | In order to use it for inference, you will need to convert it to the HuggingFace format. To do so, run the following script 62 | (as an example; please change the checkpoint path, n-stages, and n-layer-per-stage according to the training script): 63 | 64 | The default for n-stages used in the training script is 10 and the n-layer-per-stage is 8. 65 | 66 | ``` 67 | python tools/convert_to_hf_gptneox.py --config-name togethercomputer/RedPajama-INCITE-Chat-3B-v1 --ckpt-path model_ckpts/redpajama-incite-chat-3b-sample/checkpoint_10/ --save-path model_ckpts/hf --n-stages 4 --n-layer-per-stage 8 68 | ``` 69 | 70 | Then you are ready to go! You can load the model with HuggingFace and use it for inference, for example: 71 | 72 | ```python 73 | import torch 74 | import transformers 75 | from transformers import AutoTokenizer, AutoModelForCausalLM 76 | 77 | tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") 78 | model = AutoModelForCausalLM.from_pretrained("./model_ckpts/hf", torch_dtype=torch.float16) 79 | model = model.to('cuda:0') 80 | 81 | prompt = "<human>: Who is Alan Turing?\n<bot>:" 82 | inputs = tokenizer(prompt, return_tensors='pt').to(model.device) 83 | input_length = inputs.input_ids.shape[1] 84 | outputs = model.generate( 85 | **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True 86 | ) 87 | token = outputs.sequences[0, input_length:] 88 | output_str = tokenizer.decode(token) 89 | print(output_str) 90 | 91 | ``` 92 | 93 | Please note that the above finetuning takes around 60GB of VRAM to fit everything into GPU memory, and may take even more to fit the training data. If you do not have such GPUs, we also provide a low-rank finetuning script that works with 14GB of VRAM. Here are the steps to get started. 94 | 95 | * Clone the OpenChatKit repo, install dependencies, and prepare the dataset. These steps are the same as for full fine-tuning. 96 | 97 | * The sample low-rank finetuning script is at `/training/lora/redpajama-incite-chat-3b.py`; please modify this script to accommodate your own training data and preferred configuration. 98 | 99 | * Then you can start low-rank finetuning by running this script. 100 | 101 | Once the finetuning is finished, the resulting low-rank adapter will be saved to `/outputs`, and you can do inference with the following script.
102 | 103 | ``` 104 | python training/lora/redpajama-incite-chat-3b_inference.py 105 | ``` -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: OpenChatKit 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - cudatoolkit=11.8.0 9 | - cupy=12.1.0 10 | - faiss-gpu=1.7.2 11 | - fastparquet=0.5.0 12 | - nccl=2.18.3.1 13 | - pip=23.2 14 | - pyarrow=12.0.1 15 | - python=3.10.9 16 | - python-snappy=0.6.1 17 | - pytorch=2.0.1 18 | - pytorch-cuda=11.8 19 | - snappy=1.1.9 20 | - torchaudio=2.0.2 21 | - torchvision=0.15.2 22 | - pip: 23 | - accelerate==0.21.0 24 | - boto3 25 | - datasets==2.13.1 26 | - loguru==0.6.0 27 | - netifaces==0.11.0 28 | - pandas==2.0.3 29 | - transformers==4.31.0 30 | - wandb==0.15.5 31 | - zstandard==0.21.0 32 | - sentencepiece 33 | -------------------------------------------------------------------------------- /inference/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Inference 2 | This directory contains code for OpenChatKit's inference. 3 | 4 | ## Arguments 5 | - `--gpu-id`: Primary GPU device to load inputs onto for inference. Default: `0` 6 | - `--model`: name/path of the model. Default = `../huggingface_models/GPT-NeoXT-Chat-Base-20B` 7 | - `--max-tokens`: the maximum number of tokens to generate. Default: `128` 8 | - `--sample`: indicates whether to sample. Default: `True` 9 | - `--temperature`: temperature for the LM. Default: `0.6` 10 | - `--top-k`: top-k for the LM. Default: `40` 11 | - `--retrieval`: augment queries with context from the retrieval index. Default `False` 12 | - `-g` `--gpu-vram`: GPU ID and VRAM to allocate to loading the model, separated by a `:` in the format `ID:RAM` where ID is the CUDA ID and RAM is in GiB. `gpu-id` must be present in this list to avoid errors. Accepts multiple values, for example, `-g ID_0:RAM_0 ID_1:RAM_1 ID_N:RAM_N` 13 | - `-r` `--cpu-ram`: CPU RAM overflow allocation for loading the model. Optional, and only used if the model does not fit onto the GPUs given. 14 | 15 | ## Hardware requirements for inference 16 | The GPT-NeoXT-Chat-Base-20B model requires at least 41GB of free VRAM. Used VRAM also goes up by ~100-200 MB per prompt. 17 | 18 | - A **minimum of 80 GB is recommended** 19 | 20 | - A **minimum of 48 GB in VRAM is recommended** for fast responses. 21 | 22 | If you'd like to run inference on a GPU with <48 GB VRAM, refer to this section on [running on consumer hardware](#running-on-consumer-hardware). 23 | 24 | By default, inference uses only CUDA Device 0. 25 | 26 | **NOTE: Inference currently requires at least 1x GPU.** 27 | 28 | ## Running on multiple GPUs 29 | Add the argument 30 | 31 | ```-g ID0:MAX_VRAM ID1:MAX_VRAM ID2:MAX_VRAM ...``` 32 | 33 | where IDx is the CUDA ID of the device and MAX_VRAM is the amount of VRAM you'd like to allocate to the device. 34 | 35 | For example, if you are running this on 4x 48 GB GPUs and want to distribute the model across all devices, add ```-g 0:10 1:12 2:12 3:12 4:12```. In this example, the first device gets loaded to a max of 10 GiB while the others are loaded with a max of 12 GiB. 36 | 37 | How it works: The model fills up the max available VRAM on the first device passed and then overflows into the next until the whole model is loaded. 38 | 39 | **IMPORTANT: This MAX_VRAM is only for loading the model. 
It does not account for the additional inputs that are added to the device. It is recommended to set the MAX_VRAM to be at least 1 or 2 GiB less than the max available VRAM on each device, and at least 3GiB less than the max available VRAM on the primary device (set by `gpu-id` default=0).** 40 | 41 | **Decrease MAX_VRAM if you run into CUDA OOM. This happens because each input takes up additional space on the device.** 42 | 43 | **NOTE: Total MAX_VRAM across all devices must be > size of the model in GB. If not, `bot.py` automatically offloads the rest of the model to RAM and disk. It will use up all available RAM. To allocate a specified amount of RAM: [refer to this section on running on consumer hardware](#running-on-consumer-hardware).** 44 | 45 | ## Running on specific GPUs 46 | If you have multiple GPUs but would only like to use a specific device(s), [use the same steps as in this section on running on multiple devices](#running-on-multiple-gpus) and only specify the devices you'd like to use. 47 | 48 | Also, if needed, add the argument `--gpu-id ID` where ID is the CUDA ID of the device you'd like to make the primary device. NOTE: The device specified in `--gpu-id` must be present as one of the ID in the argument `-g` to avoid errors. 49 | 50 | - **Example #1**: to run inference on devices 2 and 5 with a max of 25 GiB on each, and make device 5 the primary device, add: `--gpu-id 5 -g 2:25 5:25`. In this example, not adding `--gpu-id 5` will give you an error. 51 | - **Example #2**: to run inference on devices 0 and 3 with a max of 10GiB on 0 and 40GiB on 3, with device 0 as the primary device, add: `-g 0:10 3:40`. In this example, `--gpu-id` is not required because device 0 is specified in `-g`. 52 | - **Example #3**: to run inference only on device 1 with a max of 75 GiB, add: `--gpu-id 1 -g 1:75` 53 | 54 | 55 | ## Running on consumer hardware 56 | If you have multiple GPUs, each <48 GB VRAM, [the steps mentioned in this section on running on multiple GPUs](#running-on-multiple-gpus) still apply, unless, any of these apply: 57 | - Running on just 1x GPU with <48 GB VRAM, 58 | - <48 GB VRAM combined across multiple GPUs 59 | - Running into Out-Of-Memory (OOM) issues 60 | 61 | In which case, add the flag `-r CPU_RAM` where CPU_RAM is the maximum amount of RAM you'd like to allocate to loading model. Note: This significantly reduces inference speeds. 62 | 63 | The model will load without specifying `-r`, however, it is not recommended because it will allocate all available RAM to the model. To limit how much RAM the model can use, add `-r`. 64 | 65 | If the total VRAM + CPU_RAM < the size of the model in GiB, the rest of the model will be offloaded to a folder "offload" at the root of the directory. Note: This significantly reduces inference speeds. 66 | 67 | - Example: `-g 0:12 -r 20` will first load up to 12 GiB of the model into the CUDA device 0, then load up to 20 GiB into RAM, and load the rest into the "offload" directory. 68 | 69 | How it works: 70 | - https://github.com/huggingface/blog/blob/main/accelerate-large-models.md 71 | - https://www.youtube.com/embed/MWCSGj9jEAo 72 | -------------------------------------------------------------------------------- /inference/bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | INFERENCE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | # TODO: PYTHONPATH hacks are never a good idea. 
clean this up later 7 | sys.path.append(os.path.join(INFERENCE_DIR, '..')) 8 | 9 | import cmd 10 | import torch 11 | import argparse 12 | import conversation as convo 13 | import retrieval.wikipedia as wp 14 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, StoppingCriteria, StoppingCriteriaList 15 | from accelerate import infer_auto_device_map, init_empty_weights 16 | 17 | 18 | class StopWordsCriteria(StoppingCriteria): 19 | def __init__(self, tokenizer, stop_words, stream_callback): 20 | self._tokenizer = tokenizer 21 | self._stop_words = stop_words 22 | self._partial_result = '' 23 | self._stream_buffer = '' 24 | self._stream_callback = stream_callback 25 | 26 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 27 | first = not self._partial_result 28 | text = self._tokenizer.decode(input_ids[0, -1]) 29 | self._partial_result += text 30 | for stop_word in self._stop_words: 31 | if stop_word in self._partial_result: 32 | return True 33 | if self._stream_callback: 34 | if first: 35 | text = text.lstrip() 36 | # buffer tokens if the partial result ends with a prefix of a stop word, e.g. " 40 GB VRAM 55 | # load model onto one device 56 | if max_memory is None: 57 | self._model = AutoModelForCausalLM.from_pretrained( 58 | model_name, torch_dtype=torch.float16, device_map="auto") 59 | self._model.to(device) 60 | # load the model with the given max_memory config (for devices with insufficient VRAM or multi-gpu) 61 | else: 62 | config = AutoConfig.from_pretrained(model_name) 63 | # load empty weights 64 | with init_empty_weights(): 65 | model_from_conf = AutoModelForCausalLM.from_config(config) 66 | 67 | model_from_conf.tie_weights() 68 | 69 | # create a device_map from max_memory 70 | device_map = infer_auto_device_map( 71 | model_from_conf, 72 | max_memory=max_memory, 73 | no_split_module_classes=["GPTNeoXLayer"], 74 | dtype="float16" 75 | ) 76 | # load the model with the above device_map 77 | self._model = AutoModelForCausalLM.from_pretrained( 78 | model_name, 79 | device_map=device_map, 80 | offload_folder="offload", # optional offload-to-disk overflow directory (auto-created) 81 | offload_state_dict=True, 82 | torch_dtype=torch.float16 83 | ) 84 | self._tokenizer = AutoTokenizer.from_pretrained(model_name) 85 | 86 | def do_inference(self, prompt, max_new_tokens, do_sample, temperature, top_k, stream_callback=None): 87 | stop_criteria = StopWordsCriteria(self._tokenizer, [self.human_id], stream_callback) 88 | inputs = ( 89 | self._tokenizer(prompt, return_tensors='pt') 90 | .to(self._model.device) 91 | ) 92 | outputs = self._model.generate( 93 | **inputs, 94 | max_new_tokens=max_new_tokens, 95 | do_sample=do_sample, 96 | temperature=temperature, 97 | top_k=top_k, 98 | pad_token_id=self._tokenizer.eos_token_id, 99 | stopping_criteria=StoppingCriteriaList([stop_criteria]), 100 | ) 101 | output = self._tokenizer.batch_decode(outputs)[0] 102 | 103 | # remove the context from the output 104 | output = output[len(prompt):] 105 | 106 | return output 107 | 108 | 109 | class OpenChatKitShell(cmd.Cmd): 110 | intro = "Welcome to OpenChatKit shell. Type /help or /? 
to list commands.\n" 111 | prompt = ">>> " 112 | 113 | def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature, top_k, retrieval, max_memory, do_stream): 114 | super().__init__() 115 | self._gpu_id = gpu_id 116 | self._model_name_or_path = model_name_or_path 117 | self._max_tokens = max_tokens 118 | self._sample = sample 119 | self._temperature = temperature 120 | self._top_k = top_k 121 | self._retrieval = retrieval 122 | self._max_memory = max_memory 123 | self._do_stream = do_stream 124 | 125 | def preloop(self): 126 | print(f"Loading {self._model_name_or_path} to cuda:{self._gpu_id}...") 127 | self._model = ChatModel(self._model_name_or_path, self._gpu_id, self._max_memory) 128 | 129 | if self._retrieval: 130 | print(f"Loading retrieval index...") 131 | self._index = wp.WikipediaIndex() 132 | 133 | self._convo = convo.Conversation( 134 | self._model.human_id, self._model.bot_id) 135 | 136 | def precmd(self, line): 137 | if line.startswith('/'): 138 | return line[1:] 139 | else: 140 | return 'say ' + line 141 | 142 | def do_say(self, arg): 143 | if self._retrieval: 144 | results = self._index.search(arg) 145 | if len(results) > 0: 146 | self._convo.push_context_turn(results[0]) 147 | 148 | self._convo.push_human_turn(arg) 149 | 150 | output = self._model.do_inference( 151 | self._convo.get_raw_prompt(), 152 | self._max_tokens, 153 | self._sample, 154 | self._temperature, 155 | self._top_k, 156 | lambda x : print(x, end='', flush=True) if self._do_stream else None, 157 | ) 158 | 159 | self._convo.push_model_response(output) 160 | 161 | print("" if self._do_stream else self._convo.get_last_turn()) 162 | 163 | def do_raw_say(self, arg): 164 | output = self._model.do_inference( 165 | arg, 166 | self._max_tokens, 167 | self._sample, 168 | self._temperature, 169 | self._top_k 170 | ) 171 | 172 | print(output) 173 | 174 | def do_raw_prompt(self, arg): 175 | print(self._convo.get_raw_prompt()) 176 | 177 | def do_reset(self, arg): 178 | self._convo = convo.Conversation( 179 | self._model.human_id, self._model.bot_id) 180 | 181 | def do_hyperparameters(self, arg): 182 | print( 183 | f"Hyperparameters:\n" 184 | f" max_tokens: {self._max_tokens}\n" 185 | f" sample: {self._sample}\n" 186 | f" temperature: {self._temperature}\n" 187 | f" top_k: {self._top_k}" 188 | ) 189 | 190 | def do_quit(self, arg): 191 | return True 192 | 193 | 194 | def main(): 195 | parser = argparse.ArgumentParser( 196 | description='test harness for OpenChatKit') 197 | 198 | parser.add_argument( 199 | '--gpu-id', 200 | default=0, 201 | type=int, 202 | help='the ID of the GPU to run on' 203 | ) 204 | parser.add_argument( 205 | '--model', 206 | default=f"{INFERENCE_DIR}/../huggingface_models/Pythia-Chat-Base-7B", 207 | help='name/path of the model' 208 | ) 209 | parser.add_argument( 210 | '--max-tokens', 211 | default=128, 212 | type=int, 213 | help='the maximum number of tokens to generate' 214 | ) 215 | parser.add_argument( 216 | '--sample', 217 | default=True, 218 | action='store_true', 219 | help='indicates whether to sample' 220 | ) 221 | parser.add_argument( 222 | '--no-stream', 223 | action='store_true', 224 | help='indicates whether to stream tokens' 225 | ) 226 | parser.add_argument( 227 | '--temperature', 228 | default=0.6, 229 | type=float, 230 | help='temperature for the LM' 231 | ) 232 | parser.add_argument( 233 | '--top-k', 234 | default=40, 235 | type=int, 236 | help='top-k for the LM' 237 | ) 238 | parser.add_argument( 239 | '--retrieval', 240 | default=False, 241 | action='store_true', 
242 | help='augment queries with context from the retrieval index' 243 | ) 244 | parser.add_argument( 245 | '-g', 246 | '--gpu-vram', 247 | action='store', 248 | help='max VRAM to allocate per GPU', 249 | nargs='+', 250 | required=False, 251 | ) 252 | parser.add_argument( 253 | '-r', 254 | '--cpu-ram', 255 | default=None, 256 | type=int, 257 | help='max CPU RAM to allocate', 258 | required=False 259 | ) 260 | args = parser.parse_args() 261 | 262 | # set max_memory dictionary if given 263 | if args.gpu_vram is None: 264 | max_memory = None 265 | else: 266 | max_memory = {} 267 | for i in range(len(args.gpu_vram)): 268 | # assign CUDA ID as label and XGiB as value 269 | max_memory[int(args.gpu_vram[i].split(':')[0])] = f"{args.gpu_vram[i].split(':')[1]}GiB" 270 | 271 | if args.cpu_ram is not None: 272 | # add cpu to max-memory if given 273 | max_memory['cpu'] = f"{int(args.cpu_ram)}GiB" 274 | 275 | OpenChatKitShell( 276 | args.gpu_id, 277 | args.model, 278 | args.max_tokens, 279 | args.sample, 280 | args.temperature, 281 | args.top_k, 282 | args.retrieval, 283 | max_memory, 284 | not args.no_stream, 285 | ).cmdloop() 286 | 287 | 288 | if __name__ == '__main__': 289 | main() 290 | -------------------------------------------------------------------------------- /inference/conversation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | MEANINGLESS_WORDS = ['', '', '<|endoftext|>'] 5 | PRE_PROMPT = """\ 6 | Current Date: {} 7 | Current Time: {} 8 | 9 | """ 10 | 11 | def clean_response(response): 12 | for word in MEANINGLESS_WORDS: 13 | response = response.replace(word, "") 14 | response = response.strip("\n") 15 | return response 16 | 17 | class Conversation: 18 | def __init__(self, human_id, bot_id): 19 | cur_date = time.strftime('%Y-%m-%d') 20 | cur_time = time.strftime('%H:%M:%S %p %Z') 21 | 22 | self._human_id = human_id 23 | self._bot_id = bot_id 24 | self._prompt = PRE_PROMPT.format(cur_date, cur_time) 25 | 26 | def push_context_turn(self, context): 27 | # for now, context is represented as a human turn 28 | self._prompt += f"{self._human_id}: {context}\n" 29 | 30 | def push_human_turn(self, query): 31 | self._prompt += f"{self._human_id}: {query}\n" 32 | self._prompt += f"{self._bot_id}:" 33 | 34 | def push_model_response(self, response): 35 | has_finished = self._human_id in response 36 | bot_turn = response.split(f"{self._human_id}:")[0] 37 | bot_turn = clean_response(bot_turn) 38 | # if it is truncated, then append "..." to the end of the response 39 | if not has_finished: 40 | bot_turn += "..." 
41 | 42 | self._prompt += f"{bot_turn}\n" 43 | 44 | def get_last_turn(self): 45 | human_tag = f"{self._human_id}:" 46 | bot_tag = f"{self._bot_id}:" 47 | turns = re.split(f"({human_tag}|{bot_tag})\W?", self._prompt) 48 | return turns[-1] 49 | 50 | def get_raw_prompt(self): 51 | return self._prompt 52 | 53 | @classmethod 54 | def from_raw_prompt(cls, value): 55 | self._prompt = value 56 | -------------------------------------------------------------------------------- /pretrained/GPT-NeoX-20B/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "EleutherAI/gpt-neox-20b" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/Llama-2-7B-32K-beta/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 11 | parser.add_argument('--model-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', 12 | help='model-name') 13 | parser.add_argument('--save-dir', type=str, default=DIR, 14 | help='model-name') 15 | parser.add_argument('--offload-dir', type=str, default=None, 16 | help='directory to offload from memory') 17 | args = parser.parse_args() 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.mkdir(args.save_dir) 21 | save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) 22 | if not os.path.exists(save_path): 23 | os.mkdir(save_path) 24 | 25 | print('loading model from HF...') 26 | config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 27 | config.save_pretrained(save_path) 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 29 | tokenizer.save_pretrained(save_path) 30 | 31 | # offload model from memory to disk if offload-dir is specified 32 | if args.offload_dir is not None: 33 | if not os.path.exists(args.offload_dir): 34 | os.mkdir(args.offload_dir) 35 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) 36 | else: 37 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) 38 | print('loaded model from HF...') 39 | 40 | print('converting the embedding layer...') 41 | item = {} 42 | item['embed_tokens.weight'] = model.model.embed_tokens.weight 43 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 44 | print('converted the embedding layer.') 45 | 46 | for i in range(len(model.model.layers)): 47 | print(f'converting the {i}-th transformer layer...') 48 | torch.save(model.model.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 49 | print(f'converted the {i}-th transformer layer.') 50 | 51 | print('converting the lm_head layer...') 52 | item 
= {} 53 | item['lm_head.weight'] = model.lm_head.weight 54 | item['norm.weight'] = model.model.norm.weight 55 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 56 | print('converted the lm_head layer.') 57 | -------------------------------------------------------------------------------- /pretrained/Pythia-6.9B-deduped/prepare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "EleutherAI/pythia-6.9b-deduped" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/RedPajama-3B/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # Import the prepare_data function 5 | current_dir = os.path.dirname(os.path.abspath(__file__)) 6 | sys.path.append(os.path.join(current_dir, '..')) 7 | from prepare_pretrained import prepare_pretrained 8 | 9 | if __name__ == "__main__": 10 | model_name = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" 11 | save_path = os.path.join(current_dir, model_name.replace('/', '_')) 12 | prepare_pretrained(save_path, model_name) 13 | -------------------------------------------------------------------------------- /pretrained/RedPajama-7B/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 11 | parser.add_argument('--model-name', type=str, default='togethercomputer/RedPajama-INCITE-7B-Chat', 12 | help='model-name') 13 | parser.add_argument('--save-dir', type=str, default=DIR, 14 | help='model-name') 15 | parser.add_argument('--offload-dir', type=str, default=None, 16 | help='directory to offload from memory') 17 | args = parser.parse_args() 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.mkdir(args.save_dir) 21 | save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_')) 22 | if not os.path.exists(save_path): 23 | os.mkdir(save_path) 24 | 25 | print('loading model from HF...') 26 | config = AutoConfig.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 27 | config.save_pretrained(save_path) 28 | tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=USE_AUTH_TOKEN) 29 | tokenizer.save_pretrained(save_path) 30 | 31 | # offload model from memory to disk if offload-dir is specified 32 | if args.offload_dir is not None: 33 | if not os.path.exists(args.offload_dir): 34 | os.mkdir(args.offload_dir) 35 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir, use_auth_token=USE_AUTH_TOKEN) 36 | else: 37 | model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, use_auth_token=USE_AUTH_TOKEN) 38 | print('loaded model from HF...') 39 | 40 | print('converting the embedding layer...') 41 | 
42 | item = {} 43 | item['embed_in.weight'] = model.gpt_neox.embed_in.weight 44 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 45 | print('converted the embedding layer.') 46 | 47 | for i in range(len(model.gpt_neox.layers)): 48 | print(f'converting the {i}-th transformer layer...') 49 | torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 50 | print(f'converted the {i}-th transformer layer.') 51 | 52 | print('converting the lm_head layer...') 53 | item = {} 54 | item['embed_out.weight'] = model.embed_out.weight 55 | item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight 56 | item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias 57 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 58 | print('converted the lm_head layer.') 59 | -------------------------------------------------------------------------------- /pretrained/prepare_pretrained.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 5 | 6 | DIR = os.path.dirname(os.path.abspath(__file__)) 7 | USE_AUTH_TOKEN = False 8 | 9 | # Load pretrained model from HuggingFace and save it to disk 10 | def prepare_pretrained(save_path, model_name, offload_dir=None): 11 | os.makedirs(save_path, exist_ok=True) 12 | 13 | print('loading model from HF...') 14 | config = AutoConfig.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) 15 | config.save_pretrained(save_path) 16 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=USE_AUTH_TOKEN) 17 | tokenizer.save_pretrained(save_path) 18 | 19 | # offload model from memory to disk if offload-dir is specified 20 | if offload_dir is not None: 21 | os.makedirs(offload_dir, exist_ok=True) 22 | model = AutoModelForCausalLM.from_pretrained(model_name, 23 | torch_dtype=torch.float16, 24 | device_map="auto", 25 | offload_folder=offload_dir, 26 | use_auth_token=USE_AUTH_TOKEN) 27 | else: 28 | model = AutoModelForCausalLM.from_pretrained(model_name, 29 | torch_dtype=torch.float16, 30 | use_auth_token=USE_AUTH_TOKEN) 31 | print('loaded model from HF...') 32 | 33 | print('converting the embedding layer...') 34 | item = {} 35 | item['embed_in.weight'] = model.gpt_neox.embed_in.weight 36 | torch.save(item, os.path.join(save_path, 'pytorch_embs.pt')) 37 | print('converted the embedding layer.') 38 | 39 | for i in range(len(model.gpt_neox.layers)): 40 | print(f'converting the {i}-th transformer layer...') 41 | torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt')) 42 | print(f'converted the {i}-th transformer layer.') 43 | 44 | print('converting the lm_head layer...') 45 | item = {} 46 | item['embed_out.weight'] = model.embed_out.weight 47 | item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight 48 | item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias 49 | torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt')) 50 | print('converted the lm_head layer.') 51 | 52 | # python pretrained/prepare_pretrained.py --model-name EleutherAI/gpt-neox-125M --save-dir pretrained/files --offload-dir pretrained/files/offload 53 | def main(): 54 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 55 | parser.add_argument('--model-name', type=str, required=True, 56 | help='model-name') 57 | parser.add_argument('--save-dir', type=str, required=True, 58 | 
help='directory to save the prepared model') 59 | parser.add_argument('--offload-dir', type=str, default=None, 60 | help='directory to offload from memory') 61 | args = parser.parse_args() 62 | 63 | prepare_pretrained(args.save_dir, args.model_name, args.offload_dir) 64 | 65 | if __name__ == '__main__': 66 | main() -------------------------------------------------------------------------------- /retrieval/README.md: -------------------------------------------------------------------------------- 1 | # Retrieval-Enhanced Chatbot 2 | 3 | This is a demonstration of how to enhance a chatbot using Wikipedia. We'll be using [ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index](https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index) for this demo. Thanks to Christoph for providing this resource! 4 | 5 | In this demo, we'll be extending the approach by comparing the adjacent `w` sentences to the matched sentence and adding them to the context if their cosine similarity is larger than `w_th`. By doing so, we can provide the chatbot with a longer context, which may improve its performance. 6 | 7 | This demo combines both the above index and the chat model into one system. 8 | 9 | ## Start the combined server 10 | 11 | To get started, we need to install some dependencies and download the Wikipedia index: 12 | 13 | 0. Install dependencies 14 | 15 | Install the necessary dependencies, including `torch`, `transformers`, `flask`, `faiss`, and `fastparquet`. 16 | 17 | 1. Open up `wiki-server.py` and set `model_name_or_path` to point to the path that contains the chat 18 | model. 19 | 20 | 21 | 2. Start the retrieval server 22 | 23 | ```shell 24 | python wiki-server.py 25 | ``` 26 | 27 | The server will listen on port 7003. It will download the datasets from ChristophSchuhmann. This 28 | may take a few minutes. 29 | 30 | 3. Test the full retrieval-enhanced chatbot 31 | 32 | We now demonstrate both the wiki index and the fine-tuned GPT-NeoX model. 33 | 34 | ```curl -X POST -H 'Content-Type: application/json' http://127.0.0.1:7003/inference -d '{ "prompt" : "where is zurich located?" }'``` 35 | 36 | Internally, we first query the wiki index and generate a response using the provided model. To do 37 | this, we concatenate the retrieved information and the user's query into a prompt, 38 | encode it with a tokenizer, and generate a response using the chatbot model. 39 | 40 | The response should indicate the location of Zurich city. 41 | 42 | 43 | 4. To test just the retrieval functionality of the system, you can do the following. Curl works 44 | as well. 45 | 46 | ```python 47 | import requests 48 | 49 | endpoint = 'http://127.0.0.1:7003/search' 50 | res = requests.post(endpoint, json={ 51 | 'query': 'Where is Zurich?', 52 | 'k': 1, 53 | 'w': 5, 54 | 'w_th': 0.7, 55 | }) 56 | print(res.json()) 57 | ``` 58 | 59 | This should print the most relevant sentences about Zurich from Wikipedia. By increasing `w` and 60 | decreasing `w_th`, we can retrieve a longer context.
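To make the `w` / `w_th` behavior concrete, here is a small self-contained sketch of the windowing idea (the actual implementation, which re-embeds the neighboring sentences with the contriever model, is in `retrieval/wikipedia.py`; `expand_window` below is just an illustrative name):

```python
import numpy as np

def cos_sim(a, b):
    # cosine similarity between two 1-D embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def expand_window(sentences, embeddings, match_idx, w=5, w_th=0.7):
    """Grow the context around sentences[match_idx] while neighbors stay similar."""
    text = sentences[match_idx]
    # walk left, prepending up to w neighbors that are similar enough to the match
    for j in range(1, w + 1):
        i = match_idx - j
        if i < 0 or cos_sim(embeddings[i], embeddings[match_idx]) <= w_th:
            break
        text = sentences[i] + text
    # walk right, appending up to w neighbors that are similar enough to the match
    for j in range(1, w + 1):
        i = match_idx + j
        if i >= len(sentences) or cos_sim(embeddings[i], embeddings[match_idx]) <= w_th:
            break
        text += sentences[i]
    return text

# toy example with 2-D embeddings: the left neighbor is kept, the right one is dropped
sents = ["Zurich is in Switzerland. ", "It is the largest Swiss city. ", "Bananas are yellow. "]
embs = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
print(expand_window(sents, embs, 1, w=1, w_th=0.7))
```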
61 | 62 | 63 | -------------------------------------------------------------------------------- /retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/retrieval/__init__.py -------------------------------------------------------------------------------- /retrieval/wikipedia.py: -------------------------------------------------------------------------------- 1 | # This file was adapted from ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index: 2 | # https://huggingface.co/datasets/ChristophSchuhmann/wikipedia-3sentence-level-retrieval-index/blob/main/wikiindexquery.py 3 | # 4 | # The original file was licensed under the Apache 2.0 license. 5 | 6 | import os 7 | 8 | from transformers import AutoTokenizer, AutoModel 9 | import faiss 10 | import numpy as np 11 | import pandas as pd 12 | 13 | DIR = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | def mean_pooling(token_embeddings, mask): 17 | token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.) 18 | sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None] 19 | return sentence_embeddings 20 | 21 | def cos_sim_2d(x, y): 22 | norm_x = x / np.linalg.norm(x, axis=1, keepdims=True) 23 | norm_y = y / np.linalg.norm(y, axis=1, keepdims=True) 24 | return np.matmul(norm_x, norm_y.T) 25 | 26 | 27 | class WikipediaIndex: 28 | def __init__(self): 29 | path = os.path.join(DIR, '..', 'data', 'wikipedia-3sentence-level-retrieval-index', 'files') 30 | indexpath = os.path.join(path, 'knn.index') 31 | wiki_sentence_path = os.path.join(path, 'wikipedia-en-sentences.parquet') 32 | 33 | self._device = 'cuda' 34 | self._tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco') 35 | self._contriever = AutoModel.from_pretrained('facebook/contriever-msmarco').to(self._device) 36 | 37 | self._df_sentences = pd.read_parquet(wiki_sentence_path, engine='fastparquet') 38 | 39 | self._wiki_index = faiss.read_index(indexpath, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY) 40 | 41 | 42 | def search(self, query, k=1, w=5, w_th=0.5): 43 | inputs = self._tokenizer(query, padding=True, truncation=True, return_tensors='pt').to(self._device) 44 | outputs = self._contriever(**inputs) 45 | embeddings = mean_pooling(outputs[0], inputs['attention_mask']) 46 | 47 | query_vector = embeddings.cpu().detach().numpy().reshape(1, -1) 48 | 49 | distances, indices = self._wiki_index.search(query_vector, k) 50 | 51 | texts = [] 52 | for i, (dist, indice) in enumerate(zip(distances[0], indices[0])): 53 | text = self._df_sentences.iloc[indice]['text_snippet'] 54 | 55 | try: 56 | input_texts = [self._df_sentences.iloc[indice]['text_snippet']] 57 | for j in range(1, w+1): 58 | input_texts = [self._df_sentences.iloc[indice-j]['text_snippet']] + input_texts 59 | for j in range(1, w+1): 60 | input_texts = input_texts + [self._df_sentences.iloc[indice+j]['text_snippet']] 61 | 62 | inputs = self._tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt').to(self._device) 63 | 64 | outputs = self._contriever(**inputs) 65 | embeddings = mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() 66 | 67 | for j in range(1, w+1): 68 | if cos_sim_2d(embeddings[w-j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: 69 | text = self._df_sentences.iloc[indice-j]['text_snippet'] + text 70 | else: 71 | break 72 | 73 | for j in range(1, w+1): 74 | if 
cos_sim_2d(embeddings[w+j].reshape(1, -1), embeddings[w].reshape(1, -1)) > w_th: 75 | text += self._df_sentences.iloc[indice+j]['text_snippet'] 76 | else: 77 | break 78 | 79 | except Exception as e: 80 | print(e) 81 | 82 | texts.append(text) 83 | 84 | return texts 85 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Tools 2 | 3 | ## convert_to_hf_gptneox.py 4 | Converts a sharded checkpoint produced by OpenChatKit training (one `prank_<i>_checkpoint.pt` file per pipeline stage) into a Hugging Face GPT-NeoX checkpoint. See the `--ckpt-path`, `--save-path`, `--n-stages`, and `--n-layer-per-stage` arguments in the script. 5 | ## model_load_benchmark.py 6 | 7 | The command to run the model load benchmark tool is: 8 | ```shell 9 | $ python3 model_load_benchmark.py -i benchmark_input.json -o benchmark_results.json -d cuda:0 10 | ``` 11 | 12 | ``` 13 | usage: model_load_benchmark.py [-h] -i INPUT -o OUTPUT [-d DEVICE] [-r REPEAT_INFER] 14 | 15 | Benchmark downloading, loading, and running an inference for a set of ML models. 16 | 17 | optional arguments: 18 | -h, --help show this help message and exit 19 | -i INPUT, --input INPUT 20 | Input JSON file containing models to be benchmarked 21 | -o OUTPUT, --output OUTPUT 22 | Output JSON file with model benchmark results 23 | -d DEVICE, --device DEVICE 24 | CUDA device name, e.g. "cuda:0" 25 | -r REPEAT_INFER, --repeat-infer REPEAT_INFER 26 | Repeat inference for warm timings 27 | ``` 28 | 29 | The input file is a JSON file with the names and paths of the models to be tested. For example: 30 | ```JSON 31 | { 32 | "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", 33 | "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", 34 | "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", 35 | "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", 36 | "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" 37 | } 38 | ``` 39 | 40 | The output is a JSON file with the timings for: 41 | 1. tokenizer download time in seconds -- `tokenizer_download_sec` 42 | 2. tokenizer load time in seconds -- `tokenizer_load_sec` 43 | 3. model download time -- `model_download_sec` 44 | 4. model load to RAM time -- `model_load_to_ram_sec` 45 | 5. model transfer to GPU time -- `model_transfer_to_gpu_sec` 46 | 6. inference time (input is "Hello, world!") -- `inference_sec` 47 | 7. total time (sum of all the above) -- `total_sec` 48 | 8. inference time from a warm start (the average of running inference `REPEAT_INFER` times) -- `inference_warm_sec` 49 | 9. model main memory footprint in MB -- `model_main_memory_MB` 50 | 10. 
model GPU memory footprint in MB -- `model_gpu_memory_MB` 51 | 52 | An example of the output is: 53 | ```JSON 54 | { 55 | "GPT-JT-6B-v1": { 56 | "tokenizer_download_sec": 1.52, 57 | "tokenizer_load_sec": 0.10, 58 | "model_download_sec": 124.70, 59 | "model_load_to_ram_sec": 127.81, 60 | "model_main_memory_MB": 12297.10, 61 | "model_transfer_to_gpu_sec": 3.29, 62 | "model_gpu_memory_MB": 12219.74, 63 | "inference_sec": 0.93, 64 | "inference_warm_sec": 0.047, 65 | "total_sec": 258.38 66 | } 67 | } 68 | ``` -------------------------------------------------------------------------------- /tools/benchmark_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B", 3 | "Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B", 4 | "GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B", 5 | "GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1", 6 | "GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0" 7 | } -------------------------------------------------------------------------------- /tools/convert_to_hf_gptneox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import argparse 5 | 6 | from transformers import GPTNeoXForCausalLM 7 | 8 | from transformers import AutoConfig, AutoTokenizer 9 | 10 | from transformers.modeling_utils import no_init_weights 11 | import os 12 | 13 | 14 | def create_empty_gptneox(config): 15 | 16 | import torch 17 | import torch.nn as nn 18 | 19 | _reset_parameters_linear = nn.Linear.reset_parameters 20 | def dummy(*args, **kargs): 21 | pass 22 | nn.Linear.reset_parameters = dummy 23 | 24 | # 1. disable init for faster initialization 25 | # 2. avoid tie token embeddings with lm_head, as we train them separately. 
26 | with no_init_weights(_enable=True): 27 | model = GPTNeoXForCausalLM(config).eval() 28 | 29 | nn.Linear.reset_parameters = _reset_parameters_linear 30 | 31 | return model 32 | 33 | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=14): 34 | input_path = checkpoint_path 35 | 36 | assert n_stages * n_layer_per_stage >= len(model.gpt_neox.layers) 37 | # assert model.lm_head.weight.data is not model.transformer.wte.weight.data 38 | 39 | for i in range(n_stages): 40 | 41 | print(f'loading stage {i}') 42 | 43 | checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) 44 | 45 | if i == 0: 46 | _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} 47 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) 48 | model.gpt_neox.embed_in.weight.data[:] = _tmp['embed_in.weight'] 49 | 50 | for j in range(n_layer_per_stage): 51 | _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} 52 | if len(_tmp) == 0: 53 | break 54 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) 55 | model.gpt_neox.layers[j].load_state_dict(_tmp) 56 | 57 | elif i == n_stages - 1: 58 | for j in range(n_layer_per_stage): 59 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 60 | if len(_tmp) == 0: 61 | break 62 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 63 | model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) 64 | if i*n_layer_per_stage + j == len(model.gpt_neox.layers) - 1: 65 | j += 1 66 | break 67 | 68 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 69 | if len(_tmp) == 0: 70 | break 71 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) 72 | model.gpt_neox.final_layer_norm.weight.data[:] = _tmp['final_layer_norm.weight'] 73 | model.gpt_neox.final_layer_norm.bias.data[:] = _tmp['final_layer_norm.bias'] 74 | model.embed_out.weight.data[:] = _tmp['embed_out.weight'] 75 | if 'embed_out.bias' in _tmp: 76 | model.embed_out.bias.data[:] = _tmp['embed_out.bias'] 77 | 78 | else: 79 | for j in range(n_layer_per_stage): 80 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 81 | if len(_tmp) == 0: 82 | break 83 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 84 | model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp) 85 | 86 | return model 87 | 88 | 89 | if __name__ == '__main__': 90 | 91 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 92 | parser.add_argument('--config-name', type=str, default='EleutherAI/gpt-neox-20b', 93 | help='config-name') 94 | parser.add_argument('--ckpt-path', type=str, default=None, 95 | help='ckpt-path') 96 | parser.add_argument('--save-path', type=str, default=None, 97 | help='save-path') 98 | parser.add_argument('--n-stages', type=int, default=8, 99 | help='pipeline group size') 100 | parser.add_argument('--n-layer-per-stage', type=int, default=6, 101 | help='n layers per GPU device') 102 | parser.add_argument('--fp16', default=False, action='store_true') 103 | args = parser.parse_args() 104 | 105 | assert args.ckpt_path is not None 106 | assert args.save_path is not None 107 | 108 | os.makedirs(args.save_path, exist_ok=True) 109 | 110 | print('loading config...') 111 | config = AutoConfig.from_pretrained(args.config_name) 112 | print('loaded config.') 113 | 
print('loading tokenizer...') 114 | tokenizer = AutoTokenizer.from_pretrained(args.config_name) 115 | print('loaded tokenizer.') 116 | print('creating empty model...') 117 | model = create_empty_gptneox(config) 118 | if args.fp16: 119 | model = model.half() 120 | print('created empty model.') 121 | print('loading model ckpt...') 122 | load_decentralized_checkpoint( 123 | model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, 124 | ) 125 | print('loaded model ckpt.') 126 | 127 | print('saving HF model...') 128 | model.save_pretrained(args.save_path) 129 | print(f'saved HF model to `{args.save_path}`') 130 | config.save_pretrained(args.save_path) 131 | tokenizer.save_pretrained(args.save_path) 132 | 133 | -------------------------------------------------------------------------------- /tools/convert_to_hf_llama.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from transformers import LlamaForCausalLM 9 | from transformers import AutoConfig, AutoTokenizer 10 | 11 | from transformers.modeling_utils import no_init_weights 12 | import os 13 | 14 | 15 | def create_emtpy_llama(config): 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | _reset_parameters_linear = nn.Linear.reset_parameters 21 | def dummy(*args, **kargs): 22 | pass 23 | nn.Linear.reset_parameters = dummy 24 | 25 | # 1. disable init for faster initialization 26 | # 2. avoid tie token embeddings with lm_head, as we train them separately. 27 | with no_init_weights(_enable=True): 28 | model = LlamaForCausalLM(config).eval() 29 | 30 | nn.Linear.reset_parameters = _reset_parameters_linear 31 | 32 | return model 33 | 34 | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_per_stage=16, ): 35 | input_path = checkpoint_path 36 | 37 | n_layers = len(model.model.layers) 38 | assert n_stages * n_layer_per_stage >= len(model.model.layers) 39 | # assert model.lm_head.weight.data is not model.transformer.wte.weight.data 40 | 41 | for i in range(n_stages): 42 | 43 | print(f'loading stage {i}') 44 | 45 | checkpoint = torch.load(os.path.join(input_path, f'prank_{i}_checkpoint.pt'), map_location=torch.device("cpu")) 46 | 47 | if i == 0: 48 | _tmp = {k[len(f"{0}."):]:v for k,v in checkpoint.items() if k.startswith(f"0.")} 49 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_embs.pt')) 50 | model.model.embed_tokens.weight.data[:] = _tmp['embed_tokens.weight'] 51 | 52 | for j in range(n_layer_per_stage): 53 | _tmp = {k[len(f"{j+1}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j+1}.")} 54 | if len(_tmp) == 0: 55 | break 56 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{j}.pt')) 57 | ret = model.model.layers[j].load_state_dict(_tmp, strict=False) 58 | if len(ret.missing_keys): 59 | print('The following weight keys are missing:') 60 | print(ret.missing_keys) 61 | if len(ret.unexpected_keys): 62 | print('The following weight keys are unexpected:') 63 | print(ret.unexpected_keys) 64 | 65 | elif i == n_stages - 1: 66 | for j in range(n_layer_per_stage): 67 | if i*n_layer_per_stage + j == n_layers: 68 | break 69 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 70 | if len(_tmp) == 0: 71 | break 72 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 73 | ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) 74 | if 
len(ret.missing_keys): 75 | print('The following weight keys are missing:') 76 | print(ret.missing_keys) 77 | if len(ret.unexpected_keys): 78 | print('The following weight keys are unexpected:') 79 | print(ret.unexpected_keys) 80 | else: 81 | j += 1 82 | 83 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 84 | if len(_tmp) == 0: 85 | break 86 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_lm_head.pt')) 87 | model.model.norm.weight.data[:] = _tmp['norm.weight'] 88 | if 'norm.bias' in _tmp: 89 | model.model.norm.bias.data[:] = _tmp['norm.bias'] 90 | model.lm_head.weight.data[:] = _tmp['lm_head.weight'] 91 | if 'lm_head.bias' in _tmp: 92 | model.lm_head.bias.data[:] = _tmp['lm_head.bias'] 93 | 94 | else: 95 | for j in range(n_layer_per_stage): 96 | _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")} 97 | if len(_tmp) == 0: 98 | break 99 | # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt')) 100 | ret = model.model.layers[i*n_layer_per_stage + j].load_state_dict(_tmp, strict=False) 101 | if len(ret.missing_keys): 102 | print('The following weight keys are missing:') 103 | print(ret.missing_keys) 104 | if len(ret.unexpected_keys): 105 | print('The following weight keys are unexpected:') 106 | print(ret.unexpected_keys) 107 | 108 | return model 109 | 110 | 111 | if __name__ == '__main__': 112 | 113 | parser = argparse.ArgumentParser(description='Convert HF checkpoints') 114 | parser.add_argument('--config-name', type=str, default='togethercomputer/Llama-2-7B-32K-beta', 115 | help='config-name') 116 | parser.add_argument('--ckpt-path', type=str, default=None, 117 | help='ckpt-path') 118 | parser.add_argument('--save-path', type=str, default=None, 119 | help='save-path') 120 | parser.add_argument('--n-stages', type=int, default=8, 121 | help='pipeline group size') 122 | parser.add_argument('--n-layer-per-stage', type=int, default=4, 123 | help='n layers per GPU device') 124 | parser.add_argument('--fp16', default=False, action='store_true') 125 | args = parser.parse_args() 126 | 127 | assert args.ckpt_path is not None 128 | assert args.save_path is not None 129 | 130 | if not os.path.exists(args.save_path): 131 | os.mkdir(args.save_path) 132 | 133 | # LlamaForCausalLM LlamaConfig LlamaTokenizer 134 | print('loading config...') 135 | config = AutoConfig.from_pretrained(args.config_name) 136 | print('loaded config.') 137 | print('loading tokenizer...') 138 | tokenizer = AutoTokenizer.from_pretrained(args.config_name) 139 | print('loaded tokenizer.') 140 | print('creating empty model...') 141 | model = create_emtpy_llama(config) 142 | if args.fp16: 143 | model = model.half() 144 | print('created empty model.') 145 | print('loading model ckpt...') 146 | load_decentralized_checkpoint( 147 | model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage, 148 | ) 149 | print('loaded model ckpt.') 150 | 151 | print('saving HF model...') 152 | model.save_pretrained(args.save_path) 153 | print(f'saved HF model to `{args.save_path}`') 154 | config.save_pretrained(args.save_path) 155 | tokenizer.save_pretrained(args.save_path) 156 | -------------------------------------------------------------------------------- /tools/model_load_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import time 4 | import torch 5 | import torchvision 6 | import os 7 | import re 8 | import psutil 9 | from 
transformers import AutoTokenizer, AutoModelForCausalLM 10 | 11 | # Benchmark download, tokenize, load, inference time. 12 | def benchmark(model_dict: dict, device_name: str, repeat_infer: int): 13 | 14 | # Initialize the benchmark results dictionary 15 | results_dict = {} 16 | 17 | # Check that we have CUDA GPUs available before running the benchmark 18 | if not torch.cuda.is_available(): 19 | print("ERROR: CUDA GPUs are not available, benchmark not run") 20 | return results_dict 21 | 22 | device = torch.device(device_name) 23 | 24 | process = psutil.Process() 25 | 26 | print(f'Using device {device}') 27 | 28 | # Loop through the models to test 29 | for model_name, model_path in model_dict.items(): 30 | # purge unused cached memory 31 | torch.cuda.empty_cache() 32 | 33 | print(f"Testing model: {model_name}") 34 | 35 | # Measure the time it takes to download the tokenizer data and load the tokenizer 36 | tokenizer_download_start_time = time.time() 37 | tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True) 38 | tokenizer_download_end_time = time.time() 39 | 40 | tokenizer = None 41 | 42 | # Measure the time it takes to load the tokenizer 43 | tokenizer_load_start_time = time.time() 44 | tokenizer = AutoTokenizer.from_pretrained(model_path) 45 | tokenizer_load_end_time = time.time() 46 | 47 | tokenizer_load_sec = tokenizer_load_end_time - tokenizer_load_start_time 48 | tokenizer_download_sec = tokenizer_download_end_time - tokenizer_download_start_time - tokenizer_load_sec 49 | 50 | print(f"Testing model: {model_name} --- tokenizer download time = {tokenizer_download_sec:.3} sec") 51 | print(f"Testing model: {model_name} --- tokenize load time = {tokenizer_load_sec:.3} sec") 52 | 53 | # Measure the time it takes to download and load the model into main memory 54 | model_download_start_time = time.time() 55 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True, force_download=True) 56 | model_download_end_time = time.time() 57 | 58 | model = None 59 | 60 | # Measure the time it takes to load the model into main memory 61 | memory_used_main_start = process.memory_info().rss 62 | model_load_to_ram_start_time = time.time() 63 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True) 64 | model_load_to_ram_end_time = time.time() 65 | memory_used_main_end = process.memory_info().rss 66 | 67 | model_load_to_ram_sec = model_load_to_ram_end_time - model_load_to_ram_start_time 68 | model_download_sec = model_download_end_time - model_download_start_time - model_load_to_ram_sec 69 | model_main_memory_bytes = memory_used_main_end - memory_used_main_start 70 | 71 | print(f"Testing model: {model_name} --- model download time = {model_download_sec:.3} sec") 72 | print(f"Testing model: {model_name} --- model load to RAM time = {model_load_to_ram_sec:.3} sec") 73 | print(f"Testing model: {model_name} --- model main memory size = {model_main_memory_bytes} bytes") 74 | 75 | # Measure the time it takes to load the model from main memory to the GPU 76 | gpu_memory_start = torch.cuda.memory_allocated(device) 77 | model_xfer_to_gpu_start_time = time.time() 78 | model = model.to(device) 79 | model_xfer_to_gpu_end_time = time.time() 80 | gpu_memory_end = torch.cuda.memory_allocated(device) 81 | 82 | model_xfer_to_gpu_sec = model_xfer_to_gpu_end_time - model_xfer_to_gpu_start_time 83 | model_gpu_memory_bytes = gpu_memory_end - gpu_memory_start 84 | 85 | print(f"Testing model: {model_name} --- model 
transfer to GPU time = {model_xfer_to_gpu_sec:.3} sec") 86 | print(f"Testing model: {model_name} --- model GPU memory size = {model_gpu_memory_bytes} bytes") 87 | 88 | # Measure the time it takes to run inference from a cold start 89 | inference_start_time = time.time() 90 | inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) 91 | outputs = model(**inputs) 92 | inference_end_time = time.time() 93 | inference_sec = inference_end_time - inference_start_time 94 | 95 | print(f"Testing model: {model_name} --- inference time = {inference_sec:.3} sec") 96 | 97 | # Measure the average time it takes to run inference from a warm start 98 | inference_warm_start_time = time.time() 99 | for i in range(0, repeat_infer): 100 | inputs = tokenizer("Hello, world!", return_tensors="pt").to(device) 101 | outputs = model(**inputs) 102 | inference_warm_end_time = time.time() 103 | inference_warm_sec = (inference_warm_end_time - inference_warm_start_time) / float(repeat_infer) 104 | 105 | print(f"Testing model: {model_name} --- inference warm time = {inference_warm_sec:.3} sec") 106 | 107 | total_sec = tokenizer_download_sec + tokenizer_load_sec + model_download_sec + model_load_to_ram_sec + model_xfer_to_gpu_sec + inference_sec 108 | 109 | print(f"Testing model: {model_name} --- total time = {total_sec:.3} sec") 110 | 111 | # Add the results to the dictionary 112 | results_dict[model_name] = { 113 | "tokenizer_download_sec": tokenizer_download_sec, 114 | "tokenizer_load_sec": tokenizer_load_sec, 115 | "model_download_sec": model_download_sec, 116 | "model_load_to_ram_sec": model_load_to_ram_sec, 117 | "model_main_memory_MB": float(model_main_memory_bytes) / 1000000.0, 118 | "model_transfer_to_gpu_sec": model_xfer_to_gpu_sec, 119 | "model_gpu_memory_MB": float(model_gpu_memory_bytes) / 1000000.0, 120 | "inference_sec": inference_sec, 121 | "inference_warm_sec": inference_warm_sec, 122 | "total_sec": total_sec 123 | } 124 | 125 | # Unload the model 126 | model = None 127 | torch.cuda.empty_cache() 128 | 129 | return results_dict 130 | 131 | # Define the main function 132 | def main(input_file: str, output_file: str, device_name: str, repeat_infer: int): 133 | 134 | # Load the models to test from the input JSON file 135 | with open(input_file, "r") as f: 136 | model_dict = json.load(f) 137 | 138 | # Run the benchmark 139 | results_dict = benchmark(model_dict, device_name, repeat_infer) 140 | 141 | # Write the results to the JSON output file 142 | # use a regular expression to apply formatting to floating point values 143 | json_data = re.sub('"(.*?)":\s*(0\.0*\d{2}|\d+\.\d{2})\d*(,?\n)', '"\\1": \\2\\3', json.dumps(results_dict, indent=4)) 144 | with open(output_file, 'w') as f: 145 | f.write(json_data) 146 | 147 | if __name__ == "__main__": 148 | # Create an argument parser 149 | parser = argparse.ArgumentParser(description='Benchmark downloading, loading, and running an inference for a set of ML models.') 150 | parser.add_argument('-i', '--input', required=True, help='Input JSON file containing models to be benchmarked') 151 | parser.add_argument('-o', '--output', required=True, help='Output JSON file with model benchmark results') 152 | parser.add_argument('-d', '--device', required=False, default='cuda:0', help='CUDA device name, e.g. 
"cuda:0"') 153 | parser.add_argument('-r', '--repeat-infer', required=False, default=30, help='Repeat inferrence for warm timings') 154 | 155 | # Parse the command line arguments 156 | args = parser.parse_args() 157 | 158 | # Process the data 159 | main(args.input, args.output, args.device, max(args.repeat_infer, 1)) -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | # OpenChatKit Training 2 | 3 | This directory contains code for training a chat model using OpenChatKit. The main training script is `finetune_GPT-NeoXT-Chat-Base-20B.sh`. 4 | 5 | To customize training, make a copy of the script and modify the arguments. 6 | 7 | ## Arguments 8 | 9 | Environment vars that should be set: 10 | ```bash 11 | export GLOO_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` 12 | export NCCL_SOCKET_IFNAME=lo # this interface should be consistent to `--net-interface` 13 | export WANDB_NAME=gptj-test # wandb run name 14 | ``` 15 | 16 | The following arguments should be carefully set: 17 | - `--model-name`: The path of model ckpt sharded by layers. 18 | - `--tokenizer-name`: Usually the same to `--model-name`. You can also use HF's model name. 19 | - `--model-type`: Indicate the model type. {gptj}. More model types will be added soon. 20 | - `--num-layers`: Number of Transformer layers **for each GPU**. E.g. GPT-J has 28 layers, if we use two GPUs to form a pipeline, `--num-layers` should be 14. 21 | - `--embedding-dim`: The hidden size of the model. GPT-J-6B is 4096. This is used to create buffers. 22 | - `--dist-url`: URL of rank 0 worker (master). It is the same to all workers. And this URL should be accessible by all workers. For local training (single machine multiple GPUs), this can be like `--dist-url tcp://127.0.0.1:7033` 23 | - `--world-size`: The total number of workers. `world-size == pipeline-group-size * data-group-size` 24 | - `--pipeline-group-size`: Number of GPU workers for each pipeline 25 | - `--data-group-size`: Number of data parallel workers. Also the number of pipelines. 26 | - `--net-interface`: Network interface. Should be consistent with `GLOO_SOCKET_IFNAME` and `NCCL_SOCKET_IFNAME`. 27 | 28 | The following arguments can be tuned / changed: 29 | - `--train-log-backend `: How to log the training info. {print, loguru, wandb}. 30 | - `--optimizer`: Optimizer type. {adam, 8bit-adam} (8bit-adam requires `pip install bitsandbytes`) 31 | - `--load-pretrained-model`: Whether to load model weights. Usually `true`. 32 | - `--task-name`: The task name or the path of a `jsonl` file. For multi-task training separate task names by `,`. 33 | There is an optional sampling weight after each task name, separated by `:` (default is 1.0). Sampling weights will be normalized. 34 | E.g. it should be like `--task-name cot:0.1,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0,/path_task0.jsonl:1.0`. 35 | The number after the colon indicates the sampling weight for the task during training. For example, `cot:0.1` means the `cot` task will be sampled with a weight of 0.1. 36 | - `--checkpoint-path`: Path to save fine-tuned checkpoints. 37 | - `--checkpoint-steps`: Save ckpt every `checkpoint-steps`. 38 | - `--total-steps`: Total number of steps for training. (This counts all `gradient-accumulate-step`s.) 39 | - `--warmup-steps`: LR warmup steps. 
40 | - `--lr`: Learning rate. 41 | - `--seq-length`: Sequence length. 42 | - `--batch-size`: Batch size for each GPU device (for each gradient accumulation step). 43 | - `--micro-batch-size`: Micro batch size for pipeline parallelism. 1 works fine. 44 | - `--gradient-accumulate-step`: Accumulate gradients for several steps before updating parameters. This is another way to achieve large batch sizes when GPU memory is insufficient. 45 | 46 | The following arguments usually do not change: 47 | - `--dp-backend`: {nccl, gloo}, default nccl. 48 | - `--dp-mode`: {allreduce}. 49 | - `--fp16`: Flag to enable FP16 mixed precision training. It should always be added in the current implementation. 50 | - `--pp-mode`: Always `gpipe`. 51 | - `--profiling`: {no-profiling, tidy_profiling}. `tidy_profiling` will generate profiling JSON files. 52 | 53 | ## Adding Your Own Data to the DATASETS 54 | 55 | To add your own data to the training process, you should create a `jsonl` file where each line is a JSON object representing a single training example. Once you have your `jsonl` file, you can include it in the `--task-name` argument with an appropriate sampling weight. For instance, if your file is located at `/path_to_your_data/your_data.jsonl` and you wish to give it a sampling weight of 0.5, you would add `/path_to_your_data/your_data.jsonl:0.5` to the `--task-name` argument. 56 | 57 | If you have any questions or need further assistance, please refer to the [OpenDataHub](https://github.com/togethercomputer/OpenDataHub) repository or contact us through our [website](https://www.together.ai/contact). 58 | -------------------------------------------------------------------------------- /training/comm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/comm/__init__.py -------------------------------------------------------------------------------- /training/comm/comm_utils.py: -------------------------------------------------------------------------------- 1 | from .torch_backend import * 2 | from .nccl_backend import * 3 | 4 | _DATA_PARALLEL_COMM = None 5 | _DATA_PARALLEL_RANK = None 6 | _DATA_PARALLEL_WORLD_SIZE = None 7 | 8 | _PIPELINE_PARALLEL_COMM = None 9 | _PIPELINE_PARALLEL_RANK = None 10 | _PIPELINE_PARALLEL_WORLD_SIZE = None 11 | 12 | _TENSOR_PARALLEL_COMM = None 13 | _TENSOR_PARALLEL_RANK = None 14 | _TENSOR_PARALLEL_WORLD_SIZE = None 15 | 16 | import threading 17 | 18 | _LOCK = threading.RLock() 19 | 20 | def get_lock(): 21 | return _LOCK 22 | 23 | def get_data_parallel_comm() -> NCCLCommunicator: 24 | assert _DATA_PARALLEL_COMM is not None 25 | return _DATA_PARALLEL_COMM 26 | 27 | 28 | def get_data_parallel_rank() -> int: 29 | assert _DATA_PARALLEL_RANK is not None 30 | return _DATA_PARALLEL_RANK 31 | 32 | 33 | def get_data_parallel_world_size() -> int: 34 | assert _DATA_PARALLEL_WORLD_SIZE is not None 35 | return _DATA_PARALLEL_WORLD_SIZE 36 | 37 | 38 | def get_pipeline_parallel_comm() -> NCCLCommunicator: 39 | assert _PIPELINE_PARALLEL_COMM is not None 40 | return _PIPELINE_PARALLEL_COMM 41 | 42 | 43 | def get_pipeline_parallel_rank() -> int: 44 | assert _PIPELINE_PARALLEL_RANK is not None 45 | return _PIPELINE_PARALLEL_RANK 46 | 47 | 48 | def get_pipeline_parallel_world_size() -> int: 49 | assert _PIPELINE_PARALLEL_WORLD_SIZE is not None 50 | return _PIPELINE_PARALLEL_WORLD_SIZE 51 | 52 | 53 | def get_megatron_tensor_parallel_comm() -> NCCLCommunicator: 54 | 
assert _TENSOR_PARALLEL_COMM is not None 55 | return _TENSOR_PARALLEL_COMM 56 | 57 | 58 | def get_megatron_tensor_parallel_rank() -> int: 59 | assert _TENSOR_PARALLEL_RANK is not None 60 | return _TENSOR_PARALLEL_RANK 61 | 62 | 63 | def get_megatron_tensor_parallel_world_size() -> int: 64 | assert _TENSOR_PARALLEL_WORLD_SIZE is not None 65 | return _TENSOR_PARALLEL_WORLD_SIZE 66 | 67 | 68 | def default_init(args): 69 | import datetime 70 | import time 71 | try: 72 | dist.destroy_process_group() 73 | # the first time will raise exception, so the following code is skipped. 74 | print('destroy comm, increase port for 1. (this could cause problem)') 75 | url = ':'.join(args.dist_url.split(':')[:-1]) 76 | port = int(args.dist_url.split(':')[-1]) + 1 77 | args.dist_url = f"{url}:{port}" 78 | print(f"new master url: {args.dist_url}") 79 | except: 80 | pass 81 | dist.init_process_group(backend='gloo', timeout=datetime.timedelta(seconds=5*60), init_method=args.dist_url, world_size=args.world_size, rank=args.rank) 82 | 83 | 84 | def init_communicators(args): 85 | default_init(args) 86 | assert args.world_size == args.data_group_size * args.pipeline_group_size 87 | if args.world_size == args.data_group_size * args.pipeline_group_size: 88 | # We do the following hard code alignment of communication groups: 89 | # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), 90 | # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: 91 | # pipeline parallel: , 92 | # data parallel: , , , 93 | # assert args.world_size == args.data_group_size * args.pipeline_group_size 94 | global _DATA_PARALLEL_COMM 95 | global _PIPELINE_PARALLEL_COMM 96 | global _DATA_PARALLEL_RANK 97 | global _PIPELINE_PARALLEL_RANK 98 | global _DATA_PARALLEL_WORLD_SIZE 99 | global _PIPELINE_PARALLEL_WORLD_SIZE 100 | # We use pipeline parallel by default. 
101 | _PIPELINE_PARALLEL_WORLD_SIZE = args.pipeline_group_size 102 | _PIPELINE_PARALLEL_RANK = args.rank % args.pipeline_group_size 103 | _PIPELINE_PARALLEL_COMM = NCCLCommunicator(_PIPELINE_PARALLEL_RANK, args.cuda_id, args.pipeline_group_size, 104 | "pipeline_group_"+str(args.rank // args.pipeline_group_size)) 105 | if args.data_group_size != 1: 106 | _DATA_PARALLEL_WORLD_SIZE = args.data_group_size 107 | _DATA_PARALLEL_RANK = args.rank // args.pipeline_group_size 108 | 109 | dp_backend = getattr(args, 'dp_backend', 'gloo') 110 | if dp_backend == 'nccl': 111 | 112 | _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, 113 | "data_group_"+str(args.rank % args.pipeline_group_size)) 114 | 115 | elif dp_backend == 'gloo': 116 | 117 | for i in range(args.pipeline_group_size): 118 | ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] 119 | print(args.rank, ranks) 120 | data_group = torch.distributed.new_group(ranks, backend='gloo') 121 | if args.rank in ranks: 122 | def to_global_rank(dp_rank): 123 | rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size 124 | # print(f"{dp_rank} --> {rank}") 125 | return rank 126 | _DATA_PARALLEL_COMM = TorchCommunicator( 127 | data_group, to_global_rank=to_global_rank, 128 | dp_rank=_DATA_PARALLEL_RANK, 129 | comm_group_size=args.data_group_size,) 130 | 131 | else: 132 | assert False 133 | 134 | print('comm init done!!') 135 | 136 | # elif args.world_size == args.data_group_size * args.tensor_group_size: 137 | # global _DATA_PARALLEL_COMM 138 | # global _TENSOR_PARALLEL_COMM 139 | # global _DATA_PARALLEL_RANK 140 | # global _TENSOR_PARALLEL_RANK 141 | # global _DATA_PARALLEL_WORLD_SIZE 142 | # global _TENSOR_PARALLEL_WORLD_SIZE 143 | # We use megatron tensor parallel by default. 
144 | # _TENSOR_PARALLEL_WORLD_SIZE = args.tensor_group_size 145 | # _TENSOR_PARALLEL_RANK = args.rank % args.tensor_group_size 146 | # _TENSOR_PARALLEL_COMM = NCCLCommunicator(_TENSOR_PARALLEL_RANK, args.cuda_id, args.tensor_group_size, 147 | # "tensor_group_" + str(args.rank // args.tensor_group_size)) 148 | # if args.data_group_size != 1: 149 | # _DATA_PARALLEL_WORLD_SIZE = args.data_group_size 150 | # _DATA_PARALLEL_RANK = args.rank // args.tensor_group_size 151 | # _DATA_PARALLEL_COMM = NCCLCommunicator(_DATA_PARALLEL_RANK, args.cuda_id, args.data_group_size, 152 | # "data_group_" + str(args.rank % args.tensor_group_size)) 153 | else: 154 | print("Not supported yet") 155 | assert False 156 | 157 | 158 | 159 | def reinit_dp_communicator(args): 160 | 161 | print('###### reinit start #######') 162 | 163 | default_init(args) 164 | assert args.world_size == args.data_group_size * args.pipeline_group_size 165 | if args.world_size == args.data_group_size * args.pipeline_group_size: 166 | # We do the following hard code alignment of communication groups: 167 | # Suppose there are 8 instances (world_size), and 4 data parallel groups (data_group_size is 2), 168 | # Then there would be 2 pipeline parallel groups (pipeline_group_size is 4), then the groups will look like: 169 | # pipeline parallel: , 170 | # data parallel: , , , 171 | # assert args.world_size == args.data_group_size * args.pipeline_group_size 172 | global _DATA_PARALLEL_COMM 173 | global _PIPELINE_PARALLEL_COMM 174 | global _DATA_PARALLEL_RANK 175 | global _PIPELINE_PARALLEL_RANK 176 | global _DATA_PARALLEL_WORLD_SIZE 177 | global _PIPELINE_PARALLEL_WORLD_SIZE 178 | 179 | if args.data_group_size != 1: 180 | 181 | dp_backend = getattr(args, 'dp_backend', 'gloo') 182 | if dp_backend == 'nccl': 183 | 184 | raise Exception('NCCL cannot reinit.') 185 | 186 | elif dp_backend == 'gloo': 187 | 188 | for i in range(args.pipeline_group_size): 189 | ranks = [rank for rank in range(i, args.world_size, args.pipeline_group_size)] 190 | print(args.rank, ranks) 191 | data_group = torch.distributed.new_group(ranks, backend='gloo') 192 | if args.rank in ranks: 193 | def to_global_rank(dp_rank): 194 | rank = _PIPELINE_PARALLEL_RANK + dp_rank * args.pipeline_group_size 195 | # print(f"{dp_rank} --> {rank}") 196 | return rank 197 | _DATA_PARALLEL_COMM = TorchCommunicator( 198 | data_group, to_global_rank=to_global_rank, 199 | dp_rank=_DATA_PARALLEL_RANK, 200 | comm_group_size=args.data_group_size,) 201 | 202 | else: 203 | assert False 204 | 205 | print('######## dp comm reinit done!! 
########') -------------------------------------------------------------------------------- /training/comm/nccl_backend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cupy 4 | import cupy.cuda.nccl 5 | import torch.distributed as dist 6 | from typing import List 7 | 8 | 9 | def _type_torch_to_cupy(torch_type: torch.dtype): 10 | # print(torch_type) 11 | mappings = { 12 | torch.uint8: cupy.cuda.nccl.NCCL_UINT8, 13 | torch.int32: cupy.cuda.nccl.NCCL_INT32, 14 | torch.int64: cupy.cuda.nccl.NCCL_INT64, 15 | torch.int: cupy.cuda.nccl.NCCL_INT, 16 | torch.float16: cupy.cuda.nccl.NCCL_FLOAT16, 17 | torch.float32: cupy.cuda.nccl.NCCL_FLOAT32, 18 | torch.float64: cupy.cuda.nccl.NCCL_FLOAT64, 19 | torch.float: cupy.cuda.nccl.NCCL_FLOAT 20 | } 21 | return mappings[torch_type] 22 | 23 | 24 | class NCCLCommunicator: 25 | def __init__(self, 26 | comm_rank: int, 27 | cuda_id: int, 28 | comm_group_size: int, 29 | comm_name: str): 30 | self.comm_rank = comm_rank 31 | cupy.cuda.Device(cuda_id).use() 32 | self.comm_group_size = comm_group_size 33 | print("Initialize NCCLCommunicator: <", comm_name, ">; rank:", comm_rank) 34 | self.dist_store = dist.distributed_c10d._get_default_store() 35 | 36 | if self.comm_rank == 0: 37 | cuda_id = cupy.cuda.nccl.get_unique_id() 38 | # print(cuda_id) 39 | cuda_id_str = np.array(cuda_id).tobytes() 40 | self.dist_store.set('group-'+comm_name+'-unique-id', cuda_id_str) 41 | # print("Master put .") 42 | else: 43 | cuda_id_str = self.dist_store.get('group-'+comm_name+'-unique-id') 44 | 45 | comm_id = tuple(np.frombuffer(cuda_id_str, dtype=int)) 46 | # comm_id = cupy.cuda.nccl.get_unique_id() 47 | # print(comm_id) 48 | self.comm = cupy.cuda.nccl.NcclCommunicator(comm_group_size, comm_id, comm_rank) 49 | 50 | @staticmethod 51 | def barrier(): 52 | dist.barrier() 53 | 54 | def store_set(self, key, value): 55 | self.dist_store.set(key, value) 56 | 57 | def store_get(self, key): 58 | return self.dist_store.get(key) 59 | 60 | def send(self, 61 | tensor: torch.Tensor, 62 | dst: int, 63 | stream=cupy.cuda.Stream.null): 64 | # print("Send tensor of size:", torch.numel(tensor)) 65 | self.comm.send( 66 | tensor.data_ptr(), 67 | torch.numel(tensor), 68 | _type_torch_to_cupy(tensor.dtype), 69 | dst, 70 | stream.ptr 71 | ) 72 | 73 | def recv(self, 74 | tensor: torch.Tensor, 75 | src: int, 76 | stream=cupy.cuda.Stream.null): 77 | # print("Recv tensor of size:", torch.numel(tensor)) 78 | # print("mean:", torch.mean(tensor).item(), " std:", torch.std(tensor).item()) 79 | self.comm.recv( 80 | tensor.data_ptr(), 81 | torch.numel(tensor), 82 | _type_torch_to_cupy(tensor.dtype), 83 | src, 84 | stream.ptr 85 | ) 86 | 87 | def broadcast(self, 88 | tensor: torch.Tensor, 89 | src: int, 90 | stream=cupy.cuda.Stream.null): 91 | self.comm.bcast( 92 | tensor.data_ptr(), 93 | torch.numel(tensor), 94 | _type_torch_to_cupy(tensor.dtype), 95 | src, 96 | stream.ptr 97 | ) 98 | 99 | def reduce(self, 100 | tensor: torch.Tensor, 101 | dst: int, 102 | stream=cupy.cuda.Stream.null, 103 | op=cupy.cuda.nccl.NCCL_SUM): 104 | self.comm.reduce( 105 | tensor.data_ptr(), # force it to be in-place. 
106 | tensor.data_ptr(), 107 | torch.numel(tensor), 108 | _type_torch_to_cupy(tensor.dtype), 109 | op, 110 | dst, 111 | stream.ptr 112 | ) 113 | 114 | def all_reduce(self, 115 | tensor: torch.Tensor, 116 | stream=cupy.cuda.Stream.null, 117 | op=cupy.cuda.nccl.NCCL_SUM): 118 | self.comm.allReduce( 119 | tensor.data_ptr(), 120 | tensor.data_ptr(), 121 | torch.numel(tensor), 122 | _type_torch_to_cupy(tensor.dtype), 123 | op, 124 | stream.ptr 125 | ) 126 | 127 | def scatter(self, 128 | tensor: torch.Tensor, 129 | scatter_list: List[torch.Tensor], 130 | src: int, 131 | stream=cupy.cuda.Stream.null): 132 | cupy.cuda.nccl.groupStart() 133 | if self.comm_rank == src: 134 | for i in range(self.comm_group_size): 135 | self.send( 136 | scatter_list[i], 137 | i, 138 | stream 139 | ) 140 | self.recv( 141 | tensor, 142 | src, 143 | stream 144 | ) 145 | cupy.cuda.nccl.groupEnd() 146 | 147 | def gather(self, 148 | tensor: torch.Tensor, 149 | gather_list: List[torch.Tensor], 150 | dst: int, 151 | stream=cupy.cuda.Stream.null): 152 | cupy.cuda.nccl.groupStart() 153 | if self.comm_rank == dst: 154 | for i in range(self.comm_group_size): 155 | self.recv( 156 | gather_list[i], 157 | i, 158 | stream 159 | ) 160 | self.send( 161 | tensor, 162 | dst, 163 | stream 164 | ) 165 | cupy.cuda.nccl.groupEnd() 166 | 167 | def all_to_all(self, 168 | output_tensor_list: List[torch.Tensor], 169 | input_tensor_list: List[torch.Tensor], 170 | stream=cupy.cuda.Stream.null): 171 | assert len(output_tensor_list) == self.comm_group_size and len(input_tensor_list) == self.comm_group_size 172 | cupy.cuda.nccl.groupStart() 173 | for i in range(self.comm_group_size): 174 | self.send(input_tensor_list[i], i, stream) 175 | self.recv(output_tensor_list[i], i, stream) 176 | cupy.cuda.nccl.groupEnd() 177 | 178 | def all_gather(self, 179 | tensor: torch.Tensor, 180 | output_tensor_list: List[torch.Tensor], 181 | stream=cupy.cuda.Stream.null 182 | ): 183 | assert len(output_tensor_list) == self.comm_group_size 184 | cupy.cuda.nccl.groupStart() 185 | for i in range(self.comm_group_size): 186 | self.send(tensor, i, stream) 187 | self.recv(output_tensor_list[i], i, stream) 188 | cupy.cuda.nccl.groupEnd() 189 | 190 | def all_reduce_opt(self, 191 | tensor: torch.Tensor, 192 | buffer: List[torch.Tensor], 193 | stream=cupy.cuda.Stream.null, 194 | caller=None): 195 | # First do all-to-all 196 | assert torch.numel(tensor.data) % self.comm_group_size == 0 197 | chunk_size = torch.numel(tensor.data) // self.comm_group_size 198 | t_type = _type_torch_to_cupy(tensor.dtype) 199 | element_size = tensor.data.element_size() 200 | 201 | cupy.cuda.nccl.groupStart() 202 | for i in range(self.comm_group_size): 203 | self.comm.send(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) 204 | self.comm.recv(buffer[i].data_ptr(), chunk_size, t_type, i, stream.ptr) 205 | cupy.cuda.nccl.groupEnd() 206 | 207 | for i in range(1, self.comm_group_size): 208 | buffer[0] += buffer[i] 209 | 210 | cupy.cuda.nccl.groupStart() 211 | for i in range(self.comm_group_size): 212 | self.comm.send(buffer[0].data_ptr(), chunk_size, t_type, i, stream.ptr) 213 | self.comm.recv(tensor.data_ptr()+i*chunk_size*element_size, chunk_size, t_type, i, stream.ptr) 214 | cupy.cuda.nccl.groupEnd() 215 | 216 | -------------------------------------------------------------------------------- /training/comm/torch_backend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from typing 
import List 4 | 5 | class TorchCommunicator: 6 | 7 | def __init__(self, 8 | process_group, 9 | to_global_rank=lambda rank: rank, 10 | dp_rank=None, 11 | comm_group_size=None,): 12 | self.process_group = process_group 13 | self.to_global_rank = to_global_rank 14 | self.dp_rank = dp_rank 15 | self.comm_group_size = comm_group_size 16 | 17 | # @staticmethod 18 | def barrier(self): 19 | dist.barrier(group=self.process_group) 20 | 21 | def send(self, 22 | tensor: torch.Tensor, 23 | dst: int, 24 | stream=None): 25 | # print("Send tensor of size:", torch.numel(tensor)) 26 | if tensor.device == torch.device('cpu'): 27 | dist.send(tensor, self.to_global_rank(dst), group=self.process_group) 28 | else: 29 | dist.send(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) 30 | 31 | def recv(self, 32 | tensor: torch.Tensor, 33 | src: int, 34 | stream=None): 35 | 36 | if tensor.device == torch.device('cpu'): 37 | dist.recv(tensor, self.to_global_rank(src), group=self.process_group) 38 | else: 39 | buffer = tensor.cpu() 40 | dist.recv(buffer, self.to_global_rank(src), group=self.process_group) 41 | tensor[:] = buffer.to(tensor.device) 42 | 43 | def isend(self, 44 | tensor: torch.Tensor, 45 | dst: int, 46 | stream=None): 47 | # print("Send tensor of size:", torch.numel(tensor)) 48 | if tensor.device == torch.device('cpu'): 49 | handler = dist.isend(tensor, self.to_global_rank(dst), group=self.process_group) 50 | else: 51 | handler = dist.isend(tensor.cpu(), self.to_global_rank(dst), group=self.process_group) 52 | return handler 53 | 54 | def irecv(self, 55 | tensor: torch.Tensor, 56 | src: int, 57 | stream=None): 58 | if tensor.device == torch.device('cpu'): 59 | handler = dist.irecv(tensor, self.to_global_rank(src), group=self.process_group) 60 | else: 61 | assert False 62 | buffer = tensor.cpu() 63 | handler = dist.irecv(buffer, self.to_global_rank(src), group=self.process_group) 64 | tensor[:] = buffer.to(tensor.device) 65 | return handler 66 | 67 | def broadcast(self, 68 | tensor: torch.Tensor, 69 | src: int, 70 | stream=None): 71 | if tensor.device == torch.device('cpu'): 72 | dist.broadcast(tensor, self.to_global_rank(src), group=self.process_group) 73 | else: 74 | buffer = tensor.cpu() 75 | dist.broadcast(buffer, self.to_global_rank(src), group=self.process_group) 76 | tensor[:] = buffer.to(tensor.device) 77 | 78 | def reduce(self, 79 | tensor: torch.Tensor, 80 | dst: int, 81 | stream=None, 82 | op=dist.ReduceOp.SUM): 83 | dist.reduce(tensor, self.to_global_rank(dst), group=self.process_group, op=op) 84 | 85 | def all_reduce(self, 86 | tensor: torch.Tensor, 87 | stream = None, 88 | op=dist.ReduceOp.SUM): 89 | buffer = tensor.cpu() 90 | dist.all_reduce(buffer, group=self.process_group, op=op) 91 | tensor[:] = buffer.to(tensor.device) 92 | 93 | def gather(self, 94 | tensor: torch.Tensor, 95 | gather_list: List[torch.Tensor], 96 | dst: int, 97 | stream=None): 98 | dist.gather(tensor, gather_list, self.to_global_rank(dst), group=self.process_group) 99 | 100 | def all_to_all(self, 101 | output_tensor_list: List[torch.Tensor], 102 | input_tensor_list: List[torch.Tensor], 103 | stream=None): 104 | dist.all_to_all(output_tensor_list, input_tensor_list, group=self.process_group) 105 | 106 | def all_gather(self, 107 | tensor: torch.Tensor, 108 | output_tensor_list: List[torch.Tensor], 109 | stream=None): 110 | dist.all_gather(output_tensor_list, tensor, group=self.process_group) 111 | 112 | -------------------------------------------------------------------------------- 
/training/data_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/data_parallel/__init__.py -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | from comm.comm_utils import * 3 | from .flatten_utils import flatten_params 4 | 5 | 6 | class AllReduceDP: 7 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 8 | self.flatten = flatten 9 | self.global_rank = args.rank 10 | self.dp_group_size = args.data_group_size 11 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 12 | self.dp_comm = get_data_parallel_comm() 13 | self.dp_rank = get_data_parallel_rank() 14 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 15 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 16 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 17 | self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 18 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 19 | 20 | self.module = module 21 | num_paras, element_size = self._compute_total_para_num() 22 | print("Total number of parameters: {}, element size: {}, total size {} MB." 23 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 24 | 25 | if self.flatten: 26 | self.flatten_para = flatten_params(self.module.parameters()) 27 | print("Flattened parameter number: {}, element size: {}." 28 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 29 | print("Flattened parameter grad number: {}, element size: {}." 
30 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 31 | 32 | assert optimizer is not None 33 | self.optimizer = optimizer 34 | 35 | if self.enable_tidy_profiling: 36 | self.global_rank = args.rank 37 | self.init_event = None 38 | self.init_time_stamp = None 39 | if self.flatten: 40 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 41 | else: 42 | self.allreduce_gradients_start_events = dict() 43 | self.allreduce_gradients_end_events = dict() 44 | for name, _ in self.module.named_parameters(): 45 | self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 46 | self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 47 | 48 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 49 | blocking=False) 50 | 51 | def _compute_total_para_num(self): 52 | total_count = 0 53 | element_size = 0 54 | for para in self.module.parameters(): 55 | # print("Parameter: ", para.data.shape) 56 | total_count += torch.numel(para.data) 57 | element_size = para.element_size() 58 | return total_count, element_size 59 | 60 | def profile_mark_allreduce_start(self, name=None): 61 | if self.enable_tidy_profiling: 62 | if name is None: 63 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) 64 | else: 65 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) 66 | 67 | def profile_mark_allreduce_end(self, name=None): 68 | if self.enable_tidy_profiling: 69 | if name: 70 | self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) 71 | 72 | def profile_mark_optimizer_step_start(self): 73 | if self.enable_tidy_profiling: 74 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 75 | 76 | def _allreduce_gradients(self): 77 | with torch.cuda.stream(self.dp_comm_stream): 78 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 79 | self.dp_comm_stream.wait_event(self.backward_ready_event) 80 | if self.flatten: 81 | self.profile_mark_allreduce_start() 82 | self.dp_comm.all_reduce(self.flatten_para.grad, stream=cupy_dp_stream) 83 | self.profile_mark_allreduce_end() 84 | else: 85 | for name, para in self.module.named_parameters(): 86 | if para.grad is None: 87 | continue 88 | self.profile_mark_allreduce_start(name) 89 | self.dp_comm.all_reduce(para.grad, stream=cupy_dp_stream) 90 | self.profile_mark_allreduce_end(name) 91 | self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) 92 | 93 | def optimizer_step(self): 94 | self._allreduce_gradients() 95 | with torch.cuda.stream(self.torch_optim_comp_stream): 96 | self.torch_optim_comp_stream.wait_event(self.allreduce_grad_ready_event) 97 | self.profile_mark_optimizer_step_start() 98 | self.optimizer.step() 99 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 100 | 101 | def set_time_stamp(self, init_time_stamp, init_event): 102 | self.init_event = init_event 103 | self.init_time_stamp = init_time_stamp 104 | 105 | def get_ts(self, event): 106 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 107 | 108 | def profiling_data_parallel(self, init_time_stamp, init_event): 109 | self.set_time_stamp(init_time_stamp, init_event) 110 | profiling_log = [] 111 | 112 | if self.flatten: 113 | allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 114 | allreduce_log = {"name": 
"opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 115 | "ts": self.get_ts(self.allreduce_gradients_start_event), 116 | "dur": allreduce_slot, "cname": "cq_build_passed", 117 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 118 | # print(allreduce_log) 119 | profiling_log.append(allreduce_log) 120 | else: 121 | for name, para in self.module.named_parameters(): 122 | allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( 123 | self.allreduce_gradients_end_events[name]) * 1e+3 124 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 125 | "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, 126 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 127 | # print(allreduce_log) 128 | profiling_log.append(allreduce_log) 129 | 130 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 131 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", 132 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 133 | # print(optimizer_log) 134 | profiling_log.append(optimizer_log) 135 | return profiling_log 136 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_central_ps.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | from comm.comm_utils import * 3 | from .flatten_utils import flatten_params 4 | 5 | 6 | class CentralPSDP: 7 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 8 | self.flatten = flatten 9 | self.global_rank = args.rank 10 | self.dp_group_size = args.data_group_size 11 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 12 | self.dp_comm = get_data_parallel_comm() 13 | self.dp_rank = get_data_parallel_rank() 14 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 15 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 16 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 17 | self.broadcast_reduced_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 18 | blocking=False) 19 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | 21 | self.module = module 22 | num_paras, element_size = self._compute_total_para_num() 23 | print("Total number of parameters: {}, element size: {}, total size {} MB." 24 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 25 | 26 | if self.flatten: 27 | self.flatten_para = flatten_params(self.module.parameters()) 28 | print("Flattened parameter number: {}, element size: {}." 29 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 30 | print("Flattened parameter grad number: {}, element size: {}." 
31 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 32 | 33 | assert optimizer is not None 34 | self.optimizer = optimizer 35 | 36 | if self.enable_tidy_profiling: 37 | self.global_rank = args.rank 38 | self.init_event = None 39 | self.init_time_stamp = None 40 | if self.flatten: 41 | self.reduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 42 | self.reduce_gradients_end_event = torch.cuda.Event(enable_timing=True, blocking=False) 43 | self.broadcast_reduced_grad_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 44 | else: 45 | self.reduce_gradients_start_events = dict() 46 | self.reduce_gradients_end_events = dict() 47 | self.broadcast_reduced_grad_start_events = dict() 48 | self.broadcast_reduced_grad_end_events = dict() 49 | 50 | for name, _ in self.module.named_parameters(): 51 | self.reduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 52 | self.reduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 53 | self.broadcast_reduced_grad_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 54 | self.broadcast_reduced_grad_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 55 | 56 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 57 | 58 | def _compute_total_para_num(self): 59 | total_count = 0 60 | element_size = 0 61 | for para in self.module.parameters(): 62 | # print("Parameter: ", para.data.shape) 63 | total_count += torch.numel(para.data) 64 | element_size = para.element_size() 65 | return total_count, element_size 66 | 67 | def profile_mark_reduce_start(self, name=None): 68 | if self.enable_tidy_profiling: 69 | if name is None: 70 | self.dp_comm_stream.record_event(self.reduce_gradients_start_event) 71 | else: 72 | self.dp_comm_stream.record_event(self.reduce_gradients_start_events[name]) 73 | 74 | def profile_mark_reduce_end(self, name=None): 75 | if self.enable_tidy_profiling: 76 | if name is None: 77 | self.dp_comm_stream.record_event(self.reduce_gradients_end_event) 78 | else: 79 | self.dp_comm_stream.record_event(self.reduce_gradients_end_events[name]) 80 | 81 | def profile_mark_optimizer_step_start(self): 82 | if self.enable_tidy_profiling: 83 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 84 | 85 | def profile_mark_broadcast_start(self, name=None): 86 | if self.enable_tidy_profiling: 87 | if name is None: 88 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_event) 89 | else: 90 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_start_events[name]) 91 | 92 | def profile_mark_broadcast_end(self, name=None): 93 | if self.enable_tidy_profiling: 94 | if name: 95 | self.dp_comm_stream.record_event(self.broadcast_reduced_grad_end_events[name]) 96 | 97 | def _reduce_gradients(self): 98 | with torch.cuda.stream(self.dp_comm_stream): 99 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 100 | self.dp_comm_stream.wait_event(self.backward_ready_event) 101 | if self.flatten: 102 | self.profile_mark_reduce_start() 103 | self.dp_comm.reduce(self.flatten_para.grad, dst=0, stream=cupy_dp_stream) 104 | self.profile_mark_reduce_end() 105 | else: 106 | for name, para in self.module.named_parameters(): 107 | self.profile_mark_reduce_start(name) 108 | self.dp_comm.reduce(para.grad, dst=0, stream=cupy_dp_stream) 109 | self.profile_mark_reduce_end(name) 110 | 111 | def 
_broadcast_reduced_gradients(self): 112 | with torch.cuda.stream(self.dp_comm_stream): 113 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 114 | if self.flatten: 115 | self.profile_mark_broadcast_start() 116 | self.dp_comm.broadcast(self.flatten_para.grad, src=0, stream=cupy_dp_stream) 117 | self.profile_mark_broadcast_end() 118 | else: 119 | for name, para in self.module.named_parameters(): 120 | self.profile_mark_broadcast_start(name) 121 | self.dp_comm.broadcast(para.grad, src=0, stream=cupy_dp_stream) 122 | self.profile_mark_broadcast_end(name) 123 | self.dp_comm_stream.record_event(self.broadcast_reduced_gradients_ready_event) 124 | 125 | def optimizer_step(self): 126 | self._reduce_gradients() 127 | self._broadcast_reduced_gradients() 128 | with torch.cuda.stream(self.torch_optim_comp_stream): 129 | self.torch_optim_comp_stream.wait_event(self.broadcast_reduced_gradients_ready_event) 130 | self.profile_mark_optimizer_step_start() 131 | self.optimizer.step() 132 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 133 | 134 | def set_time_stamp(self, init_time_stamp, init_event): 135 | self.init_event = init_event 136 | self.init_time_stamp = init_time_stamp 137 | 138 | def get_ts(self, event): 139 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 140 | 141 | def profiling_data_parallel(self, init_time_stamp, init_event): 142 | self.set_time_stamp(init_time_stamp, init_event) 143 | profiling_log = [] 144 | if self.flatten: 145 | reduce_slot = self.reduce_gradients_start_event.elapsed_time(self.reduce_gradients_end_event) * 1e+3 146 | reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 147 | "ts": self.get_ts(self.reduce_gradients_start_event), 148 | "dur": reduce_slot, "cname": "cq_build_passed", 149 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 150 | # print(reduce_log) 151 | profiling_log.append(reduce_log) 152 | else: 153 | for name, para in self.module.named_parameters(): 154 | reduce_slot = self.reduce_gradients_start_events[name].elapsed_time( 155 | self.reduce_gradients_end_events[name]) * 1e+3 156 | reduce_log = {"name": "opt_reduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 157 | "ts": self.get_ts(self.reduce_gradients_start_events[name]), "dur": reduce_slot, 158 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 159 | # print(reduce_log) 160 | profiling_log.append(reduce_log) 161 | 162 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 163 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. optimizer-comp", 164 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 165 | # print(optimizer_log) 166 | profiling_log.append(optimizer_log) 167 | 168 | if self.flatten: 169 | broadcast_slot = self.broadcast_reduced_grad_start_event.elapsed_time( 170 | self.broadcast_reduced_gradients_ready_event) * 1e+3 171 | broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. 
optimizer-comm", 172 | "ts": self.get_ts(self.broadcast_reduced_grad_start_event), 173 | "dur": broadcast_slot, "cname": "cq_build_passed", 174 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 175 | profiling_log.append(broadcast_log) 176 | else: 177 | for name, para in self.module.named_parameters(): 178 | broadcast_slot = self.broadcast_reduced_grad_start_events[name].elapsed_time( 179 | self.broadcast_reduced_grad_end_events[name]) * 1e+3 180 | broadcast_log = {"name": "opt_broadcast", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 181 | "ts": self.get_ts(self.broadcast_reduced_grad_start_events[name]), "dur": broadcast_slot, 182 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 183 | # print(broadcast_log) 184 | profiling_log.append(broadcast_log) 185 | return profiling_log 186 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_local.py: -------------------------------------------------------------------------------- 1 | import torch.cuda 2 | import cupy 3 | from comm.comm_utils import * 4 | from .flatten_utils import flatten_params 5 | 6 | 7 | class LocalDP: 8 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 9 | flatten = True 10 | self.flatten = flatten 11 | self.global_rank = args.rank 12 | self.dp_group_size = args.data_group_size 13 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 14 | self.dp_comm = get_data_parallel_comm() 15 | self.dp_rank = get_data_parallel_rank() 16 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 17 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 18 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 19 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | self.allreduce_grad_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 21 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 22 | 23 | self.module = module 24 | num_paras, element_size = self._compute_total_para_num() 25 | print("Total number of parameters: {}, element size: {}, total size {} MB." 26 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 27 | 28 | if self.flatten: 29 | self.flatten_para = flatten_params(self.module.parameters()) 30 | print("Flattened parameter number: {}, element size: {}." 31 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 32 | print("Flattened parameter grad number: {}, element size: {}." 
33 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 34 | 35 | assert optimizer is not None 36 | self.optimizer = optimizer 37 | 38 | if self.enable_tidy_profiling: 39 | self.global_rank = args.rank 40 | self.init_event = None 41 | self.init_time_stamp = None 42 | if self.flatten: 43 | self.allreduce_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 44 | else: 45 | self.allreduce_gradients_start_events = dict() 46 | self.allreduce_gradients_end_events = dict() 47 | for name, _ in self.module.named_parameters(): 48 | self.allreduce_gradients_start_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 49 | self.allreduce_gradients_end_events[name] = torch.cuda.Event(enable_timing=True, blocking=False) 50 | 51 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, 52 | blocking=False) 53 | 54 | def _compute_total_para_num(self): 55 | total_count = 0 56 | element_size = 0 57 | for para in self.module.parameters(): 58 | # print("Parameter: ", para.data.shape) 59 | total_count += torch.numel(para.data) 60 | element_size = para.element_size() 61 | return total_count, element_size 62 | 63 | def profile_mark_allreduce_start(self, name=None): 64 | if self.enable_tidy_profiling: 65 | if name is None: 66 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_event) 67 | else: 68 | self.dp_comm_stream.record_event(self.allreduce_gradients_start_events[name]) 69 | 70 | def profile_mark_allreduce_end(self, name=None): 71 | if self.enable_tidy_profiling: 72 | if name: 73 | self.dp_comm_stream.record_event(self.allreduce_gradients_end_events[name]) 74 | 75 | def profile_mark_optimizer_step_start(self): 76 | if self.enable_tidy_profiling: 77 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 78 | 79 | def allreduce_parameters(self): 80 | self._local_parameters_backup = [ 81 | p.data.clone() for p in self.module.parameters() 82 | ] 83 | torch.cuda.synchronize() 84 | self.dp_comm.barrier() 85 | with torch.cuda.stream(self.dp_comm_stream): 86 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 87 | self.dp_comm_stream.wait_event(self.backward_ready_event) 88 | if self.flatten: 89 | self.profile_mark_allreduce_start() 90 | self.dp_comm.all_reduce(self.flatten_para.data, stream=cupy_dp_stream) 91 | self.flatten_para.data /= self.dp_group_size 92 | self.profile_mark_allreduce_end() 93 | else: 94 | for name, para in self.module.named_parameters(): 95 | self.profile_mark_allreduce_start(name) 96 | self.dp_comm.all_reduce(para.data, stream=cupy_dp_stream) 97 | para.data /= self.dp_group_size 98 | self.profile_mark_allreduce_end(name) 99 | self.dp_comm_stream.record_event(self.allreduce_grad_ready_event) 100 | torch.cuda.synchronize() 101 | self.dp_comm.barrier() 102 | 103 | def rollback_parameters(self): 104 | if not hasattr(self, '_local_parameters_backup'): 105 | return 106 | 107 | for p, p_local in zip(self.module.parameters(), self._local_parameters_backup): 108 | p.data[:] = p_local.data 109 | 110 | del self._local_parameters_backup 111 | 112 | 113 | def optimizer_step(self): 114 | # torch.cuda.synchronize() 115 | with torch.cuda.stream(self.torch_optim_comp_stream): 116 | self.torch_optim_comp_stream.record_event(self.allreduce_gradients_start_event) 117 | self.torch_optim_comp_stream.record_event(self.allreduce_grad_ready_event) 118 | self.torch_optim_comp_stream.wait_event(self.backward_ready_event) 119 | 
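            # Note: LocalDP's optimizer_step() applies a purely local update -- no
            # gradient communication happens here. Parameters are only averaged across
            # data-parallel ranks when allreduce_parameters() is called explicitly, and
            # rollback_parameters() can restore the pre-averaging weights afterwards.
            # The two record_event calls above appear to exist only so the tidy-profiling
            # timeline stays well-defined when no gradient all-reduce is performed.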
self.profile_mark_optimizer_step_start() 120 | self.optimizer.step() 121 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 122 | 123 | def set_time_stamp(self, init_time_stamp, init_event): 124 | self.init_event = init_event 125 | self.init_time_stamp = init_time_stamp 126 | 127 | def get_ts(self, event): 128 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 129 | 130 | def profiling_data_parallel(self, init_time_stamp, init_event): 131 | self.set_time_stamp(init_time_stamp, init_event) 132 | profiling_log = [] 133 | 134 | if self.flatten: 135 | allreduce_slot = self.allreduce_gradients_start_event.elapsed_time(self.allreduce_grad_ready_event)*1e+3 136 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 137 | "ts": self.get_ts(self.allreduce_gradients_start_event), 138 | "dur": allreduce_slot, "cname": "cq_build_passed", 139 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 140 | # print(allreduce_log) 141 | profiling_log.append(allreduce_log) 142 | else: 143 | for name, para in self.module.named_parameters(): 144 | allreduce_slot = self.allreduce_gradients_start_events[name].elapsed_time( 145 | self.allreduce_gradients_end_events[name]) * 1e+3 146 | allreduce_log = {"name": "opt_allreduce", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 147 | "ts": self.get_ts(self.allreduce_gradients_start_events[name]), "dur": allreduce_slot, 148 | "cname": "cq_build_passed", "args": {'para': name, 'size': torch.numel(para.data)}} 149 | # print(allreduce_log) 150 | profiling_log.append(allreduce_log) 151 | 152 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 153 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. 
optimizer-comp", 154 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 155 | # print(optimizer_log) 156 | profiling_log.append(optimizer_log) 157 | return profiling_log 158 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_sharded_ps.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.cuda 5 | from comm.comm_utils import * 6 | from .flatten_utils import flatten_params 7 | 8 | 9 | class ShardedPSDP: 10 | def __init__(self, args, device, module: torch.nn.Module, optimizer: torch.optim.Optimizer = None, flatten=True): 11 | self.flatten = flatten 12 | self.global_rank = args.rank 13 | self.dp_group_size = args.data_group_size 14 | self.enable_tidy_profiling = (args.profiling == 'tidy_profiling') 15 | self.dp_comm = get_data_parallel_comm() 16 | self.dp_rank = get_data_parallel_rank() 17 | self.dp_comm_stream = torch.cuda.Stream(device=device, priority=-1) 18 | self.torch_optim_comp_stream = torch.cuda.default_stream(device=device) 19 | self.backward_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 20 | self.sync_gradients_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 21 | self.optimizer_step_ready_event = torch.cuda.Event(enable_timing=self.enable_tidy_profiling, blocking=False) 22 | 23 | self.module = module 24 | assert optimizer is not None 25 | self.optimizer = optimizer 26 | num_paras, element_size = self._compute_total_para_num() 27 | print("Total number of parameters: {}, element size: {}, total size {} MB." 28 | .format(num_paras, element_size, num_paras * element_size // 1024 // 1024)) 29 | 30 | assert self.flatten 31 | # self.para = list(self.module.parameters()) 32 | self.flatten_para = flatten_params(self.module.parameters(), self.dp_group_size) 33 | print("Flattened parameter number: {}, element size: {}." 34 | .format(self.flatten_para.data.numel(), self.flatten_para.data.element_size())) 35 | print("Flattened parameter grad number: {}, element size: {}." 
36 | .format(self.flatten_para.grad.numel(), self.flatten_para.grad.element_size())) 37 | 38 | self.grad_buffer = self._declare_grad_buffer() 39 | 40 | if self.enable_tidy_profiling: 41 | self.global_rank = args.rank 42 | self.init_event = None 43 | self.init_time_stamp = None 44 | 45 | assert self.flatten 46 | self.sync_gradients_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 47 | 48 | self.optimizer_step_start_event = torch.cuda.Event(enable_timing=True, blocking=False) 49 | 50 | def _compute_total_para_num(self): 51 | total_count = 0 52 | element_size = 0 53 | for para in self.module.parameters(): 54 | # print("Parameter: ", para.data.shape) 55 | total_count += torch.numel(para.data) 56 | element_size = para.element_size() 57 | return total_count, element_size 58 | 59 | def _declare_grad_buffer(self): 60 | assert self.flatten_para.data.numel() % self.dp_group_size == 0 61 | chunk_size = self.flatten_para.data.numel() // self.dp_group_size 62 | grad_buffer = [torch.zeros(chunk_size, device=self.flatten_para.device, dtype=self.flatten_para.dtype) 63 | for _ in range(self.dp_group_size)] 64 | return grad_buffer 65 | 66 | def profile_mark_sync_grad_start(self): 67 | if self.enable_tidy_profiling: 68 | self.dp_comm_stream.record_event(self.sync_gradients_start_event) 69 | 70 | def profile_mark_allreduce_end(self): 71 | pass 72 | 73 | def profile_mark_optimizer_step_start(self): 74 | if self.enable_tidy_profiling: 75 | self.torch_optim_comp_stream.record_event(self.optimizer_step_start_event) 76 | 77 | def _sync_gradients(self): 78 | with torch.cuda.stream(self.dp_comm_stream): 79 | cupy_dp_stream = cupy.cuda.ExternalStream(self.dp_comm_stream.cuda_stream) 80 | self.dp_comm_stream.wait_event(self.backward_ready_event) 81 | assert self.flatten 82 | self.profile_mark_sync_grad_start() 83 | self.dp_comm.all_reduce_opt(self.flatten_para.grad, self.grad_buffer, stream=cupy_dp_stream) 84 | self.profile_mark_allreduce_end() 85 | self.dp_comm_stream.record_event(self.sync_gradients_ready_event) 86 | 87 | def optimizer_step(self): 88 | self._sync_gradients() 89 | with torch.cuda.stream(self.torch_optim_comp_stream): 90 | self.torch_optim_comp_stream.wait_event(self.sync_gradients_ready_event) 91 | self.profile_mark_optimizer_step_start() 92 | self.optimizer.step() 93 | self.torch_optim_comp_stream.record_event(self.optimizer_step_ready_event) 94 | 95 | def set_time_stamp(self, init_time_stamp, init_event): 96 | self.init_event = init_event 97 | self.init_time_stamp = init_time_stamp 98 | 99 | def get_ts(self, event): 100 | return self.init_time_stamp + self.init_event.elapsed_time(event) * 1e+3 101 | 102 | def profiling_data_parallel(self, init_time_stamp, init_event): 103 | self.set_time_stamp(init_time_stamp, init_event) 104 | profiling_log = [] 105 | 106 | assert self.flatten 107 | allreduce_slot = self.sync_gradients_start_event.elapsed_time(self.sync_gradients_ready_event)*1e+3 108 | allreduce_log = {"name": "opt_shardedPS_sync", "ph": "X", "pid": self.global_rank, "tid": "7. optimizer-comm", 109 | "ts": self.get_ts(self.sync_gradients_start_event), 110 | "dur": allreduce_slot, "cname": "cq_build_passed", 111 | "args": {'para': 'flattened_grad', 'size': self.flatten_para.grad.numel()}} 112 | # print(allreduce_log) 113 | profiling_log.append(allreduce_log) 114 | 115 | optimizer_slot = self.optimizer_step_start_event.elapsed_time(self.optimizer_step_ready_event) * 1e+3 116 | optimizer_log = {"name": "opt_comp", "ph": "X", "pid": self.global_rank, "tid": "8. 
optimizer-comp", 117 | "ts": self.get_ts(self.optimizer_step_start_event), "dur": optimizer_slot, "cname": "bad"} 118 | # print(optimizer_log) 119 | profiling_log.append(optimizer_log) 120 | return profiling_log 121 | -------------------------------------------------------------------------------- /training/data_parallel/dist_dp_utils.py: -------------------------------------------------------------------------------- 1 | from .dist_dp_allreduce import AllReduceDP 2 | from .dist_dp_sharded_ps import ShardedPSDP 3 | from .dist_dp_local import LocalDP 4 | 5 | 6 | def get_dp_module(args, device, module, optimizer): 7 | print("Data parallel implementation: ", args.dp_mode) 8 | if args.dp_mode == 'allreduce': 9 | return AllReduceDP(args, device, module, optimizer, flatten=False) 10 | # flatten gradient is not compatible with fp16 now 11 | elif args.dp_mode == 'local': 12 | return LocalDP(args, device, module, optimizer, flatten=False) 13 | elif args.dp_mode == 'sharded_ps': 14 | return ShardedPSDP(args, device, module, optimizer, flatten=False) 15 | else: 16 | print("Not recognize this data parallel mode.") 17 | assert False 18 | -------------------------------------------------------------------------------- /training/data_parallel/flatten_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _assert_contiguous(tensors): 5 | data_ptr = None 6 | for t in tensors: 7 | if data_ptr is not None: 8 | assert t.data_ptr() == data_ptr 9 | data_ptr = t.data_ptr() + t.numel() * t.element_size() 10 | 11 | 12 | def flatten_params(param_set, chunk=None): 13 | params = [p for p in param_set] 14 | weights = [p.data for p in params] 15 | grads = [p.grad.data if p.grad is not None else torch.zeros_like(p.data) for p in params] 16 | sizes = [p.numel() for p in params] 17 | total_size = sum(sizes) 18 | if chunk: 19 | total_size = ((total_size+chunk-1)//chunk)*chunk 20 | 21 | flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 22 | flatten_grads_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 23 | flatten_weights_storage = flatten_weights_tensor.storage() 24 | flatten_grads_storage = flatten_grads_tensor.storage() 25 | 26 | def set_storage(param, weight_storage, grad_storage, storage_offset): 27 | with torch.no_grad(): 28 | z = torch.zeros_like(param.data) 29 | z.set_(weight_storage, storage_offset, param.shape) 30 | param.data = z 31 | 32 | t = torch.zeros_like(param.data) 33 | t.set_(grad_storage, storage_offset, param.shape) 34 | param.grad = t 35 | 36 | offset = 0 37 | for i in range(len(params)): 38 | flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) 39 | flatten_grads_tensor[offset: offset + grads[i].numel()] = grads[i].reshape(-1) 40 | set_storage(params[i], flatten_weights_storage, flatten_grads_storage, offset) 41 | offset += sizes[i] 42 | 43 | weight_tensors = [p.data for p in params] 44 | grad_tensors = [p.grad.data for p in params] 45 | 46 | _assert_contiguous(weight_tensors) 47 | _assert_contiguous(grad_tensors) 48 | 49 | with torch.no_grad(): 50 | flatten_para = torch.nn.Parameter(flatten_weights_tensor, requires_grad=False) 51 | flatten_para.grad = flatten_grads_tensor 52 | return flatten_para 53 | 54 | 55 | def flatten_tensors(tensor_set, chunk=None): 56 | tensors = [p for p in tensor_set] 57 | weights = [p.data for p in tensors] 58 | sizes = [p.numel() for p in tensors] 59 | total_size = sum(sizes) 60 | if chunk: 61 | 
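        # Round the flattened size up to a multiple of `chunk` (ceil division) so the
        # buffer can be split into equal shards -- e.g. total_size=10, chunk=4 pads to 12.
        # flatten_params above applies the same padding; ShardedPSDP relies on it by
        # passing chunk=dp_group_size and asserting numel() % dp_group_size == 0.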
total_size = ((total_size+chunk-1)//chunk)*chunk 62 | 63 | flatten_weights_tensor = torch.zeros(total_size, dtype=weights[0].dtype).to(weights[0].device) 64 | flatten_weights_storage = flatten_weights_tensor.storage() 65 | 66 | def set_storage(param, weight_storage, storage_offset): 67 | with torch.no_grad(): 68 | z = torch.zeros_like(param.data) 69 | z.set_(weight_storage, storage_offset, param.shape) 70 | param.data = z 71 | 72 | offset = 0 73 | for i in range(len(tensors)): 74 | flatten_weights_tensor[offset: offset + weights[i].numel()] = weights[i].reshape(-1) 75 | set_storage(tensors[i], flatten_weights_storage, offset) 76 | offset += sizes[i] 77 | 78 | return flatten_weights_tensor 79 | -------------------------------------------------------------------------------- /training/finetune_GPT-NeoXT-Chat-Base-20B.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=GPT-Neo-XT-Chat-Base-20B 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="\ 17 | ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ 18 | ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ 19 | ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ 20 | ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ 21 | ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ 22 | ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ 23 | ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ 24 | ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ 25 | ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ 26 | ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ 27 | ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ 28 | ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ 29 | ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ 30 | ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ 31 | ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ 32 | ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ 33 | ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ 34 | ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ 35 | ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ 36 | ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ 37 | ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ 38 | ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ 39 | ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ 40 | ${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ 41 | ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ 42 | ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ 43 | " 44 | 45 | ARGS="--model-name ${BASE_MODEL} \ 46 | --tokenizer-name ${BASE_MODEL} \ 47 | --project-name together \ 48 | --model-type gptneox \ 49 | --optimizer adam \ 50 | --seed 42 \ 51 | --load-pretrained-model true \ 52 | --task-name \ 53 | "${DATASETS}" \ 54 | --checkpoint-path ${CHECKPOINT_PATH} \ 55 | --total-steps ${TOTAL_STEPS} --warmup-steps 10 
--train-warmup-steps 0 \ 56 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 57 | --lr 1e-6 --seq-length 2048 --batch-size 64 --micro-batch-size 1 --gradient-accumulate-step 1 \ 58 | --dist-url tcp://127.0.0.1:7033 \ 59 | --num-layers 6 --embedding-dim 6144 \ 60 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 61 | --job-id 0 --net-interface ${netif} \ 62 | --fp16 \ 63 | --dp-backend nccl \ 64 | --dp-mode allreduce \ 65 | --pp-mode gpipe --profiling no-profiling" 66 | 67 | 68 | (trap 'kill 0' SIGINT; \ 69 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 70 | & \ 71 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 72 | & \ 73 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 74 | & \ 75 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 76 | & \ 77 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 78 | & \ 79 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 80 | & \ 81 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 82 | & \ 83 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 84 | & \ 85 | wait) 86 | -------------------------------------------------------------------------------- /training/finetune_Pythia-Chat-Base-7B.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=Pythia-Chat-Base-7B 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-20000} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-100} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="\ 17 | ${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\ 18 | ${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\ 19 | ${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\ 20 | ${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\ 21 | ${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\ 22 | ${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\ 23 | ${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\ 24 | ${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\ 25 | ${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\ 26 | ${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\ 27 | ${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\ 28 | ${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\ 29 | ${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\ 30 | ${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\ 31 | ${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\ 32 | ${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\ 33 | ${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\ 34 | ${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\ 35 | ${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\ 36 | ${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\ 37 | ${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\ 38 | ${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\ 39 | ${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\ 40 | 
${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\ 41 | ${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\ 42 | ${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \ 43 | " 44 | 45 | ARGS="--model-name ${BASE_MODEL} \ 46 | --tokenizer-name ${BASE_MODEL} \ 47 | --project-name together \ 48 | --model-type gptneox \ 49 | --optimizer adam \ 50 | --seed 42 \ 51 | --load-pretrained-model true \ 52 | --task-name \ 53 | "${DATASETS}" \ 54 | --checkpoint-path ${CHECKPOINT_PATH} \ 55 | --total-steps ${TOTAL_STEPS} --warmup-steps 10 --train-warmup-steps 0 \ 56 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 57 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 58 | --dist-url tcp://127.0.0.1:7033 \ 59 | --num-layers 8 --embedding-dim 4096 \ 60 | --world-size 8 --pipeline-group-size 4 --data-group-size 2 \ 61 | --job-id 0 --net-interface ${netif} \ 62 | --fp16 \ 63 | --dp-backend nccl \ 64 | --dp-mode allreduce \ 65 | --pp-mode gpipe --profiling no-profiling" 66 | 67 | 68 | (trap 'kill 0' SIGINT; \ 69 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 70 | & \ 71 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 72 | & \ 73 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 74 | & \ 75 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 76 | & \ 77 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 78 | & \ 79 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 80 | & \ 81 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 82 | & \ 83 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 84 | & \ 85 | wait) 86 | -------------------------------------------------------------------------------- /training/finetune_RedPajama-INCITE-7B-Chat.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=redpajama-incite-chat-3b-sample 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/RedPajama-7B/togethercomputer_RedPajama-INCITE-7B-Chat" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type gptneox \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 2560 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | 41 | (trap 'kill 0' SIGINT; \ 42 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 43 | & \ 44 | 
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 45 | & \ 46 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 47 | & \ 48 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 49 | & \ 50 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 51 | & \ 52 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 53 | & \ 54 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 55 | & \ 56 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 57 | & \ 58 | wait) 59 | -------------------------------------------------------------------------------- /training/finetune_RedPajama-INCITE-Chat-3B-v1.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=redpajama-incite-chat-3b-sample 7 | 8 | export SHOW_DATA=0 9 | 10 | BASE_MODEL="${DIR}/../pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type gptneox \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 2560 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | 41 | (trap 'kill 0' SIGINT; \ 42 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 43 | & \ 44 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 45 | & \ 46 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 47 | & \ 48 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 49 | & \ 50 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 51 | & \ 52 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 53 | & \ 54 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 55 | & \ 56 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 57 | & \ 58 | wait) 59 | -------------------------------------------------------------------------------- /training/finetune_llama-2-7b-32k-booksum.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=llama-2-7b-32k-booksum 7 | 8 | export SHOW_DATA=1 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" 11 | 
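# The step/checkpoint schedule below can be overridden from the environment when
# launching this script -- illustrative invocation (hypothetical values):
#   FINETUNE_TOTAL_STEPS=100 FINETUNE_CHECKPOINT_STEPS=50 bash training/finetune_llama-2-7b-32k-booksum.sh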
12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/booksum.jsonl.zst:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type llama \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 4096 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | (trap 'kill 0' SIGINT; \ 41 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 42 | & \ 43 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 44 | & \ 45 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 46 | & \ 47 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 48 | & \ 49 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 50 | & \ 51 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 52 | & \ 53 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 54 | & \ 55 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 56 | & \ 57 | wait) 58 | -------------------------------------------------------------------------------- /training/finetune_llama-2-7b-32k-mqa.sh: -------------------------------------------------------------------------------- 1 | DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) 2 | 3 | netif=lo 4 | export GLOO_SOCKET_IFNAME=${netif} 5 | export NCCL_SOCKET_IFNAME=${netif} 6 | export MODEL_NAME=llama-2-7b-32k-mqa 7 | 8 | export SHOW_DATA=1 9 | 10 | BASE_MODEL="${DIR}/../pretrained/Llama-2-7B-32K-beta/togethercomputer_Llama-2-7B-32K-beta" 11 | 12 | TOTAL_STEPS=${FINETUNE_TOTAL_STEPS:-10} 13 | CHECKPOINT_STEPS=${FINETUNE_CHECKPOINT_STEPS:-10} 14 | CHECKPOINT_PATH=${FINETUNE_CHECKPOINT_PATH:-"${DIR}/../model_ckpts/${MODEL_NAME}"} 15 | 16 | DATASETS="https://huggingface.co/datasets/togethercomputer/Long-Data-Collections/resolve/main/fine-tune/natural_questions_10_200_docs.jsonl.zst:1" 17 | 18 | ARGS="--model-name ${BASE_MODEL} \ 19 | --tokenizer-name ${BASE_MODEL} \ 20 | --project-name together \ 21 | --model-type llama \ 22 | --optimizer adam \ 23 | --seed 42 \ 24 | --load-pretrained-model true \ 25 | --task-name \ 26 | "${DATASETS}" \ 27 | --checkpoint-path ${CHECKPOINT_PATH} \ 28 | --total-steps ${TOTAL_STEPS} --warmup-steps 0 --train-warmup-steps 0 \ 29 | --checkpoint-steps ${CHECKPOINT_STEPS} \ 30 | --lr 2e-5 --seq-length 32768 --batch-size 4 --micro-batch-size 1 --gradient-accumulate-step 1 \ 31 | --dist-url tcp://127.0.0.1:7033 \ 32 | --num-layers 4 --embedding-dim 4096 \ 33 | --world-size 8 --pipeline-group-size 8 --data-group-size 1 \ 34 | --job-id 0 --net-interface ${netif} \ 35 | --fp16 \ 36 | --dp-backend nccl \ 37 | --dp-mode allreduce \ 
38 | --pp-mode gpipe --profiling no-profiling" 39 | 40 | (trap 'kill 0' SIGINT; \ 41 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \ 42 | & \ 43 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \ 44 | & \ 45 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \ 46 | & \ 47 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \ 48 | & \ 49 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \ 50 | & \ 51 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \ 52 | & \ 53 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \ 54 | & \ 55 | python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \ 56 | & \ 57 | wait) 58 | -------------------------------------------------------------------------------- /training/lora/example/redpajama-incite-chat-3b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | import torch 5 | import transformers 6 | import torch.nn as nn 7 | import bitsandbytes as bnb 8 | from datasets import Dataset 9 | from peft import LoraConfig, get_peft_model 10 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM 11 | 12 | # this script should take around 14GB VRAM 13 | 14 | MODEL_NAME='redpajama-incite-chat-3b-sample-lowrank' 15 | 16 | # read datasets 17 | with open('data/OIG-chip2/unified_chip2.jsonl', 'r') as fp: 18 | data = [json.loads(x) for x in fp.readlines()] 19 | 20 | model = AutoModelForCausalLM.from_pretrained( 21 | "togethercomputer/RedPajama-INCITE-Chat-3B-v1", 22 | device_map='auto', 23 | ) 24 | 25 | tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Chat-3B-v1") 26 | tokenizer.pad_token = tokenizer.eos_token 27 | 28 | for param in model.parameters(): 29 | param.requires_grad = False # freeze the model - train adapters later 30 | if param.ndim == 1: 31 | # cast the small parameters (e.g. layernorm) to fp32 for stability 32 | param.data = param.data.to(torch.float32) 33 | 34 | model.gradient_checkpointing_enable() # reduce number of stored activations 35 | model.enable_input_require_grads() 36 | 37 | def print_trainable_parameters(model): 38 | """ 39 | Prints the number of trainable parameters in the model. 
40 | """ 41 | trainable_params = 0 42 | all_param = 0 43 | for _, param in model.named_parameters(): 44 | all_param += param.numel() 45 | if param.requires_grad: 46 | trainable_params += param.numel() 47 | print( 48 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 49 | ) 50 | 51 | config = LoraConfig( 52 | r=16, 53 | lora_alpha=32, 54 | target_modules=["query_key_value", "xxx"], 55 | lora_dropout=0.05, 56 | bias="none", 57 | task_type="CAUSAL_LM" 58 | ) 59 | 60 | model = get_peft_model(model, config) 61 | print_trainable_parameters(model) 62 | 63 | ## Training 64 | 65 | data = Dataset.from_list(data) 66 | data = data.map(lambda samples: tokenizer(samples['text']), batched=True) 67 | 68 | trainer = transformers.Trainer( 69 | model=model, 70 | train_dataset=data, 71 | args=transformers.TrainingArguments( 72 | per_device_train_batch_size=4, 73 | gradient_accumulation_steps=4, 74 | warmup_steps=100, 75 | max_steps=200, 76 | learning_rate=2e-4, 77 | fp16=True, 78 | logging_steps=1, 79 | output_dir='outputs' 80 | ), 81 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) 82 | ) 83 | model.config.use_cache = False # silence the warnings. Please re-enable for inference! 84 | trainer.train() 85 | 86 | # save the trained adapter to disk 87 | model.save_pretrained(f"outputs/{MODEL_NAME}") 88 | -------------------------------------------------------------------------------- /training/lora/example/redpajama-incite-chat-3b_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from peft import PeftModel, PeftConfig 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | 5 | peft_model_path ='outputs/redpajama-incite-chat-3b-sample-lowrank' 6 | 7 | config = PeftConfig.from_pretrained(peft_model_path) 8 | model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto') 9 | tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) 10 | 11 | # Load the Lora model 12 | model = PeftModel.from_pretrained(model, peft_model_path) 13 | 14 | batch = tokenizer(": Hello!\n:", return_tensors='pt') 15 | 16 | with torch.cuda.amp.autocast(): 17 | output_tokens = model.generate(**batch, max_new_tokens=50) 18 | 19 | print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) 20 | -------------------------------------------------------------------------------- /training/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/modules/__init__.py -------------------------------------------------------------------------------- /training/modules/dist_deberta_pp_module.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .deberta_modules import DebertaV2Embeddings, DebertaV2Layers, DebertaClassificationHead 3 | 4 | 5 | class DebertaStageBase(nn.Module): 6 | def __init__(self, args, config): 7 | super().__init__() 8 | self._to_cpu = False # (args.dist_backend == "gloo") 9 | self.config = config 10 | 11 | def _create_first_layer(self): 12 | return DebertaV2Embeddings(self.config) 13 | 14 | def _create_last_layer(self): 15 | return DebertaClassificationHead(self.config) 16 | 17 | def _create_transformer_layers(self, first_block=False): 18 
| return DebertaV2Layers(self.config, first_block=first_block) # TODO: checkpoint 19 | 20 | 21 | class DebertaStageFirst(DebertaStageBase): 22 | def __init__(self, args, config, device): 23 | super().__init__(args, config) 24 | self.device = device 25 | self.embeddings = self._create_first_layer().to(device) 26 | self.encoder = self._create_transformer_layers(first_block=True).to(device) 27 | 28 | def forward(self, x, token_type_ids=None, attention_mask=None): 29 | if self._to_cpu: 30 | x = x.to(self.device) 31 | if token_type_ids is not None: 32 | token_type_ids = token_type_ids.to(self.device) 33 | if attention_mask is not None: 34 | attention_mask = attention_mask.to(self.device) 35 | x = self.embeddings(x, token_type_ids=token_type_ids) 36 | out = self.encoder(x, attention_mask=attention_mask) 37 | return out.cpu() if self._to_cpu else out 38 | 39 | 40 | class DebertaStageMiddle(DebertaStageBase): 41 | def __init__(self, args, config, device): 42 | super().__init__(args, config) 43 | self.device = device 44 | self.encoder = self._create_transformer_layers(first_block=False).to(device) 45 | 46 | def forward(self, x, attention_mask=None): 47 | if self._to_cpu: 48 | x = x.to(self.device) 49 | if attention_mask is not None: 50 | attention_mask = attention_mask.to(self.device) 51 | out = self.encoder(x, attention_mask=attention_mask) 52 | return out.cpu() if self._to_cpu else out 53 | 54 | 55 | class DebertaStageLast(DebertaStageBase): 56 | def __init__(self, args, config, device): 57 | super().__init__(args, config) 58 | self.device = device 59 | self.encoder = self._create_transformer_layers(first_block=False).to(device) 60 | self.output_head = self._create_last_layer().to(device) 61 | 62 | def forward(self, x, attention_mask=None, input_ids=None): 63 | if self._to_cpu: 64 | x = x.to(self.device) 65 | if attention_mask is not None: 66 | attention_mask = attention_mask.to(self.device) 67 | x = self.encoder(x, attention_mask=attention_mask) 68 | out = self.output_head(x) 69 | return out.cpu() if self._to_cpu else out -------------------------------------------------------------------------------- /training/modules/dist_gpt_fsdp_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP 3 | from .task_modules import GlueClassification 4 | from .gpt_modules import MultiHeadAttention, TwoLayerMLP, GPTEmbedding 5 | from fairscale.nn.checkpoint import checkpoint_wrapper 6 | 7 | 8 | # This is only implemented to support checkpoint in FSDP 9 | 10 | class GPTTransformerFsdpLayer(torch.nn.Module): 11 | def __init__(self, model_dim, head_num, feedforward_dim=2048, layer_norm_eps=1e-5, use_checkpoint=True, 12 | explicit_fsdp=False) -> None: 13 | super(GPTTransformerFsdpLayer, self).__init__() 14 | self.attn = MultiHeadAttention(model_dim, head_num) 15 | if use_checkpoint: 16 | self.attn = checkpoint_wrapper(self.attn) 17 | if explicit_fsdp: 18 | self.attn = FSDP(self.attn, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 19 | flatten_parameters=False) 20 | # Implementation of Feedforward model 21 | self.mlp = TwoLayerMLP(model_dim, feedforward_dim) 22 | if use_checkpoint: 23 | self.mlp = checkpoint_wrapper(self.mlp) 24 | if explicit_fsdp: 25 | self.mlp = FSDP(self.mlp, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 26 | flatten_parameters=False) 27 | self.norm1 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) 28 |
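        # (Both LayerNorms are kept outside the checkpoint_wrapper / FSDP wrapping above;
        # only the attention and MLP sub-modules are optionally checkpointed and sharded.)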
self.norm2 = torch.nn.LayerNorm(model_dim, eps=layer_norm_eps) 29 | # self.dropout1 = nn.Dropout(dropout) 30 | # self.dropout2 = nn.Dropout(dropout) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | x = self.norm1(x) 34 | # x = x + self.dropout_1(self.attn(x2, x2, x2)) 35 | x.requires_grad_(True) 36 | x = self.attn(x) 37 | x = self.norm2(x) 38 | # x = x + self.dropout_2(self.ff(x2)) 39 | x.requires_grad_(True) 40 | x = self.mlp(x) 41 | return x 42 | 43 | 44 | class GPTGlueFsdpModel(torch.nn.Module): 45 | def __init__(self, args, vocab_size, num_classes, use_checkpoint=True): 46 | super(GPTGlueFsdpModel, self).__init__() 47 | self.embedding = GPTEmbedding(vocab_size, args.embedding_dim, args.seq_length) 48 | 49 | module_list = [] 50 | for _ in range(args.num_layers): 51 | module_list.append(GPTTransformerFsdpLayer(args.embedding_dim, args.num_heads, 52 | args.embedding_dim * 4, use_checkpoint, explicit_fsdp=False)) 53 | self.transformers = torch.nn.Sequential(*module_list) 54 | self.classifier = GlueClassification(args.embedding_dim, num_classes) 55 | 56 | def forward(self, input_ids, position_ids): 57 | input_emb = self.embedding(input_ids, position_ids) 58 | output_emb = self.transformers(input_emb) 59 | return self.classifier(output_emb) 60 | 61 | 62 | class GPTFsdpStageBase(torch.nn.Module): 63 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, use_checkpoint=True, explicit_fsdp=True): 64 | super(GPTFsdpStageBase, self).__init__() 65 | self._vocab_size = vocab_size 66 | self._explicit_fsdp = explicit_fsdp 67 | self._use_checkpoint = use_checkpoint 68 | self._embedding_dim = args.embedding_dim # embedding dimension 69 | self._seq_length = args.seq_length 70 | self._num_classes = num_classes 71 | # the dimension of the feedforward aws_network model in nn.TransformerEncoder 72 | self._feedforward_dim = args.embedding_dim * 4 73 | self._num_heads = args.num_heads # the number of heads in the multi-head attention models 74 | self._num_layers = num_stage_layers 75 | 76 | def _create_first_layer(self): 77 | emb = GPTEmbedding(self._vocab_size, self._embedding_dim, self._seq_length) 78 | if self._explicit_fsdp: 79 | return FSDP(emb, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 80 | flatten_parameters=False) 81 | else: 82 | return emb 83 | 84 | def _create_last_layer(self): 85 | classifier = GlueClassification(self._embedding_dim, self._num_classes) 86 | if self._explicit_fsdp: 87 | return FSDP(classifier, reshard_after_forward=True, move_params_to_cpu=False, mixed_precision=False, 88 | flatten_parameters=False) 89 | else: 90 | return classifier 91 | 92 | def _create_fsdp_transformer_layer(self): 93 | return GPTTransformerFsdpLayer(self._embedding_dim, self._num_heads, self._feedforward_dim, 94 | use_checkpoint=self._use_checkpoint, explicit_fsdp=self._explicit_fsdp) 95 | 96 | 97 | class GPTFsdpStageFirst(GPTFsdpStageBase): 98 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 99 | super(GPTFsdpStageFirst, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 100 | explicit_fsdp) 101 | self.device = device 102 | module_list = [self._create_first_layer()] 103 | for _ in range(self._num_layers): 104 | module_list.append(self._create_fsdp_transformer_layer()) 105 | self.model = torch.nn.Sequential(*module_list).to(device) 106 | 107 | def forward(self, x): 108 | out = self.model(x) 109 | return out 110 | 111 | 112 | class 
GPTFsdpStageMiddle(GPTFsdpStageBase): 113 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 114 | super(GPTFsdpStageMiddle, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 115 | explicit_fsdp) 116 | self.device = device 117 | module_list = [] 118 | for _ in range(self._num_layers): 119 | module_list.append(self._create_fsdp_transformer_layer()) 120 | self.model = torch.nn.Sequential(*module_list).to(device) 121 | 122 | def forward(self, x): 123 | out = self.model(x) 124 | return out 125 | 126 | 127 | class GPTFsdpStageLast(GPTFsdpStageBase): 128 | def __init__(self, args, num_stage_layers, vocab_size, num_classes, device, use_checkpoint=True, explicit_fsdp=True): 129 | super(GPTFsdpStageLast, self).__init__(args, num_stage_layers, vocab_size, num_classes, use_checkpoint, 130 | explicit_fsdp) 131 | self.device = device 132 | module_list = [] 133 | for _ in range(self._num_layers): 134 | module_list.append(self._create_fsdp_transformer_layer()) 135 | module_list.append(self._create_last_layer()) 136 | self.model = torch.nn.Sequential(*module_list).to(device) 137 | 138 | def forward(self, x): 139 | out = self.model(x) 140 | return out 141 | -------------------------------------------------------------------------------- /training/modules/dist_gpt_pp_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from comm.comm_utils import * 4 | 5 | from copy import deepcopy 6 | 7 | 8 | class GPTStageBase(nn.Module): 9 | def __init__(self, args, config): 10 | super(GPTStageBase, self).__init__() 11 | self._to_cpu = (args.dist_backend == "gloo") 12 | self._embedding_dim = args.embedding_dim # embedding dimension 13 | self._seq_length = args.seq_length 14 | # the dimension of the feedforward aws_network model in nn.TransformerEncoder 15 | self._feedforward_dim = args.embedding_dim * 4 16 | self._num_heads = args.num_heads # the number of heads in the multi-head attention models 17 | self._num_layers = args.num_layers 18 | self._layer_begin = get_pipeline_parallel_rank() * args.num_layers 19 | self._layer_end = min(self._layer_begin + args.num_layers, args.max_layers) 20 | 21 | self._task_type = getattr(args, 'task_type', 'language_model') 22 | 23 | self.load_pretrained_model = args.load_pretrained_model 24 | self.model_name = args.model_name 25 | self.config = config 26 | 27 | if hasattr(args, 'model_type'): 28 | if args.model_type == "gpt2": 29 | from .hf_gpt2_modules import GPTEmbeddings, GPTBlock, GPTLMHead 30 | elif args.model_type == "gptj": 31 | from .hf_gptj_modules import GPTEmbeddings, GPTBlock, GPTLMHead 32 | elif args.model_type == "gptneox": 33 | from .hf_gptneox_modules import GPTEmbeddings, GPTBlock, GPTLMHead 34 | elif args.model_type == 'llama': 35 | from .llama_modules import GPTEmbeddings, GPTBlock, GPTLMHead 36 | else: 37 | raise Exception("unknown") 38 | else: 39 | raise Exception("!!!! 
model type not defined") 40 | 41 | self._GPTEmbeddings = GPTEmbeddings 42 | self._GPTBlock = GPTBlock 43 | self._GPTLMHead = GPTLMHead 44 | 45 | def _create_first_layer(self): 46 | layer = self._GPTEmbeddings(deepcopy(self.config)) 47 | if self.load_pretrained_model: 48 | print('loading embs') 49 | ret = layer.load_state_dict( 50 | torch.load(f'{self.model_name}/pytorch_embs.pt'), strict=False 51 | ) 52 | if len(ret.missing_keys): 53 | print('The following weight keys are missing:') 54 | print(ret.missing_keys) 55 | if len(ret.unexpected_keys): 56 | print('The following weight keys are unexpected:') 57 | print(ret.unexpected_keys) 58 | return layer 59 | 60 | def _create_last_layer(self): 61 | layer = self._GPTLMHead(deepcopy(self.config)) 62 | if self.load_pretrained_model: 63 | print('loading lm_head') 64 | ret = layer.load_state_dict( 65 | torch.load(f'{self.model_name}/pytorch_lm_head.pt'), strict=False 66 | ) 67 | if len(ret.missing_keys): 68 | print('The following weight keys are missing:') 69 | print(ret.missing_keys) 70 | if len(ret.unexpected_keys): 71 | print('The following weight keys are unexpected:') 72 | print(ret.unexpected_keys) 73 | return layer 74 | 75 | def _create_transformer_layer(self, layer_idx=0): 76 | config = deepcopy(self.config) 77 | layer = self._GPTBlock(config, layer_id=layer_idx) # TODO: checkpoint 78 | if self.load_pretrained_model: 79 | print(f'loading layer {layer_idx}') 80 | ret = layer.load_state_dict( 81 | torch.load(f'{self.model_name}/pytorch_{layer_idx}.pt'), strict=False 82 | ) 83 | if len(ret.missing_keys): 84 | print('The following weight keys are missing:') 85 | print(ret.missing_keys) 86 | if len(ret.unexpected_keys): 87 | print('The following weight keys are unexpected:') 88 | print(ret.unexpected_keys) 89 | return layer 90 | 91 | 92 | class GPTStageFull(GPTStageBase): 93 | def __init__(self, args, config, device): 94 | super(GPTStageFull, self).__init__(args, config) 95 | self.device = device 96 | module_list = [self._create_first_layer()] 97 | for layer_idx in range(self._layer_begin, self._layer_end): 98 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 99 | if hasattr(args, 'skip_lm_head') and args.skip_lm_head: 100 | pass 101 | else: 102 | module_list.append(self._create_last_layer()) 103 | self.model = nn.Sequential(*module_list).to(device) 104 | 105 | def forward(self, x, **kargs): 106 | for module in self.model: 107 | x = module(x, **kargs) 108 | return x 109 | 110 | 111 | class GPTStageFirst(GPTStageBase): 112 | def __init__(self, args, config, device): 113 | super(GPTStageFirst, self).__init__(args, config) 114 | self.device = device 115 | module_list = [self._create_first_layer()] 116 | for layer_idx in range(self._layer_begin, self._layer_end): 117 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 118 | self.model = nn.Sequential(*module_list).to(device) 119 | 120 | def forward(self, x, **kargs): 121 | for module in self.model: 122 | x = module(x, **kargs) 123 | return x 124 | # out = self.model(x.to(self.device), **kargs) 125 | # return out.cpu() if self._to_cpu else out 126 | 127 | 128 | class GPTStageMiddle(GPTStageBase): 129 | def __init__(self, args, config, device): 130 | super(GPTStageMiddle, self).__init__(args, config) 131 | self.device = device 132 | module_list = [] 133 | for layer_idx in range(self._layer_begin, self._layer_end): 134 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 135 | self.model = nn.Sequential(*module_list).to(device) 136 | 137 
| def forward(self, x, **kargs): 138 | for module in self.model: 139 | x = module(x, **kargs) 140 | return x 141 | # out = self.model(x.to(self.device), **kargs) if self._to_cpu else self.model(x) 142 | # return out.cpu() if self._to_cpu else out 143 | 144 | 145 | class GPTStageLast(GPTStageBase): 146 | def __init__(self, args, config, device): 147 | super(GPTStageLast, self).__init__(args, config) 148 | self.device = device 149 | module_list = [] 150 | for layer_idx in range(self._layer_begin, self._layer_end): 151 | module_list.append(self._create_transformer_layer(layer_idx=layer_idx)) 152 | 153 | if hasattr(args, 'skip_lm_head') and args.skip_lm_head: 154 | pass 155 | else: 156 | module_list.append(self._create_last_layer()) 157 | 158 | self.model = nn.Sequential(*module_list).to(device) 159 | 160 | # self.upscale_last = nn.Linear(args.embedding_dim, 9216).to(device) 161 | 162 | def forward(self, x, **kargs): 163 | for module in self.model: 164 | x = module(x, **kargs) 165 | 166 | return x 167 | 168 | # def forward(self, x, **kargs): 169 | # for module in self.model[:-1]: 170 | # x = module(x, **kargs) 171 | # hid = x 172 | # x = self.model[-1](x, **kargs) 173 | 174 | # hid = self.upscale_last(hid) 175 | # loss = torch.nn.functional.mse_loss(hid, kargs['teacher_hidden_states']) 176 | # print(loss.item()) 177 | # return x, loss 178 | -------------------------------------------------------------------------------- /training/modules/task_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class GlueClassification(torch.nn.Module): 5 | def __init__(self, model_dim, num_classes): 6 | super(GlueClassification, self).__init__() 7 | self.model_dim = model_dim 8 | self.num_classes = num_classes 9 | self.pooler_layer = torch.nn.Linear(model_dim, model_dim) 10 | self.fc_layer = torch.nn.Linear(model_dim, num_classes) 11 | 12 | def forward(self, hidden_states, pooler_index=0): 13 | pooled = hidden_states[:, pooler_index, :] 14 | pooled = self.pooler_layer(pooled) 15 | pooled = torch.tanh(pooled) 16 | return self.fc_layer(pooled) 17 | -------------------------------------------------------------------------------- /training/modules/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import AutoTokenizer, GPT2TokenizerFast, DebertaV2Tokenizer 3 | 4 | def build_tokenizer(args): 5 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) 6 | if tokenizer.pad_token is None: 7 | tokenizer.pad_token = tokenizer.eos_token 8 | return tokenizer 9 | 10 | def build_gpt2_tokenizer(args): 11 | tokenizer = GPT2TokenizerFast.from_pretrained(args.tokenizer_name) 12 | tokenizer.pad_token = tokenizer.eos_token 13 | return tokenizer 14 | 15 | def build_deberta_tokenizer(args): 16 | tokenizer = DebertaV2Tokenizer.from_pretrained(args.tokenizer_name) 17 | return tokenizer 18 | -------------------------------------------------------------------------------- /training/modules/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn import functional 6 | from typing import Optional, Tuple, Union 7 | 8 | 9 | # @torch.jit.script 10 | def gpt_loss_func(input, target): 11 | lm_logits, labels = input, target 12 | shift_logits = lm_logits[..., :-1, :].contiguous() 13 | shift_labels = labels[..., 1:].contiguous() 14 | loss = 
functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) 15 | return loss -------------------------------------------------------------------------------- /training/optimizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/optimizer/__init__.py -------------------------------------------------------------------------------- /training/optimizer/grad_scalar.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | 4 | import torch 5 | 6 | 7 | class GradScaler(ABC): 8 | def __init__(self, initial_scale, device=None): 9 | """Initialize scale value with the input initial scale.""" 10 | assert initial_scale > 0.0 11 | self.device = device 12 | self._scale = torch.cuda.FloatTensor([initial_scale], device=device) 13 | 14 | @property 15 | def scale(self): 16 | return self._scale 17 | 18 | @property 19 | def inv_scale(self): 20 | return self._scale.double().reciprocal().float() 21 | 22 | @abstractmethod 23 | def update(self, found_inf): 24 | pass 25 | 26 | @abstractmethod 27 | def state_dict(self): 28 | pass 29 | 30 | @abstractmethod 31 | def load_state_dict(self, state_dict): 32 | pass 33 | 34 | 35 | class ConstantGradScaler(GradScaler): 36 | 37 | def update(self, found_inf): 38 | pass 39 | 40 | def state_dict(self): 41 | return dict() 42 | 43 | def load_state_dict(self, state_dict): 44 | pass 45 | 46 | 47 | class DynamicGradScaler(GradScaler): 48 | 49 | def __init__(self, initial_scale, min_scale, 50 | growth_factor, backoff_factor, 51 | growth_interval, hysteresis, device=None): 52 | """"Grad scaler with dynamic scale that gets adjusted 53 | during training.""" 54 | super(DynamicGradScaler, self).__init__(initial_scale, device=device) 55 | 56 | # Lower bound on the scale. 57 | assert min_scale > 0.0 58 | assert min_scale <= initial_scale 59 | self.min_scale = torch.cuda.FloatTensor([min_scale], device=device) 60 | # Growth and backoff factors for the scale. 61 | assert growth_factor > 1.0 62 | self.growth_factor = torch.cuda.FloatTensor([growth_factor], device=device) 63 | assert backoff_factor < 1.0 64 | assert backoff_factor > 0.0 65 | self.backoff_factor = torch.cuda.FloatTensor([backoff_factor], device=device) 66 | # Interval over which if we don't see any inf/nan, 67 | # we will scale the grad scale by the growth factor. 68 | assert growth_interval > 0 69 | self.growth_interval = growth_interval 70 | # Number of inf/nans we should see before scaling down 71 | # the grad scale by the backoff factor. 72 | assert hysteresis > 0 73 | self.hysteresis = hysteresis 74 | 75 | # Trackers. 76 | self._growth_tracker = 0 77 | self._hysteresis_tracker = self.hysteresis 78 | 79 | def update(self, found_inf): 80 | # If we have an inf/nan, growth tracker is set to 0 81 | # and hysterisis tracker is reduced by 1. 82 | if found_inf: 83 | self._growth_tracker = 0 84 | self._hysteresis_tracker -= 1 85 | # Now if we are out of hysteresis count, scale down the loss. 86 | if self._hysteresis_tracker <= 0: 87 | self._scale = torch.max(self._scale * self.backoff_factor, 88 | self.min_scale) 89 | print('##### scale backoff to', self._scale) 90 | else: 91 | # If there is no nan/inf, increment the growth tracker. 
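            # Concretely (illustrative numbers): with growth_factor=2.0, backoff_factor=0.5
            # and the argparse defaults growth_interval=1000, hysteresis=2, the scale doubles
            # only after 1000 consecutive overflow-free updates, while the second overflow
            # since the last growth event halves it (never dropping below min_scale).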
92 | self._growth_tracker += 1 93 | # If we have had enough consequitive intervals with no nan/inf: 94 | if self._growth_tracker == self.growth_interval: 95 | # Reset the tracker and hysteresis trackers, 96 | self._growth_tracker = 0 97 | self._hysteresis_tracker = self.hysteresis 98 | # and scale up the loss scale. 99 | self._scale = self._scale * self.growth_factor 100 | print('##### scale grow to', self._scale) 101 | 102 | def state_dict(self): 103 | state_dict = {} 104 | state_dict['scale'] = self._scale 105 | state_dict['growth_tracker'] = self._growth_tracker 106 | state_dict['hysteresis_tracker'] = self._hysteresis_tracker 107 | return state_dict 108 | 109 | def load_state_dict(self, state_dict): 110 | self._scale = state_dict['scale'].to(self.device) 111 | self._growth_tracker = state_dict['growth_tracker'] 112 | self._hysteresis_tracker = state_dict['hysteresis_tracker'] -------------------------------------------------------------------------------- /training/optimizer/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .grad_scalar import * 3 | 4 | # This follows some implementation from Megatron 5 | 6 | 7 | def _has_overflow_serial(grads): 8 | 9 | def _has_inf_or_nan(x): 10 | try: 11 | # if x is half, the .float() incurs an additional deep copy, but it's necessary if 12 | # Pytorch's .sum() creates a one-element tensor of the same type as x 13 | # (which is true for some recent version of pytorch). 14 | cpu_sum = float(x.float().sum()) 15 | # More efficient version that can be used if .sum() returns a Python scalar 16 | # cpu_sum = float(x.sum()) 17 | except RuntimeError as instance: 18 | # We want to check if inst is actually an overflow exception. 19 | # RuntimeError could come from a different error. 20 | # If so, we still want the exception to propagate. 21 | if "value cannot be converted" not in instance.args[0]: 22 | raise 23 | return True 24 | else: 25 | if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum: 26 | return True 27 | return False 28 | 29 | for p in grads: 30 | if _has_inf_or_nan(p): 31 | return torch.FloatTensor([1.0]) 32 | 33 | return torch.FloatTensor([0.0]) 34 | 35 | 36 | # `x` is a torch.Tensor 37 | 38 | 39 | 40 | def _zero_grad_group(group, set_to_none): 41 | """Zero out the gradient for a group of parameters. 42 | Note: copied from torch.optim.optimizer.""" 43 | for param in group: 44 | if param.grad is not None: 45 | if set_to_none: 46 | param.grad = None 47 | else: 48 | if param.grad.grad_fn is not None: 49 | param.grad.detach_() 50 | else: 51 | param.grad.requires_grad_(False) 52 | param.grad.zero_() 53 | 54 | 55 | ''' 56 | def _multi_tensor_copy_this_to_that(this, that): 57 | for this_, that_ in zip(this, that): 58 | that_.copy_(this_) 59 | ''' 60 | 61 | 62 | class Fp16Optimizer: 63 | # If offload is set to true, the fp32 copy is stored on CPU. 
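    # Typical construction (see get_fp16_optimizer below). The inner optimizer must already
    # hold the fp16 model parameters; an illustrative sketch, assuming an AdamW base optimizer:
    #
    #     base_optimizer = torch.optim.AdamW(fp16_model.parameters(), lr=args.lr)
    #     optimizer = Fp16Optimizer(base_optimizer, DynamicGradScaler(...), device, offload=False)
    #
    # The wrapper keeps an fp32 master copy of every trainable fp16 parameter (on CPU when
    # offload=True), copies the model grads onto the masters before each step(), and copies
    # the updated master weights back into the fp16 model afterwards.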
64 | def __init__(self, optimizer, grad_scaler, device, offload=False): 65 | self.offload = offload 66 | if self.offload: 67 | self.cpu_to_gpu_stream = torch.cuda.Stream(device=device, priority=-1) 68 | self.gpu_to_cpu_stream = torch.cuda.Stream(device=device, priority=-1) 69 | self.optimizer = optimizer 70 | self.grad_scaler = grad_scaler 71 | 72 | if self.grad_scaler: 73 | self.found_inf = torch.cuda.FloatTensor([0.0], device=device) if not self.offload else torch.FloatTensor([0.0]) 74 | 75 | self._dummy_overflow_buf = torch.cuda.IntTensor([0], device=device) if not self.offload else torch.IntTensor([0]) 76 | 77 | # Note that the model should first be cast to fp16 before passing to the optimizer. 78 | self.float16_groups = [] 79 | self.fp32_from_float16_groups = [] 80 | 81 | # For all the groups in the original optimizer: 82 | for param_group in self.optimizer.param_groups: 83 | float16_params_this_group = [] 84 | fp32_from_float16_params_this_group = [] 85 | # For all the parameters in this group: 86 | for i, param in enumerate(param_group['params']): 87 | if param.requires_grad: 88 | # float16 params: 89 | assert param.type() == 'torch.cuda.HalfTensor' 90 | float16_params_this_group.append(param) 91 | # Create a copy 92 | if self.offload: 93 | optimizer_param = param.detach().clone().float().to(device='cpu') 94 | assert optimizer_param.device == torch.device('cpu') 95 | if optimizer_param.grad is None: 96 | optimizer_param.grad = torch.zeros_like(optimizer_param.data) 97 | else: 98 | optimizer_param = param.detach().clone().float() 99 | # Replace the optimizer params with the new fp32 copy. 100 | param_group['params'][i] = optimizer_param 101 | fp32_from_float16_params_this_group.append(optimizer_param) 102 | # Reset existing state dict key to the new optimizer param. 103 | if param in self.optimizer.state: 104 | self.optimizer.state[optimizer_param] = self.optimizer.state.pop(param) 105 | 106 | self.float16_groups.append(float16_params_this_group) 107 | self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) 108 | 109 | # Leverage state_dict() and load_state_dict() to 110 | # recast preexisting per-param state tensors 111 | self.optimizer.load_state_dict(self.optimizer.state_dict()) 112 | 113 | def zero_grad(self, set_to_none=True): 114 | for group in self.float16_groups: 115 | _zero_grad_group(group, set_to_none) 116 | if not self.offload: 117 | for group in self.fp32_from_float16_groups: 118 | _zero_grad_group(group, set_to_none) 119 | 120 | def get_loss_scale(self): 121 | return self.grad_scaler.scale 122 | 123 | def _copy_model_grads_to_optimizer_grads(self): 124 | # This only needs to be done for the float16 group. 125 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 126 | for model_param, optimizer_param in zip(model_group, optimizer_group): 127 | if model_param.grad is not None: 128 | if self.offload: 129 | with torch.cuda.stream(self.gpu_to_cpu_stream): 130 | optimizer_param.grad.copy_(model_param.grad, non_blocking=False) 131 | else: 132 | optimizer_param.grad = model_param.grad.float() 133 | # Safe to deallocate model's grad/optimizer_grad after copying. 134 | # (If using contiguous buffers, optimizer_grad's memory should 135 | # persist and therefore should not be deallocated.) 136 | model_param.grad = None 137 | 138 | def _unscale_optimizer_grads_and_check_for_nan(self): 139 | optimizer_grads = [] 140 | # fp32 params fromm float16 ones. 
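        # Collect the fp32 master grads; torch._amp_foreach_non_finite_check_and_unscale_
        # then (roughly) multiplies each grad by inv_scale in place and sets found_inf to 1.0
        # if any element is inf/NaN. In offload mode the grads live on CPU, so the serial
        # Python check below is used instead (it only flags overflow, it does not unscale).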
141 | for optimizer_group in self.fp32_from_float16_groups: 142 | for optimizer_param in optimizer_group: 143 | if optimizer_param.grad is not None: 144 | optimizer_grads.append(optimizer_param.grad.data) 145 | # Reset found inf. 146 | self.found_inf.fill_(0.0) 147 | # Unscale and set found inf/nan 148 | print(optimizer_grads[0].device, self.found_inf.device, self.grad_scaler.inv_scale.device) 149 | if self.offload: 150 | self.found_inf = _has_overflow_serial(optimizer_grads) 151 | else: 152 | torch._amp_foreach_non_finite_check_and_unscale_(optimizer_grads, self.found_inf, self.grad_scaler.inv_scale) 153 | # Check for nan. 154 | found_inf_flag = (self.found_inf.item() > 0) 155 | return found_inf_flag 156 | 157 | def _get_model_and_optimizer_params_data_float16_deprecated(self): 158 | model_data = [] 159 | optimizer_data = [] 160 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 161 | for model_param, optimizer_param in zip(model_group, optimizer_group): 162 | model_data.append(model_param.data) 163 | optimizer_data.append(optimizer_param.data) 164 | return model_data, optimizer_data 165 | 166 | def _copy_optimizer_params_to_model_params(self): 167 | # Only needed for the float16 params. 168 | # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() 169 | # _multi_tensor_copy_this_to_that(this=optimizer_data, that=model_data) 170 | 171 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 172 | for model_param, optimizer_param in zip(model_group, optimizer_group): 173 | if self.offload: 174 | with torch.cuda.stream(self.cpu_to_gpu_stream): 175 | model_param.data.copy_(optimizer_param.data, non_blocking=False) 176 | else: 177 | model_param.data.copy_(optimizer_param.data) 178 | 179 | def _copy_model_params_to_optimizer_params(self): 180 | # Only needed for the float16 params. 181 | # model_data, optimizer_data = self._get_model_and_optimizer_params_data_float16_deprecated() 182 | # _multi_tensor_copy_this_to_that(this=model_data, that=optimizer_data) 183 | for model_group, optimizer_group in zip(self.float16_groups, self.fp32_from_float16_groups): 184 | for model_param, optimizer_param in zip(model_group, optimizer_group): 185 | if self.offload: 186 | with torch.cuda.stream(self.gpu_to_cpu_stream): 187 | optimizer_param.data.copy_(model_param.data, non_blocking=False) 188 | else: 189 | optimizer_param.data.copy_(model_param.data) 190 | 191 | def reload_model_params(self): 192 | self._copy_model_params_to_optimizer_params() 193 | 194 | @torch.no_grad() 195 | def step(self): 196 | self._copy_model_grads_to_optimizer_grads() 197 | 198 | found_inf_flag = self._unscale_optimizer_grads_and_check_for_nan() 199 | self.grad_scaler.update(found_inf_flag) 200 | 201 | # If we found inf/nan, skip the update. 202 | if found_inf_flag: 203 | print("!!! Warning: find inf in fp16 optimizer-step() !!!") 204 | return False 205 | 206 | for params in self.fp32_from_float16_groups: 207 | torch.nn.utils.clip_grad_norm_(params, 1.0) 208 | 209 | # Step the optimizer. 210 | self.optimizer.step() 211 | 212 | self._copy_optimizer_params_to_model_params() 213 | # Successful update. 
214 | return True 215 | 216 | def scale(self, z): 217 | return z * self.grad_scaler.scale 218 | 219 | def unscale(self, z): 220 | return z * self.grad_scaler.inv_scale 221 | 222 | def state_dict(self): 223 | return self.optimizer.state_dict() 224 | 225 | def load_state_dict(self, state_dict): 226 | self.optimizer.load_state_dict(state_dict) 227 | 228 | 229 | def get_fp16_optimizer(args, optimizer, device): 230 | assert args.fp16 is not None 231 | if args.loss_scale: 232 | print("fp16 uses ConstantGradScaler.") 233 | grad_scaler = ConstantGradScaler(args.loss_scale) 234 | else: 235 | print("fp16 uses DynamicGradScaler.") 236 | grad_scaler = DynamicGradScaler( 237 | initial_scale=args.initial_loss_scale, 238 | min_scale=args.min_loss_scale, 239 | growth_factor=2.0, 240 | backoff_factor=0.5, 241 | growth_interval=args.loss_scale_window, 242 | hysteresis=args.hysteresis) 243 | return Fp16Optimizer(optimizer, grad_scaler, device, getattr(args, 'use_offload', False)) 244 | 245 | -------------------------------------------------------------------------------- /training/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /training/pipeline_parallel/dist_pp_utils.py: -------------------------------------------------------------------------------- 1 | from .dist_gpipe_pipeline_async import GpipeAsync 2 | 3 | 4 | def get_pp_module(args, config, device, use_dp): 5 | 6 | if args.pp_mode == 'gpipe': 7 | return GpipeAsync(args, config, device, use_dp) 8 | else: 9 | print("Not recognize this pipeline parallel mode.") 10 | assert False 11 | 12 | -------------------------------------------------------------------------------- /training/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/tasks/__init__.py -------------------------------------------------------------------------------- /training/tasks/data_loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/tasks/data_loaders/__init__.py -------------------------------------------------------------------------------- /training/tasks/data_loaders/prosocial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import torch 4 | import json 5 | from torch.utils.data import IterableDataset, DataLoader 6 | from itertools import cycle, islice 7 | import random 8 | from datasets import Dataset 9 | from datasets import load_dataset, load_from_disk 10 | from comm.comm_utils import * 11 | 12 | 13 | 14 | class StreamDataset(IterableDataset): 15 | def __init__(self, dataset, tokenizer, seq_length=1024): 16 | 17 | self.dataset = dataset 18 | 19 | self.tokenizer = tokenizer 20 | self.seq_length = seq_length 21 | 22 | self.it = None 23 | self.iter_count = 0 24 | 25 | def state_dict(self): 26 | return { 27 | 'iter_count': self.iter_count, 28 | } 29 | 30 | def load_state_dict(self, state_dict): 31 | self.iter_count = state_dict['iter_count'] 32 | self.dataset = self.dataset.skip(self.iter_count) 33 | 34 | def 
get_sequence(self): 35 | 36 | it = cycle(iter(self.dataset)) 37 | 38 | while True: 39 | 40 | text_context = '''Possible labels: 41 | 1. casual 42 | 2. needs caution 43 | 3. needs intervention 44 | 4. possibly needs caution 45 | 5. probably needs caution''' 46 | 47 | while True: 48 | 49 | instance = next(it) 50 | 51 | text = instance['text'] 52 | text_context += '\n\n' + text 53 | 54 | input_ids = self.tokenizer(text_context.strip())['input_ids'] 55 | if len(input_ids) > self.seq_length: 56 | break 57 | 58 | input_ids = input_ids[:self.seq_length] 59 | input_ids = torch.tensor(input_ids).long() 60 | 61 | yield { 62 | 'input_ids': input_ids, 63 | } 64 | 65 | 66 | def get_stream(self): 67 | return cycle(self.get_sequence()) 68 | 69 | def __iter__(self): 70 | if self.it is None: 71 | self.it = self.get_stream() 72 | return self.it 73 | -------------------------------------------------------------------------------- /training/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/togethercomputer/OpenChatKit/a7094aa583d4ac9ecbe700f0c5b11e6bb28cb454/training/utils/__init__.py -------------------------------------------------------------------------------- /training/utils/dist_args_utils.py: -------------------------------------------------------------------------------- 1 | def add_device_arguments(parser): 2 | parser.add_argument('--use-cuda', default=True, type=lambda x: (str(x).lower() == 'true'), 3 | help='if this is set to True, will use cuda to train') 4 | parser.add_argument('--cuda-id', type=int, default=0, metavar='N', 5 | help='cuda index, if the instance has multiple GPUs.') 6 | parser.add_argument('--cuda-num', type=int, default=1, metavar='N', 7 | help='number of GPUs, if the instance has multiple GPUs.') 8 | parser.add_argument('--debug-mem', default=True, type=lambda x: (str(x).lower() == 'true'), 9 | help='if this is set to True, we will print some memory stats.') 10 | 11 | 12 | def add_torch_distributed_arguments(parser): 13 | parser.add_argument('--dist-backend', type=str, default='cupy_nccl', metavar='S', 14 | help='backend type for distributed PyTorch (default: cupy_nccl)') 15 | parser.add_argument('--dp-backend', type=str, default='nccl', metavar='S', 16 | help='backend type for data parallel') 17 | parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:9000', metavar='S', 18 | help='master ip for distributed PyTorch') 19 | parser.add_argument('--world-size', type=int, default=4, metavar='D', 20 | help='world-size (default: 4)') 21 | parser.add_argument('--pipeline-group-size', type=int, default=4, metavar='D', 22 | help='world-size (default: 2)') 23 | parser.add_argument('--data-group-size', type=int, default=1, metavar='D', 24 | help='world-size (default: 1)') 25 | parser.add_argument('--rank', type=int, default=0, metavar='N', 26 | help='rank of the node') 27 | 28 | 29 | def add_task_arguments(parser): 30 | parser.add_argument('--train-data', nargs='+', default=['./glue_dataset/data/QQP/train.tsv'], metavar='S', 31 | help='path to the training data') 32 | parser.add_argument('--valid-data', nargs='+', default=['./glue_dataset/data/QQP/test.tsv'], metavar='S', 33 | help='path to the training data') 34 | parser.add_argument('--tokenizer-type', type=str, default='BertWordPieceLowerCase', metavar='S', 35 | help='which tokenizer to use.') 36 | parser.add_argument('--vocab-file', type=str, default='./glue_dataset/data/bert-large-cased-vocab.txt', metavar='S', 37 | help='which tokenizer 
to use.') 38 | parser.add_argument('--vocab-extra-ids', type=int, default=0, metavar='N', 39 | help='-') 40 | parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128, metavar='N', 41 | help='-') 42 | parser.add_argument('--optimizer', type=str, default='adamw', metavar='N', 43 | help='-') 44 | 45 | 46 | def add_model_arguments(parser): 47 | parser.add_argument('--seq-length', type=int, default=1024, metavar='N', 48 | help='-') 49 | parser.add_argument('--embedding-dim', type=int, default=768, metavar='N', 50 | help='-') 51 | parser.add_argument('--num-layers', type=int, default=4, metavar='N', 52 | help='-') 53 | parser.add_argument('--num-heads', type=int, default=12, metavar='N', 54 | help='-') 55 | 56 | 57 | def add_training_hyper_parameter_arguments(parser): 58 | parser.add_argument('--train-log-backend', type=str, default='print', metavar='N', 59 | help='-') 60 | parser.add_argument('--project-name', type=str, default='test', metavar='N', 61 | help='-') 62 | parser.add_argument('--batch-size', type=int, default=32, metavar='N', 63 | help='input batch size for training (default: 100)') 64 | parser.add_argument('--micro-batch-size', type=int, default=8, metavar='N', 65 | help='input micro batch size for training (default: 100)') 66 | parser.add_argument('--lr', type=float, default=0.01, metavar='N', 67 | help='-') 68 | parser.add_argument('--num-iters', type=int, default=10, metavar='N', 69 | help='-') 70 | 71 | 72 | def add_mixed_precision_arguments(parser): 73 | parser.add_argument('--fp16', action='store_true', 74 | help='Run model in fp16 mode.') 75 | parser.add_argument('--loss-scale', type=float, default=0, 76 | help='Static loss scaling, positive power of 2 values can improve fp16 convergence. ') 77 | parser.add_argument('--initial-loss-scale', type=float, default=32768, 78 | help='Initial loss-scale for dynamic loss scaling.') 79 | parser.add_argument('--min-loss-scale', type=float, default=1.0, 80 | help='Minimum loss scale for dynamic loss scale.') 81 | parser.add_argument('--loss-scale-window', type=float, default=1000, 82 | help='Window over which to raise/lower dynamic scale.') 83 | parser.add_argument('--hysteresis', type=int, default=2, 84 | help='hysteresis for dynamic loss scaling') 85 | parser.add_argument('--use-offload', action='store_true', 86 | help='Offload optim states to CPU') 87 | 88 | 89 | 90 | def add_parallel_schema_arguments(parser): 91 | parser.add_argument('--pp-mode', type=str, default='gpipe', metavar='S', 92 | help='use which pipeline parallel mode: gpipe or 1f1b.') 93 | parser.add_argument('--dp-mode', type=str, default='allreduce', metavar='S', 94 | help='use which data parallel mode: allreduce.') 95 | parser.add_argument('--gradient-accumulate-step', type=int, default=1, 96 | help='Number of gradient computation in Pipeline without data parallel sync.') 97 | 98 | 99 | def get_model_arguments_str(args): 100 | return '_l' + str(args.seq_length) + '_m' + str(args.embedding_dim) 101 | 102 | 103 | def get_dist_arguments_str(args, add_rank=True): 104 | dist_str = '_w' + str(args.world_size) + '_p' + str(args.pipeline_group_size) + "_" + \ 105 | str(args.gradient_accumulate_step) + '_d' + str(args.data_group_size) 106 | if add_rank: 107 | dist_str = dist_str + '_' + str(args.rank) 108 | return dist_str 109 | 110 | 111 | def get_learning_arguments_str(args): 112 | return '_b' + str(args.batch_size) + '_' + str(args.micro_batch_size) 113 | 114 | 115 | def get_mixed_precision_arguments_str(args): 116 | if args.fp16: 117 | return '_fp16' 
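        # Illustrative composition (assumed values): with seq_length=2048, embedding_dim=4096,
        # world_size=8, pipeline_group_size=8, gradient_accumulate_step=1, data_group_size=1,
        # rank=0, batch_size=64, micro_batch_size=8 and --fp16, the *_arguments_str helpers
        # in this module yield '_l2048_m4096', '_w8_p8_1_d1_0', '_b64_8' and '_fp16'.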
118 | else: 119 | return '' 120 | -------------------------------------------------------------------------------- /training/utils/dist_checkpoint_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import json 5 | import numpy as np 6 | import torch 7 | 8 | from comm.comm_utils import * 9 | 10 | 11 | def load_checkpoint(pipe, args): 12 | 13 | if os.path.isfile(os.path.join(args.checkpoint_path, 'latest')): 14 | with open(os.path.join(args.checkpoint_path, 'latest')) as f: 15 | latest_step = int(f.read()) 16 | else: 17 | print('no checkpoint available, skipping') 18 | return 19 | 20 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 21 | 22 | try: 23 | with open(os.path.join(checkpoint_step_path, 'meta.json')) as f: 24 | meta = json.load(f) 25 | except: 26 | print('failed to load meta.') 27 | 28 | pipe.global_step = latest_step 29 | 30 | try: 31 | pipe.model.model.load_state_dict( 32 | torch.load( 33 | os.path.join( 34 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' 35 | ), map_location=torch.device('cpu') 36 | ) 37 | ) 38 | except: 39 | print('failed to load model params.') 40 | 41 | try: 42 | pipe.optimizer.load_state_dict( 43 | torch.load( 44 | os.path.join( 45 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' 46 | ), map_location=torch.device('cpu') 47 | ) 48 | ) 49 | except: 50 | print('failed to load optim states.') 51 | 52 | try: 53 | pipe.scheduler.load_state_dict( 54 | torch.load( 55 | os.path.join( 56 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' 57 | ) 58 | ) 59 | ) 60 | except: 61 | print('failed to load scheduler states.') 62 | 63 | 64 | def save_checkpoint(pipe, args) -> str: 65 | 66 | latest_step = pipe.global_step 67 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 68 | 69 | os.makedirs(checkpoint_step_path, exist_ok=True) 70 | 71 | print(f"Saving checkpoint to {checkpoint_step_path} ...") 72 | 73 | torch.save( 74 | pipe.model.model.state_dict(), 75 | os.path.join( 76 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_checkpoint.pt' 77 | ) 78 | ) 79 | 80 | torch.save( 81 | pipe.optimizer.state_dict(), 82 | os.path.join( 83 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_optimizer.pt' 84 | ) 85 | ) 86 | 87 | torch.save( 88 | pipe.scheduler.state_dict(), 89 | os.path.join( 90 | checkpoint_step_path, f'prank_{get_pipeline_parallel_rank()}_scheduler.pt' 91 | ) 92 | ) 93 | 94 | with open(os.path.join(checkpoint_step_path, 'meta.json'), 'w') as f: 95 | json.dump({ 96 | 'step': latest_step, 97 | }, f) 98 | 99 | with open(os.path.join(args.checkpoint_path, 'latest'), 'w') as f: 100 | f.write(f"{latest_step}") 101 | 102 | print(f"Checkpoint saved to {checkpoint_step_path} ... 
Done") 103 | 104 | return checkpoint_step_path 105 | 106 | 107 | def save_stream_dataloader_state_dict(dataloader, pipe, args): 108 | 109 | latest_step = pipe.global_step 110 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 111 | 112 | os.system(f"mkdir -p {checkpoint_step_path}") 113 | 114 | torch.save( 115 | dataloader.dataset.state_dict(), 116 | os.path.join( 117 | checkpoint_step_path, f'dataset_state_dict.pt' 118 | ) 119 | ) 120 | 121 | def load_stream_dataloader_state_dict(dataloader, pipe, args): 122 | 123 | latest_step = pipe.global_step 124 | checkpoint_step_path = os.path.join(args.checkpoint_path, f"checkpoint_{latest_step}") 125 | 126 | try: 127 | state_dict = torch.load( 128 | os.path.join( 129 | checkpoint_step_path, f'dataset_state_dict.pt' 130 | ) 131 | ) 132 | 133 | dataloader.data.load_state_dict(state_dict) 134 | 135 | except Exception as e: 136 | 137 | print('failed to load dataset state_dict.') -------------------------------------------------------------------------------- /training/utils/dist_debug_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def print_cuda_memory(args, info: str, device=None): 5 | if args.debug_mem: 6 | if device is None: 7 | device = torch.device('cuda', args.cuda_id) 8 | print("<{}>: current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format( 9 | info, torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) 10 | 11 | 12 | def print_multi_cuda_memory(args, info: str): 13 | if args.debug_mem: 14 | for local_gpu_rank in range(args.cuda_num): 15 | device = torch.device('cuda', local_gpu_rank) 16 | print("<{}>({}): current memory allocated: {:2.3f} MB, peak memory: {:2.3f} MB".format(info, local_gpu_rank, 17 | torch.cuda.memory_allocated(device)/1048576, torch.cuda.max_memory_allocated(device)/1048576)) 18 | -------------------------------------------------------------------------------- /training/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | try: 4 | import wandb 5 | _has_wandb = True 6 | except: 7 | _has_wandb = False 8 | print("wandb is not installed.") 9 | 10 | try: 11 | import loguru 12 | _has_loguru = True 13 | except: 14 | _has_loguru = False 15 | print("loguru is not installed.") 16 | 17 | train_log_backend = None 18 | 19 | def init_train_logger(args): 20 | 21 | global train_log_backend 22 | train_log_backend = getattr(args, 'train_log_backend', 'print') 23 | 24 | if train_log_backend == 'print': 25 | pass 26 | elif train_log_backend == 'loguru': 27 | os.system("mkdir -p logs") 28 | loguru.logger.add("logs/file_{time}.log") 29 | elif train_log_backend == 'wandb': 30 | 31 | assert _has_wandb 32 | 33 | if not hasattr(args, 'project_name'): 34 | import re 35 | args.project_name = "test-" + \ 36 | re.sub('[^a-zA-Z0-9 \n\.]', '_', args.task_name) 37 | 38 | wandb.init( 39 | project=args.project_name, 40 | config=args, 41 | ) 42 | 43 | else: 44 | raise Exception('Unknown logging backend.') 45 | 46 | def train_log(x, *args, **kargs): 47 | 48 | if train_log_backend == 'print': 49 | print(x) 50 | elif train_log_backend == 'loguru': 51 | loguru.logger.info(x) 52 | elif train_log_backend == 'wandb': 53 | wandb.log(x, *args, **kargs) 54 | else: 55 | raise Exception('Unknown logging backend.') 56 | 57 | --------------------------------------------------------------------------------