├── .hydra ├── config.yaml ├── hydra.yaml └── overrides.yaml ├── .vscode └── launch.json ├── README.md ├── main.log ├── minGPT-ddp ├── README.md ├── mingpt │ ├── .hydra │ │ ├── config.yaml │ │ ├── hydra.yaml │ │ └── overrides.yaml │ ├── __pycache__ │ │ ├── char_dataset.cpython-38.pyc │ │ ├── model.cpython-38.pyc │ │ └── trainer.cpython-38.pyc │ ├── char_dataset.py │ ├── data │ │ └── input.txt │ ├── gpt2_train_cfg.yaml │ ├── main.log │ ├── main.py │ ├── model.py │ └── trainer.py └── requirements.txt ├── multi_gpu.py ├── multi_gpu_torchrun.py └── single_gpu.py /.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt/data/input.txt 3 | block_size: 128 4 | train_split: 0.9 5 | truncate: 0.02 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: true 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | -------------------------------------------------------------------------------- /.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | sweep: 5 | dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} 6 | subdir: ${hydra.job.num} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | params: null 13 | help: 14 | app_name: ${hydra.job.name} 15 | header: '${hydra.help.app_name} is powered by Hydra. 16 | 17 | ' 18 | footer: 'Powered by Hydra (https://hydra.cc) 19 | 20 | Use --hydra-help to view Hydra specific help 21 | 22 | ' 23 | template: '${hydra.help.header} 24 | 25 | == Configuration groups == 26 | 27 | Compose your configuration from those groups (group=option) 28 | 29 | 30 | $APP_CONFIG_GROUPS 31 | 32 | 33 | == Config == 34 | 35 | Override anything in the config (foo.bar=value) 36 | 37 | 38 | $CONFIG 39 | 40 | 41 | ${hydra.help.footer} 42 | 43 | ' 44 | hydra_help: 45 | template: 'Hydra (${hydra.runtime.version}) 46 | 47 | See https://hydra.cc for more info. 48 | 49 | 50 | == Flags == 51 | 52 | $FLAGS_HELP 53 | 54 | 55 | == Configuration groups == 56 | 57 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 58 | to command line) 59 | 60 | 61 | $HYDRA_CONFIG_GROUPS 62 | 63 | 64 | Use ''--cfg hydra'' to Show the Hydra config. 65 | 66 | ' 67 | hydra_help: ??? 
68 | hydra_logging: 69 | version: 1 70 | formatters: 71 | simple: 72 | format: '[%(asctime)s][HYDRA] %(message)s' 73 | handlers: 74 | console: 75 | class: logging.StreamHandler 76 | formatter: simple 77 | stream: ext://sys.stdout 78 | root: 79 | level: INFO 80 | handlers: 81 | - console 82 | loggers: 83 | logging_example: 84 | level: DEBUG 85 | disable_existing_loggers: false 86 | job_logging: 87 | version: 1 88 | formatters: 89 | simple: 90 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 91 | handlers: 92 | console: 93 | class: logging.StreamHandler 94 | formatter: simple 95 | stream: ext://sys.stdout 96 | file: 97 | class: logging.FileHandler 98 | formatter: simple 99 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 100 | root: 101 | level: INFO 102 | handlers: 103 | - console 104 | - file 105 | disable_existing_loggers: false 106 | env: {} 107 | mode: RUN 108 | searchpath: [] 109 | callbacks: {} 110 | output_subdir: .hydra 111 | overrides: 112 | hydra: 113 | - hydra.mode=RUN 114 | task: [] 115 | job: 116 | name: main 117 | chdir: null 118 | override_dirname: '' 119 | id: ??? 120 | num: ??? 121 | config_name: gpt2_train_cfg 122 | env_set: {} 123 | env_copy: [] 124 | config: 125 | override_dirname: 126 | kv_sep: '=' 127 | item_sep: ',' 128 | exclude_keys: [] 129 | runtime: 130 | version: 1.3.2 131 | version_base: '1.3' 132 | cwd: /home/tim/桌面/git/ddp-tutorial-series 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | output_dir: /home/tim/桌面/git/ddp-tutorial-series 144 | choices: 145 | hydra/env: default 146 | hydra/callbacks: null 147 | hydra/job_logging: default 148 | hydra/hydra_logging: default 149 | hydra/hydra_help: default 150 | hydra/help: default 151 | hydra/sweeper: basic 152 | hydra/launcher: basic 153 | hydra/output: default 154 | verbose: false 155 | -------------------------------------------------------------------------------- /.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: torchrun", 6 | "type": "python", 7 | "request": "launch", 8 | // 设置 program 的路径为 torchrun 脚本对应的绝对路径 9 | "program": "/usr/local/anaconda3/envs/torch-2.0.1-cu117-py39/lib/python3.9/site-packages/torch/distributed/run.py", 10 | // 设置 torchrun 命令的参数 11 | "args":[ 12 | "--standalone", 13 | "--nproc_per_node=gpu", 14 | "multi_gpu_torchrun.py" 15 | ], 16 | "console": "integratedTerminal", 17 | "justMyCode": true, 18 | "env": { 19 | "CUDA_VISIBLE_DEVICES": "5, 6" 20 | }, 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://api.gitsponsors.com/api/badge/link?p=KoJzzoI5V0U5cxCo3lEj9srdIMTw7IoiFPToGomZRFp9HNMVxC2tGRy4n5Chm6M03jA9RjbezjCIKyoQVFxp7yN3+IexpNGKeaLHWoqwrp/6C6BjFgQf7A9QnfnJcs9D) 2 | # ddp-tutorial-series 3 | Follow the [pytorch official tutorial](https://pytorch.org/tutorials/beginner/ddp_series_intro.html?utm_source=youtube&utm_medium=organic_social&utm_campaign=tutorial) to 
learn how to use `nn.parallel.DistributedDataParallel` to speed up training 4 | 5 | # distributed-pytorch 6 | 7 | Code for the DDP tutorial series at https://pytorch.org/tutorials/beginner/ddp_series_intro.html 8 | 9 | Each code file extends upon the previous one. The series starts with a non-distributed script that runs on a single GPU and incrementally updates to end with multinode training on a Slurm cluster. 10 | 11 | ## Files 12 | 13 | - [single_gpu.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/single_gpu.py): Non-distributed training script 14 | - [multigpu.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py): DDP on a single node 15 | - [multigpu_torchrun.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py): DDP on a single node using Torchrun 16 | - minGPT-ddp: training a GPT-like model (from the minGPT repo [https://github.com/karpathy/minGPT](https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbGo2ZjQtMzFlQ2pJQmV6VV9yenFpdmlXVGItd3xBQ3Jtc0tueWdxVFZsYTNPRXFTSW5xejhUajZ1OVYydjNraENoZzNka05ZLWMtZXJkM1VjaFd5cENUMld0TEc5N3VkRFV2bzM2aWdvWVRjTU01TmFfZE9mdXVBTFczWDJZMnU2TjA4Z0tCd25LX2sxOFJLMWtsMA&q=https%3A%2F%2Fgithub.com%2Fkarpathy%2FminGPT&v=XFsFDGKZHh4)) with DDP. 17 | 18 | 19 | 20 | ## 我的笔记 21 | 22 | - [Pytorch 多卡并行(1)—— 原理简介和 DDP 并行实践](https://blog.csdn.net/wxc971231/article/details/132816104) 23 | 24 | - [Pytorch 多卡并行(2)—— 使用 torchrun 进行容错处理](https://blog.csdn.net/wxc971231/article/details/132827787) 25 | 26 | - [Pytorch 多卡并行(3)—— 使用 DDP 加速 minGPT 训练](https://blog.csdn.net/wxc971231/article/details/132829661) 27 | 28 | -------------------------------------------------------------------------------- /main.log: -------------------------------------------------------------------------------- 1 | [2023-09-12 18:52:52,922][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 2 | [2023-09-12 18:52:52,924][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 3 | [2023-09-12 18:53:56,746][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 4 | [2023-09-12 18:53:56,747][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 5 | [2023-09-12 18:55:25,877][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 6 | [2023-09-12 18:55:25,878][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 7 | [2023-09-12 18:56:10,506][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 8 | [2023-09-12 18:56:10,507][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 9 | [2023-09-12 18:56:14,516][root][INFO] - Reducer buckets have been rebuilt in this iteration. 10 | [2023-09-12 18:59:06,955][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 11 | [2023-09-12 18:59:06,956][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 12 | [2023-09-12 18:59:11,087][root][INFO] - Reducer buckets have been rebuilt in this iteration. 
13 | -------------------------------------------------------------------------------- /minGPT-ddp/README.md: -------------------------------------------------------------------------------- 1 | # minGPT-DDP 2 | 3 | Code accompanying the tutorial at https://pytorch.org/tutorials/intermediate/ddp_series_minGPT.html for training a GPT-like model with Distributed Data Parallel (DDP) in PyTorch. 4 | 5 | Files marked with an asterisk (*) are adapted from the minGPT repo (https://github.com/karpathy/minGPT). 6 | 7 | - [trainer.py](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/trainer.py) includes the Trainer class that runs the distributed training iterations on the model with the provided dataset. 8 | - [model.py *](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/model.py) defines the model architecture. 9 | - [char_dataset.py *](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/char_dataset.py) contains the `Dataset`class for a character-level dataset. 10 | - [gpt2_train_cfg.yaml](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/gpt2_train_cfg.yaml) contains the configurations for data, model, optimizer and training run. 11 | - [main.py](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/main.py) is the entry point to the trainig job. It sets up the DDP process group, reads all the configurations and runs the training job. 12 | - [slurm/](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/slurm) contains files for setting up an AWS cluster and the slurm script to run multinode training. -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: ./data/input.txt 3 | block_size: 128 4 | train_split: 0.9 5 | truncate: 0.02 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: true 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | sweep: 5 | dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} 6 | subdir: ${hydra.job.num} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | params: null 13 | help: 14 | app_name: ${hydra.job.name} 15 | header: '${hydra.help.app_name} is powered by Hydra. 16 | 17 | ' 18 | footer: 'Powered by Hydra (https://hydra.cc) 19 | 20 | Use --hydra-help to view Hydra specific help 21 | 22 | ' 23 | template: '${hydra.help.header} 24 | 25 | == Configuration groups == 26 | 27 | Compose your configuration from those groups (group=option) 28 | 29 | 30 | $APP_CONFIG_GROUPS 31 | 32 | 33 | == Config == 34 | 35 | Override anything in the config (foo.bar=value) 36 | 37 | 38 | $CONFIG 39 | 40 | 41 | ${hydra.help.footer} 42 | 43 | ' 44 | hydra_help: 45 | template: 'Hydra (${hydra.runtime.version}) 46 | 47 | See https://hydra.cc for more info. 
48 | 49 | 50 | == Flags == 51 | 52 | $FLAGS_HELP 53 | 54 | 55 | == Configuration groups == 56 | 57 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 58 | to command line) 59 | 60 | 61 | $HYDRA_CONFIG_GROUPS 62 | 63 | 64 | Use ''--cfg hydra'' to Show the Hydra config. 65 | 66 | ' 67 | hydra_help: ??? 68 | hydra_logging: 69 | version: 1 70 | formatters: 71 | simple: 72 | format: '[%(asctime)s][HYDRA] %(message)s' 73 | handlers: 74 | console: 75 | class: logging.StreamHandler 76 | formatter: simple 77 | stream: ext://sys.stdout 78 | root: 79 | level: INFO 80 | handlers: 81 | - console 82 | loggers: 83 | logging_example: 84 | level: DEBUG 85 | disable_existing_loggers: false 86 | job_logging: 87 | version: 1 88 | formatters: 89 | simple: 90 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 91 | handlers: 92 | console: 93 | class: logging.StreamHandler 94 | formatter: simple 95 | stream: ext://sys.stdout 96 | file: 97 | class: logging.FileHandler 98 | formatter: simple 99 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 100 | root: 101 | level: INFO 102 | handlers: 103 | - console 104 | - file 105 | disable_existing_loggers: false 106 | env: {} 107 | mode: RUN 108 | searchpath: [] 109 | callbacks: {} 110 | output_subdir: .hydra 111 | overrides: 112 | hydra: 113 | - hydra.mode=RUN 114 | task: [] 115 | job: 116 | name: main 117 | chdir: null 118 | override_dirname: '' 119 | id: ??? 120 | num: ??? 121 | config_name: gpt2_train_cfg 122 | env_set: {} 123 | env_copy: [] 124 | config: 125 | override_dirname: 126 | kv_sep: '=' 127 | item_sep: ',' 128 | exclude_keys: [] 129 | runtime: 130 | version: 1.3.2 131 | version_base: '1.3' 132 | cwd: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | output_dir: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 144 | choices: 145 | hydra/env: default 146 | hydra/callbacks: null 147 | hydra/job_logging: default 148 | hydra/hydra_logging: default 149 | hydra/hydra_help: default 150 | hydra/help: default 151 | hydra/sweeper: basic 152 | hydra/launcher: basic 153 | hydra/output: default 154 | verbose: false 155 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/char_dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/char_dataset.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/trainer.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/trainer.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/char_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import fsspec 4 | from dataclasses import dataclass 5 | 6 | """ 7 | Adapted from https://github.com/karpathy/minGPT/blob/master/projects/chargpt/chargpt.py 8 | """ 9 | 10 | @dataclass 11 | class DataConfig: 12 | path: str = None 13 | block_size: int = None # 输入序列长度 14 | train_split: float = None # 训练集和测试集划分 15 | truncate: float = 1.0 # 用于训练的数据占全体数据的比例 16 | 17 | class CharDataset(Dataset): 18 | 19 | def __init__(self, data_cfg: DataConfig): #data_path: str, block_size): 20 | # 加载所需比例的数据 21 | data = fsspec.open(data_cfg.path).open().read().decode('utf-8') 22 | data = data[ : int(len(data) * data_cfg.truncate)] 23 | 24 | # Set 去重,转 list 后排序得到数据集中的唯一字符列表作为词表 25 | chars = sorted(list(set(data))) 26 | data_size, vocab_size = len(data), len(chars) 27 | print('Data has %d characters, %d unique.' % (data_size, vocab_size)) 28 | 29 | # 得到字符和词表索引之间的双射 30 | self.stoi = {ch: i for i, ch in enumerate(chars)} # 字符 -> 词表索引 31 | self.itos = {i: ch for i, ch in enumerate(chars)} # 词表索引 -> 字符 32 | 33 | self.block_size = data_cfg.block_size 34 | self.vocab_size = vocab_size 35 | self.data = data 36 | 37 | def __len__(self): 38 | return len(self.data) - self.block_size 39 | 40 | def __getitem__(self, idx): 41 | # grab a chunk of (block_size + 1) characters from the data 42 | chunk = self.data[idx:idx + self.block_size + 1] 43 | 44 | # encode every character to an integer 45 | dix = [self.stoi[s] for s in chunk] 46 | x = torch.tensor(dix[:-1], dtype=torch.long) 47 | y = torch.tensor(dix[1:], dtype=torch.long) 48 | return x, y 49 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/gpt2_train_cfg.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt/data/input.txt 3 | block_size: 128 # 输入序列长度 4 | train_split: 0.9 # 训练集和测试集划分 5 | truncate: 0.02 # 只用5%的数据进行训练 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: True 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | 22 | hydra: 23 | run: 24 | dir: ./ 25 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/main.log: -------------------------------------------------------------------------------- 1 | [2023-09-11 17:00:40,135][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 2 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 3 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 
4 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 5 | [2023-09-11 17:00:43,432][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 6 | [2023-09-11 17:00:43,433][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 7 | [2023-09-11 18:01:07,588][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 8 | [2023-09-11 18:01:07,596][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 9 | [2023-09-11 18:01:07,596][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 10 | [2023-09-11 18:01:07,599][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 11 | [2023-09-11 18:01:11,054][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 12 | [2023-09-11 18:01:11,054][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 13 | [2023-09-11 18:02:31,321][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 14 | [2023-09-11 18:02:31,321][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 15 | [2023-09-11 18:02:31,322][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 16 | [2023-09-11 18:02:31,331][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 17 | [2023-09-11 18:02:34,753][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 18 | [2023-09-11 18:02:34,753][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 19 | [2023-09-11 18:06:18,569][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 20 | [2023-09-11 18:06:18,578][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 21 | [2023-09-11 18:06:18,578][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 22 | [2023-09-11 18:06:18,580][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 23 | [2023-09-11 18:06:21,444][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 24 | [2023-09-11 18:06:21,446][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 25 | [2023-09-11 18:08:44,508][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 26 | [2023-09-11 18:08:44,517][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 27 | [2023-09-11 18:08:44,517][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 28 | [2023-09-11 18:08:44,518][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 
29 | [2023-09-11 18:08:47,320][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 30 | [2023-09-11 18:08:47,320][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 31 | [2023-09-11 18:09:30,301][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 32 | [2023-09-11 18:09:30,306][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 33 | [2023-09-11 18:09:30,307][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 34 | [2023-09-11 18:09:30,311][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 35 | [2023-09-11 18:09:33,182][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 36 | [2023-09-11 18:09:33,182][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 37 | [2023-09-11 18:31:37,555][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 38 | [2023-09-11 18:31:37,555][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 39 | [2023-09-11 18:31:37,556][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 40 | [2023-09-11 18:31:37,556][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 41 | [2023-09-11 18:31:40,494][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 42 | [2023-09-11 18:31:40,495][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 43 | [2023-09-11 18:32:07,745][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 44 | [2023-09-11 18:32:07,755][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 45 | [2023-09-11 18:32:07,755][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 46 | [2023-09-11 18:32:07,756][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 47 | [2023-09-11 18:32:10,667][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 48 | [2023-09-11 18:32:10,667][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 49 | [2023-09-11 18:42:41,048][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 50 | [2023-09-11 18:42:41,048][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 51 | [2023-09-11 18:42:41,049][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 52 | [2023-09-11 18:42:41,058][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 53 | [2023-09-11 18:42:44,410][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 
54 | [2023-09-11 18:42:44,411][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 55 | [2023-09-11 18:44:57,544][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 56 | [2023-09-11 18:44:57,548][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 57 | [2023-09-11 18:44:57,548][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 58 | [2023-09-11 18:44:57,555][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 59 | [2023-09-11 18:45:01,088][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 60 | [2023-09-11 18:45:01,088][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 61 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 62 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 63 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 64 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 65 | [2023-09-11 18:49:36,698][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 66 | [2023-09-11 18:49:36,698][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 67 | [2023-09-12 18:29:46,692][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 68 | [2023-09-12 18:29:46,692][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 69 | [2023-09-12 18:32:47,875][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 70 | [2023-09-12 18:32:47,875][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 71 | [2023-09-12 18:34:07,827][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 72 | [2023-09-12 18:34:07,827][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 73 | [2023-09-12 18:34:11,181][root][INFO] - Reducer buckets have been rebuilt in this iteration. 
74 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import random_split 4 | from torch.distributed import init_process_group, destroy_process_group 5 | from model import GPT, GPTConfig, OptimizerConfig, create_optimizer 6 | from trainer import Trainer, TrainerConfig 7 | from char_dataset import CharDataset, DataConfig 8 | from omegaconf import DictConfig 9 | import hydra 10 | 11 | 12 | def ddp_setup(): 13 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 localhost 14 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 15 | init_process_group(backend="nccl") 16 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 17 | 18 | def get_train_objs(gpt_cfg: GPTConfig, opt_cfg: OptimizerConfig, data_cfg: DataConfig): 19 | dataset = CharDataset(data_cfg) 20 | train_len = int(len(dataset) * data_cfg.train_split) 21 | train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len]) 22 | 23 | gpt_cfg.vocab_size = dataset.vocab_size 24 | gpt_cfg.block_size = dataset.block_size 25 | model = GPT(gpt_cfg) 26 | optimizer = create_optimizer(model, opt_cfg) 27 | 28 | return model, optimizer, train_set, test_set 29 | 30 | @hydra.main(version_base=None, config_path=".", config_name="gpt2_train_cfg") 31 | def main(cfg: DictConfig): 32 | # 初始化进程池 33 | ddp_setup() 34 | 35 | # 从 yaml 文件读取超参数 36 | gpt_cfg = GPTConfig(**cfg['gpt_config']) 37 | opt_cfg = OptimizerConfig(**cfg['optimizer_config']) 38 | data_cfg = DataConfig(**cfg['data_config']) 39 | trainer_cfg = TrainerConfig(**cfg['trainer_config']) 40 | 41 | # 创建训练对象 42 | model, optimizer, train_data, test_data = get_train_objs(gpt_cfg, opt_cfg, data_cfg) 43 | trainer = Trainer(trainer_cfg, model, optimizer, train_data, test_data) 44 | 45 | # 开始训练 46 | trainer.train() 47 | 48 | # 训练完成后,销毁进程池 49 | destroy_process_group() 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | 55 | ''' 56 | 运行命令: 57 | CUDA_VISIBLE_DEVICES=1,2 torchrun --standalone --nproc_per_node=gpu main.py 58 | ''' -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full definition of a GPT Language Model, all of it in this single file. 3 | Adapted from https://github.com/karpathy/minGPT/blob/master/mingpt/model.py 4 | """ 5 | 6 | from dataclasses import dataclass 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | @dataclass 13 | class GPTConfig: 14 | model_type: str = 'gpt2' 15 | # model configurations 16 | n_layer: int = None 17 | n_head: int = None 18 | n_embd: int = None 19 | # openai's values for gpt2 20 | vocab_size: int = 50257 21 | block_size: int = 1024 22 | # dropout hyperparameters 23 | embd_pdrop: float = 0.1 24 | resid_pdrop: float = 0.1 25 | attn_pdrop: float = 0.1 26 | 27 | @dataclass 28 | class OptimizerConfig: 29 | learning_rate: float = 3e-4 30 | weight_decay: float = 0.1 31 | 32 | class MultiheadAttentionLayer(nn.Module): 33 | """ 34 | A multi-head masked self-attention layer with a projection at the end. 
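    The causal mask below is registered with register_buffer, so it moves with the
    module across devices and is saved in the state dict without being trained,
    while attention itself is delegated to torch.nn.MultiheadAttention with
    batch_first=True, i.e. inputs of shape (batch, seq_len, n_embd). Keep in mind
    that MultiheadAttention treats a float attn_mask additively (it is added to the
    attention scores) and a bool mask as marking disallowed positions, so the dtype
    of the registered mask determines how the masking behaves.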
35 | """ 36 | 37 | def __init__(self, config, device="cpu", dtype=torch.float32): 38 | super().__init__() 39 | assert config.n_embd % config.n_head == 0 40 | self.resid_drop = nn.Dropout(config.resid_pdrop) 41 | 42 | # output projection 43 | self.c_proj = nn.Linear(config.n_embd, config.n_embd, device=device, dtype=dtype) 44 | 45 | # Causal mask。注意这个mask是通过 self.register_buffer 方法登记的 46 | # 这样登记过的张量可以求梯度也可以随模型在 CPU/GPU 之间移动,但是不进行参数优化 47 | self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)) 48 | .view(1, 1, config.block_size, config.block_size)) 49 | 50 | self.attn = torch.nn.MultiheadAttention( 51 | embed_dim=config.n_embd, 52 | num_heads=config.n_head, 53 | dropout=config.attn_pdrop, 54 | batch_first=True, 55 | device=device, 56 | dtype=dtype 57 | ) 58 | 59 | def forward(self, x): 60 | _, seq_size, _ = x.size() # batch size, sequence length, embedding dimensionality (n_embd) 61 | y = self.attn(x, x, x, attn_mask=self.mask[0, 0, :seq_size, :seq_size])[0] 62 | y = self.resid_drop(self.c_proj(y)) 63 | return y 64 | 65 | class Block(nn.Module): 66 | """ an unassuming Transformer block """ 67 | def __init__(self, config: GPTConfig): 68 | super().__init__() 69 | self.ln1 = nn.LayerNorm(config.n_embd) 70 | self.ln2 = nn.LayerNorm(config.n_embd) 71 | self.attn = MultiheadAttentionLayer(config) 72 | self.mlp = nn.Sequential( 73 | nn.Linear(config.n_embd, 4 * config.n_embd), 74 | nn.GELU(), 75 | nn.Linear(4 * config.n_embd, config.n_embd), 76 | nn.Dropout(config.resid_pdrop), 77 | ) 78 | 79 | def forward(self, x): 80 | x = x + self.attn(self.ln1(x)) 81 | x = x + self.mlp(self.ln2(x)) 82 | return x 83 | 84 | class EmbeddingStem(nn.Module): 85 | def __init__(self, config: GPTConfig, device="cpu", dtype=torch.float32): 86 | super().__init__() 87 | self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd, device=device, dtype=dtype) 88 | self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd, device=device, dtype=dtype)) 89 | self.drop = nn.Dropout(config.embd_pdrop) 90 | self.block_size = config.block_size 91 | 92 | def reset_parameters(self): 93 | self.tok_emb.reset_parameters() # 将 nn.Embedding 层参数初始化为正态分布采样 94 | 95 | def forward(self, idx): 96 | b, t = idx.size() 97 | assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}" 98 | 99 | token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) embedding vector 100 | position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) position vector 101 | return self.drop(token_embeddings + position_embeddings) 102 | 103 | class GPT(nn.Module): 104 | """ GPT Language Model """ 105 | 106 | def __init__(self, config: GPTConfig): 107 | super().__init__() 108 | self.block_size = config.block_size 109 | config = self._set_model_config(config) 110 | 111 | # input embedding stem 112 | self.emb_stem = EmbeddingStem(config) 113 | # transformer 114 | self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)]) 115 | # decoder head 116 | self.ln_f = nn.LayerNorm(config.n_embd) 117 | self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) 118 | 119 | # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper 120 | self.apply(self._init_weights) 121 | for pn, p in self.named_parameters(): 122 | if pn.endswith('c_proj.weight'): 123 | p.data.normal_(mean=0.0, std=0.02/math.sqrt(2 * config.n_layer)) 124 | 125 | # report number of parameters (note we don't count 
the decoder parameters in lm_head) 126 | n_params = sum(p.numel() for p in self.blocks.parameters()) 127 | print("number of parameters: %.2fM" % (n_params/1e6,)) 128 | 129 | def _set_model_config(self, config): 130 | type_given = config.model_type is not None 131 | params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None]) 132 | # assert type_given ^ params_given # exactly one of these (XOR) 133 | if type_given and not params_given: 134 | # translate from model_type to detailed configuration 135 | config.__dict__.update({ 136 | # names follow the huggingface naming conventions 137 | # GPT-1 138 | 'openai-gpt': dict(n_layer=12, n_head=12, n_embd=768), # 117M params 139 | # GPT-2 configs 140 | 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params 141 | 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params 142 | 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params 143 | 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params 144 | # Gophers 145 | 'gopher-44m': dict(n_layer=8, n_head=16, n_embd=512), 146 | # (there are a number more...) 147 | # I made these tiny models up 148 | 'gpt-mini': dict(n_layer=6, n_head=6, n_embd=192), 149 | 'gpt-micro': dict(n_layer=4, n_head=4, n_embd=128), 150 | 'gpt-nano': dict(n_layer=3, n_head=3, n_embd=48), 151 | }[config.model_type]) 152 | return config 153 | 154 | def _init_weights(self, module): 155 | if isinstance(module, (nn.Linear, nn.Embedding)): 156 | module.weight.data.normal_(mean=0.0, std=0.02) 157 | if isinstance(module, nn.Linear) and module.bias is not None: 158 | module.bias.data.zero_() 159 | elif isinstance(module, nn.LayerNorm): 160 | module.bias.data.zero_() 161 | module.weight.data.fill_(1.0) 162 | 163 | def forward(self, idx, targets=None): 164 | x = self.emb_stem(idx) 165 | x = self.blocks(x) 166 | x = self.ln_f(x) 167 | logits = self.head(x) 168 | 169 | # if we are given some desired targets also calculate the loss 170 | loss = None 171 | if targets is not None: 172 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) 173 | 174 | return logits, loss 175 | 176 | @torch.no_grad() 177 | def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None): 178 | """ 179 | Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete 180 | the sequence max_new_tokens times, feeding the predictions back into the model each time. 181 | Most likely you'll want to make sure to be in model.eval() mode of operation for this. 
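        A minimal usage sketch (hypothetical: `dataset` is assumed to be a CharDataset,
        `device` a CUDA device, and `model` an unwrapped GPT instance; none of these are
        defined in this file, and the prompt characters must appear in the training data):

            context = torch.tensor([[dataset.stoi[c] for c in "O God, O God!"]],
                                   dtype=torch.long, device=device)
            out = model.generate(context, max_new_tokens=200, do_sample=True, top_k=10)
            print(''.join(dataset.itos[int(i)] for i in out[0]))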
182 | """ 183 | for _ in range(max_new_tokens): 184 | # if the sequence context is growing too long we must crop it at block_size 185 | idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:] 186 | # forward the model to get the logits for the index in the sequence 187 | logits, _ = self(idx_cond) 188 | # pluck the logits at the final step and scale by desired temperature 189 | logits = logits[:, -1, :] / temperature 190 | # optionally crop the logits to only the top k options 191 | if top_k is not None: 192 | v, _ = torch.topk(logits, top_k) 193 | logits[logits < v[:, [-1]]] = -float('Inf') 194 | # apply softmax to convert logits to (normalized) probabilities 195 | probs = F.softmax(logits, dim=-1) 196 | # either sample from the distribution or take the most likely element 197 | if do_sample: 198 | idx_next = torch.multinomial(probs, num_samples=1) 199 | else: 200 | _, idx_next = torch.topk(probs, k=1, dim=-1) 201 | # append sampled index to the running sequence and continue 202 | idx = torch.cat((idx, idx_next), dim=1) 203 | 204 | return idx 205 | 206 | 207 | def create_optimizer(model: torch.nn.Module, opt_config: OptimizerConfig): 208 | """ 209 | This long function is unfortunately doing something very simple and is being very defensive: 210 | We are separating out all parameters of the model into two buckets: those that will experience 211 | weight decay for regularization and those that won't (biases, and layernorm/embedding weights). 212 | We are then returning the PyTorch optimizer object. 213 | """ 214 | 215 | # separate out all parameters to those that will and won't experience regularizing weight decay 216 | decay = set() 217 | no_decay = set() 218 | whitelist_weight_modules = (torch.nn.Linear, ) 219 | blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) 220 | for mn, m in model.named_modules(): 221 | for pn, p in m.named_parameters(): 222 | fpn = '%s.%s' % (mn, pn) if mn else pn # full param name 223 | # random note: because named_modules and named_parameters are recursive 224 | # we will see the same tensors p many many times. but doing it this way 225 | # allows us to know which parent module any tensor p belongs to... 226 | if pn.endswith('bias'): 227 | # all biases will not be decayed 228 | no_decay.add(fpn) 229 | elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules): 230 | # weights of whitelist modules will be weight decayed 231 | decay.add(fpn) 232 | elif pn.endswith('in_proj_weight'): 233 | # MHA projection layer 234 | decay.add(fpn) 235 | elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules): 236 | # weights of blacklist modules will NOT be weight decayed 237 | no_decay.add(fpn) 238 | elif pn.endswith('pos_emb'): 239 | # positional embedding shouldn't be decayed 240 | no_decay.add(fpn) 241 | 242 | # validate that we considered every parameter 243 | param_dict = {pn: p for pn, p in model.named_parameters()} 244 | inter_params = decay & no_decay 245 | union_params = decay | no_decay 246 | assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) 247 | assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" 
\ 248 | % (str(param_dict.keys() - union_params), ) 249 | 250 | # create the pytorch optimizer object 251 | optim_groups = [ 252 | {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": opt_config.weight_decay}, 253 | {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}, 254 | ] 255 | optimizer = torch.optim.AdamW(optim_groups, lr=opt_config.learning_rate, betas=(0.9, 0.95)) 256 | return optimizer -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network, 3 | so nothing in this file really has anything to do with GPT specifically. 4 | """ 5 | 6 | from dataclasses import dataclass, asdict 7 | from collections import OrderedDict 8 | from typing import Optional, Any, Dict 9 | import os 10 | import torch 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.nn.parallel import DistributedDataParallel as DDP 13 | from torch.utils.data.distributed import DistributedSampler 14 | import fsspec 15 | import torch.distributed as dist 16 | 17 | @dataclass 18 | class TrainerConfig: 19 | max_epochs: int = None 20 | batch_size: int = None 21 | data_loader_workers: int = None 22 | grad_norm_clip: float = None 23 | snapshot_path: Optional[str] = None 24 | save_every: int = None 25 | use_amp: bool = None 26 | 27 | @dataclass 28 | class Snapshot: 29 | model_state: 'OrderedDict[str, torch.Tensor]' 30 | optimizer_state: Dict[str, Any] 31 | finished_epoch: int 32 | 33 | class Trainer: 34 | def __init__(self, trainer_config: TrainerConfig, model, optimizer, train_dataset, test_dataset=None): 35 | self.config = trainer_config 36 | # set torchrun variables 37 | self.local_rank = int(os.environ["LOCAL_RANK"]) # 在所有node的所有进程中当前GPU进程的rank 38 | self.global_rank = int(os.environ["RANK"]) # 在当前node中当前GPU进程的rank 39 | 40 | # data stuff 41 | self.train_dataset = train_dataset 42 | self.train_loader = self._prepare_dataloader(train_dataset) 43 | self.test_loader = self._prepare_dataloader(test_dataset) if test_dataset else None 44 | 45 | # initialize train states 46 | self.epochs_run = 0 47 | self.model = model.to(self.local_rank) 48 | self.optimizer = optimizer 49 | self.save_every = self.config.save_every 50 | 51 | # load snapshot if available. only necessary on the first node. 52 | if self.config.snapshot_path is None: 53 | self.config.snapshot_path = "snapshot.pt" 54 | self._load_snapshot() 55 | 56 | # wrap with DDP. this step will synch model across all the processes. 
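        # At construction, DDP broadcasts the parameters and buffers of the rank-0
        # process to every other rank, so all replicas start from identical weights,
        # and it registers autograd hooks that all-reduce gradients during backward().
        # The underlying module remains reachable as self.model.module, which is what
        # _save_snapshot uses to strip the wrapper before saving.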
57 | self.model = DDP(self.model, device_ids=[self.local_rank]) 58 | 59 | # torch.cuda.amp.GradScaler 是一个用于自动混合精度训练的 PyTorch 工具,它可以帮助加速模型训练并减少显存使用量 60 | # 具体来说,GradScaler 可以将梯度缩放到较小的范围,以避免数值下溢或溢出的问题,同时保持足够的精度以避免模型的性能下降 61 | if self.config.use_amp: 62 | self.scaler = torch.cuda.amp.GradScaler() 63 | 64 | def _prepare_dataloader(self, dataset: Dataset): 65 | return DataLoader( 66 | dataset, 67 | batch_size=self.config.batch_size, 68 | pin_memory=True, 69 | shuffle=False, 70 | num_workers=self.config.data_loader_workers, 71 | sampler=DistributedSampler(dataset) # 这个 sampler 自动将数据分块后送个各个 GPU,它能避免数据重叠 72 | ) 73 | 74 | def _load_snapshot(self): 75 | try: 76 | snapshot = fsspec.open(self.config.snapshot_path) # fsspec 为各种后端存储系统提供统一的 Python 接口,可以用相同的语法打开本地、AWS S3 和 GCS 等各种云存储平台的文件 77 | with snapshot as f: 78 | snapshot_data = torch.load(f, map_location="cpu") 79 | except FileNotFoundError: 80 | print("Snapshot not found. Training model from scratch") 81 | return 82 | 83 | snapshot = Snapshot(**snapshot_data) 84 | self.model.load_state_dict(snapshot.model_state) 85 | self.optimizer.load_state_dict(snapshot.optimizer_state) 86 | self.epochs_run = snapshot.finished_epoch 87 | print(f"Resuming training from snapshot at Epoch {self.epochs_run}") 88 | 89 | def _save_snapshot(self, epoch): 90 | # capture snapshot 91 | model = self.model 92 | raw_model = model.module if hasattr(model, "module") else model 93 | snapshot = Snapshot( 94 | model_state=raw_model.state_dict(), 95 | optimizer_state=self.optimizer.state_dict(), 96 | finished_epoch=epoch 97 | ) 98 | # save snapshot 99 | snapshot = asdict(snapshot) 100 | torch.save(snapshot, self.config.snapshot_path) 101 | print(f"Snapshot saved at epoch {epoch}") 102 | 103 | def _run_batch(self, source, targets, train: bool = True) -> float: 104 | with torch.set_grad_enabled(train), torch.cuda.amp.autocast(dtype=torch.float16, enabled=(self.config.use_amp)): 105 | _, loss = self.model(source, targets) 106 | 107 | if train: 108 | self.optimizer.zero_grad(set_to_none=True) 109 | if self.config.use_amp: 110 | self.scaler.scale(loss).backward() 111 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm_clip) 112 | self.scaler.step(self.optimizer) 113 | self.scaler.update() 114 | else: 115 | loss.backward() 116 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm_clip) 117 | self.optimizer.step() 118 | 119 | #return loss.item() 120 | return loss 121 | 122 | def _run_epoch(self, epoch: int, dataloader: DataLoader, train: bool = True): 123 | dataloader.sampler.set_epoch(epoch) 124 | for iter, (source, targets) in enumerate(dataloader): 125 | step_type = "Train" if train else "Eval" 126 | source = source.to(self.local_rank) 127 | targets = targets.to(self.local_rank) 128 | batch_loss = self._run_batch(source, targets, train) 129 | if iter % 100 == 0: 130 | #print(f"[GPU{self.global_rank}] Epoch {epoch} | Iter {iter} | {step_type} Loss {batch_loss.item():.5f}") 131 | if train: 132 | print(f"[GPU{self.global_rank}] Epoch {epoch} | Iter {iter} | {step_type} Loss {batch_loss.item():.5f}") 133 | else: 134 | eval_loss_list = [torch.zeros_like(batch_loss) for _ in range(int(os.environ['WORLD_SIZE']))] 135 | dist.gather( 136 | batch_loss, 137 | eval_loss_list if self.local_rank == 0 else None, 138 | dst=0 139 | ) 140 | if self.local_rank == 0: 141 | for i, loss in enumerate(eval_loss_list): 142 | print(f"[GPU{i}] Epoch {epoch} | Iter {iter} | {step_type} Loss {loss.item():.5f}") 143 | 144 | def train(self): 145 | for epoch in 
range(self.epochs_run, self.config.max_epochs): 146 | epoch += 1 147 | 148 | # train for one epoch 149 | self._run_epoch(epoch, self.train_loader, train=True) 150 | 151 | # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 snapshot 以免重复保存 152 | if self.local_rank == 0 and epoch % self.save_every == 0: 153 | self._save_snapshot(epoch) 154 | 155 | # eval run 156 | if self.test_loader: 157 | self._run_epoch(epoch, self.test_loader, train=False) 158 | -------------------------------------------------------------------------------- /minGPT-ddp/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.11.0 2 | fsspec 3 | boto3 4 | hydra-core 5 | requests 6 | aiohttp -------------------------------------------------------------------------------- /multi_gpu.py: -------------------------------------------------------------------------------- 1 | # 使用 DistributedDataParallel 进行单机多卡训练 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.utils.data import Dataset, DataLoader 5 | import os 6 | 7 | # 对 python 多进程的一个 pytorch 包装 8 | import torch.multiprocessing as mp 9 | 10 | # 这个 sampler 可以把采样的数据分散到各个 CPU 上 11 | from torch.utils.data.distributed import DistributedSampler 12 | 13 | # 实现分布式数据并行的核心类 14 | from torch.nn.parallel import DistributedDataParallel as DDP 15 | 16 | # DDP 在每个 GPU 上运行一个进程,其中都有一套完全相同的 Trainer 副本(包括model和optimizer) 17 | # 各个进程之间通过一个进程池进行通信,这两个方法来初始化和销毁进程池 18 | from torch.distributed import init_process_group, destroy_process_group 19 | 20 | 21 | def ddp_setup(rank, world_size): 22 | """ 23 | setup the distribution process group 24 | 25 | Args: 26 | rank: Unique identifier of each process 27 | world_size: Total number of processes 28 | """ 29 | # MASTER Node(运行 rank0 进程,多机多卡时的主机)用来协调各个 Node 的所有进程之间的通信 30 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 localhost 31 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 32 | init_process_group( 33 | backend="nccl", # Nvidia CUDA CPU 用这个 "nccl" 34 | rank=rank, 35 | world_size=world_size 36 | ) 37 | torch.cuda.set_device(rank) 38 | 39 | class Trainer: 40 | def __init__( 41 | self, 42 | model: torch.nn.Module, 43 | train_data: DataLoader, 44 | optimizer: torch.optim.Optimizer, 45 | gpu_id: int, 46 | save_every: int, 47 | ) -> None: 48 | self.gpu_id = gpu_id 49 | self.model = model.to(gpu_id) 50 | self.train_data = train_data 51 | self.optimizer = optimizer 52 | self.save_every = save_every # 指定保存 ckpt 的周期 53 | self.model = DDP(model, device_ids=[gpu_id]) # model 要用 DDP 包装一下 54 | 55 | def _run_batch(self, source, targets): 56 | self.optimizer.zero_grad() 57 | output = self.model(source) 58 | loss = F.cross_entropy(output, targets) 59 | loss.backward() 60 | self.optimizer.step() 61 | 62 | def _run_epoch(self, epoch): 63 | b_sz = len(next(iter(self.train_data))[0]) 64 | print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}") 65 | self.train_data.sampler.set_epoch(epoch) # 在各个 epoch 入口调用 DistributedSampler 的 set_epoch 方法是很重要的,这样才能打乱每个 epoch 的样本顺序 66 | for source, targets in self.train_data: 67 | source = source.to(self.gpu_id) 68 | targets = targets.to(self.gpu_id) 69 | self._run_batch(source, targets) 70 | 71 | def _save_checkpoint(self, epoch): 72 | ckp = self.model.module.state_dict() # 由于多了一层 DDP 包装,通过 .module 获取原始参数 73 | PATH = "checkpoint.pt" 74 | torch.save(ckp, PATH) 75 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 76 | 77 | def train(self, max_epochs: int): 78 | for epoch in range(max_epochs): 79 | self._run_epoch(epoch) 80 
| # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 ckpt 以免重复保存 81 | if self.gpu_id == 0 and epoch % self.save_every == 0: 82 | self._save_checkpoint(epoch) 83 | 84 | class MyTrainDataset(Dataset): 85 | def __init__(self, size): 86 | self.size = size 87 | self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] 88 | 89 | def __len__(self): 90 | return self.size 91 | 92 | def __getitem__(self, index): 93 | return self.data[index] 94 | 95 | def load_train_objs(): 96 | train_set = MyTrainDataset(2048) # load your dataset 97 | model = torch.nn.Linear(20, 1) # load your model 98 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 99 | return train_set, model, optimizer 100 | 101 | 102 | def prepare_dataloader(dataset: Dataset, batch_size: int): 103 | return DataLoader( 104 | dataset, 105 | batch_size=batch_size, 106 | pin_memory=True, 107 | shuffle=False, # 设置了新的 sampler,参数 shuffle 要设置为 False 108 | sampler=DistributedSampler(dataset) # 这个 sampler 自动将数据分块后送个各个 GPU,它能避免数据重叠 109 | ) 110 | 111 | 112 | def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int): 113 | # 初始化进程池 114 | ddp_setup(rank, world_size) 115 | 116 | # 进行训练 117 | dataset, model, optimizer = load_train_objs() 118 | train_data = prepare_dataloader(dataset, batch_size) 119 | trainer = Trainer(model, train_data, optimizer, rank, save_every) 120 | trainer.train(total_epochs) 121 | 122 | # 销毁进程池 123 | destroy_process_group() 124 | 125 | 126 | if __name__ == "__main__": 127 | import argparse 128 | parser = argparse.ArgumentParser(description='simple distributed training job') 129 | parser.add_argument('--total-epochs', type=int, default=50, help='Total epochs to train the model') 130 | parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot') 131 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 132 | args = parser.parse_args() 133 | 134 | world_size = torch.cuda.device_count() 135 | 136 | # 利用 mp.spawn,在整个 distribution group 的 nprocs 个 GPU 上生成进程来执行 fn 方法,并能设置要传入 fn 的参数 args 137 | # 注意不需要 fn 的 rank 参数,它由 mp.spawn 自动分配 138 | mp.spawn( 139 | fn=main, 140 | args=(world_size, args.save_every, args.total_epochs, args.batch_size), 141 | nprocs=world_size 142 | ) -------------------------------------------------------------------------------- /multi_gpu_torchrun.py: -------------------------------------------------------------------------------- 1 | # 使用 DistributedDataParallel 进行单机多卡训练的基础上,使用 torchrun 进行容错处理,增强程序稳定性 2 | # torchrun 允许我们在训练过程中按一定保存 snapshots,其中应当包含当前 epoch、模型参数(ckpt)、优化器参数、lr调度器参数等恢复训练所需的全部参数 3 | # 一旦程序出错退出,torchrun 会自动从最近 snapshots 重启所有进程 4 | # 除了增强稳定性外,torchrun 还会自动完成所有环境变量设置和进程分配工作,所以不再需要手动设置 rank 或用 mp.spawn 生成并分配进程 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.utils.data import Dataset, DataLoader 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | # 对 python 多进程的一个 pytorch 包装 14 | import torch.multiprocessing as mp 15 | 16 | # 这个 sampler 可以把采样的数据分散到各个 CPU 上 17 | from torch.utils.data.distributed import DistributedSampler 18 | 19 | # 实现分布式数据并行的核心类 20 | from torch.nn.parallel import DistributedDataParallel as DDP 21 | 22 | # DDP 在每个 GPU 上运行一个进程,其中都有一套完全相同的 Trainer 副本(包括model和optimizer) 23 | # 各个进程之间通过一个进程池进行通信,这两个方法来初始化和销毁进程池 24 | from torch.distributed import init_process_group, destroy_process_group 25 | 26 | 27 | def ddp_setup(): 28 | # torchrun 会处理环境变量以及 rank & world_size 设置 29 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 
localhost 30 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 31 | init_process_group(backend="nccl") 32 | torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) 33 | 34 | class Trainer: 35 | def __init__( 36 | self, 37 | model: torch.nn.Module, 38 | train_data: DataLoader, 39 | optimizer: torch.optim.Optimizer, 40 | save_every: int, 41 | snapshot_path: str, # 保存 snapshots 的位置 42 | ) -> None: 43 | self.gpu_id = int(os.environ['LOCAL_RANK']) # torchrun 会自动设置这个环境变量指出当前进程的 rank 44 | self.world_size = int(os.environ['WORLD_SIZE']) 45 | self.model = model.to(self.gpu_id) 46 | self.train_data = train_data 47 | self.optimizer = optimizer 48 | self.save_every = save_every # 指定保存 snapshots 的周期 49 | self.epochs_run = 0 # 存储将要保存在 snapshots 中的 epoch num 信息 50 | self.snapshot_path = snapshot_path 51 | 52 | # 若存在 snapshots 则加载,这样重复运行指令就能自动继续训练了 53 | if os.path.exists(snapshot_path): 54 | print('loading snapshot') 55 | self._load_snapshot(snapshot_path) 56 | 57 | self.model = DDP(self.model, device_ids=[self.gpu_id]) # model 要用 DDP 包装一下 58 | 59 | def _load_snapshot(self, snapshot_path): 60 | ''' 加载 snapshot 并重启训练 ''' 61 | loc = f"cuda:{self.gpu_id}" 62 | snapshot = torch.load(snapshot_path, map_location=loc) 63 | self.model.load_state_dict(snapshot["MODEL_STATE"]) 64 | self.epochs_run = snapshot["EPOCHS_RUN"] 65 | print(f"Resuming training from snapshot at Epoch {self.epochs_run}") 66 | 67 | def _run_batch(self, source, targets): 68 | self.optimizer.zero_grad() 69 | output = self.model(source) 70 | loss = torch.mean(F.mse_loss(output, targets)) 71 | loss.backward() 72 | self.optimizer.step() 73 | return loss.item() 74 | 75 | def _run_epoch(self, epoch): 76 | epoch_losses = [] 77 | self.train_data.sampler.set_epoch(epoch) # 设置 epoch 保证多 GPU 上数据不重叠 78 | for source, targets in self.train_data: 79 | source = source.to(self.gpu_id) 80 | targets = targets.to(self.gpu_id) 81 | loss = self._run_batch(source, targets) 82 | epoch_losses.append(loss) 83 | return np.mean(epoch_losses) 84 | 85 | def _save_snapshot(self, epoch): 86 | # 在 snapshot 中保存恢复训练所必须的参数 87 | snapshot = { 88 | "MODEL_STATE": self.model.module.state_dict(), # 由于多了一层 DDP 包装,通过 .module 获取原始参数 89 | "EPOCHS_RUN": epoch, 90 | } 91 | torch.save(snapshot, self.snapshot_path) 92 | #print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}") 93 | 94 | def train(self, max_epochs: int): 95 | # 现在从 self.epochs_run 开始训练,统一重启的情况 96 | with tqdm(total=max_epochs, desc=f"[GPU{self.gpu_id}] Training", position=self.gpu_id, initial=self.epochs_run) as pbar: 97 | for epoch in range(self.epochs_run + 1, max_epochs + 1): 98 | epoch_loss = self._run_epoch(epoch) 99 | 100 | # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 snapshot 以免重复保存 101 | if self.gpu_id == 0 and epoch % self.save_every == 0: 102 | self._save_snapshot(epoch) 103 | 104 | pbar.set_postfix({'epoch': epoch, 'loss':'{:.2f}'.format(epoch_loss)}) 105 | pbar.update() 106 | 107 | class MyTrainDataset(Dataset): 108 | def __init__(self, size): 109 | self.size = size 110 | 111 | # Simple Linear Regression problem 112 | input_dim = 2 113 | output_dim = 1 114 | true_w = torch.Tensor([-2, 3.4]).view(input_dim, output_dim) 115 | true_b = 4.2 116 | 117 | features = torch.randn(size=(size, input_dim), dtype=torch.float32) 118 | labels = torch.mm(features,true_w) + true_b 119 | labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32) 120 | 121 | self.data = [(features[i], labels[i]) for i in range(size)] 122 | 123 | def __len__(self): 124 | return self.size 125 | 126 | def 
107 | class MyTrainDataset(Dataset):
108 |     def __init__(self, size):
109 |         self.size = size
110 | 
111 |         # Simple Linear Regression problem
112 |         input_dim = 2
113 |         output_dim = 1
114 |         true_w = torch.Tensor([-2, 3.4]).view(input_dim, output_dim)
115 |         true_b = 4.2
116 | 
117 |         features = torch.randn(size=(size, input_dim), dtype=torch.float32)
118 |         labels = torch.mm(features, true_w) + true_b
119 |         labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32)
120 | 
121 |         self.data = [(features[i], labels[i]) for i in range(size)]
122 | 
123 |     def __len__(self):
124 |         return self.size
125 | 
126 |     def __getitem__(self, index):
127 |         return self.data[index]
128 | 
129 | def load_train_objs():
130 |     train_set = MyTrainDataset(2048)  # load your dataset
131 |     model = torch.nn.Linear(2, 1)  # load your model
132 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
133 |     return train_set, model, optimizer
134 | 
135 | def prepare_dataloader(dataset: Dataset, batch_size: int):
136 |     return DataLoader(
137 |         dataset,
138 |         batch_size=batch_size,
139 |         pin_memory=True,
140 |         shuffle=False,  # must be False because a custom sampler is supplied
141 |         sampler=DistributedSampler(dataset)  # shards the data across the GPUs with no overlap
142 |     )
143 | 
144 | def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str = "snapshot.pt"):
145 |     # initialize the process group
146 |     ddp_setup()
147 | 
148 |     # run training
149 |     dataset, model, optimizer = load_train_objs()
150 |     train_data = prepare_dataloader(dataset, batch_size)
151 |     trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
152 |     trainer.train(total_epochs)
153 | 
154 |     # destroy the process group
155 |     destroy_process_group()
156 | 
157 | if __name__ == "__main__":
158 |     import argparse
159 |     parser = argparse.ArgumentParser(description='simple distributed training job')
160 |     parser.add_argument('--total-epochs', type=int, default=100, help='Total epochs to train the model')
161 |     parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot')
162 |     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
163 |     args = parser.parse_args()
164 | 
165 |     # torchrun now launches one process per GPU and runs main in each of them, so mp.spawn is no longer needed
166 |     main(args.save_every, args.total_epochs, args.batch_size)
167 | 
168 | '''
169 | Run with:
170 |     torchrun --standalone --nproc_per_node=gpu multi_gpu_torchrun.py
171 | 
172 | Arguments:
173 |     --standalone            single-node run
174 |     --nproc_per_node=gpu    use all visible GPUs; a number n can be given instead, which uses the first n GPUs
175 | 
176 | Environment variables available inside the processes:
177 |     os.environ['RANK']        global rank of the current process across all nodes
178 |     os.environ['LOCAL_RANK']  rank of the current process on this node
179 |     os.environ['WORLD_SIZE']  total number of processes (GPUs)
180 | 
181 | Restrict which GPUs the program can see (and thus run on specific GPUs) with CUDA_VISIBLE_DEVICES:
182 |     CUDA_VISIBLE_DEVICES=0,3 torchrun --standalone --nproc_per_node=gpu multi_gpu_torchrun.py
183 | 
184 | '''
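Each rank's progress bar above reports only the mean loss over that rank's own data shard. If one global number is wanted, the per-rank value can be averaged over the process group. A small sketch (the helper name global_mean is ours, not part of the script), usable anywhere after init_process_group:

import torch
import torch.distributed as dist

def global_mean(value: float, device) -> float:
    t = torch.tensor([value], dtype=torch.float32, device=device)  # NCCL requires the tensor to live on the GPU
    dist.all_reduce(t, op=dist.ReduceOp.SUM)                       # sum the per-rank values in place
    return (t / dist.get_world_size()).item()

# e.g. at the end of Trainer._run_epoch:  return global_mean(np.mean(epoch_losses), self.gpu_id)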
"checkpoint.pt" 39 | torch.save(ckp, PATH) 40 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 41 | 42 | def train(self, max_epochs: int): 43 | for epoch in range(max_epochs): 44 | self._run_epoch(epoch) 45 | if epoch % self.save_every == 0: 46 | self._save_checkpoint(epoch) 47 | 48 | class MyTrainDataset(Dataset): 49 | def __init__(self, size): 50 | self.size = size 51 | self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] 52 | 53 | def __len__(self): 54 | return self.size 55 | 56 | def __getitem__(self, index): 57 | return self.data[index] 58 | 59 | 60 | 61 | def load_train_objs(): 62 | train_set = MyTrainDataset(2048) # load your dataset 63 | model = torch.nn.Linear(20, 1) # load your model 64 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 65 | return train_set, model, optimizer 66 | 67 | def prepare_dataloader(dataset: Dataset, batch_size: int): 68 | return DataLoader( 69 | dataset, 70 | batch_size=batch_size, 71 | pin_memory=True, 72 | shuffle=True 73 | ) 74 | 75 | def main(device, total_epochs, save_every, batch_size): 76 | dataset, model, optimizer = load_train_objs() 77 | train_data = prepare_dataloader(dataset, batch_size) 78 | trainer = Trainer(model, train_data, optimizer, device, save_every) 79 | trainer.train(total_epochs) 80 | 81 | 82 | 83 | if __name__ == "__main__": 84 | import argparse 85 | parser = argparse.ArgumentParser(description='simple distributed training job') 86 | parser.add_argument('--total-epochs', type=int, default=50, help='Total epochs to train the model') 87 | parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot') 88 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 89 | args = parser.parse_args() 90 | 91 | device = 0 # shorthand for cuda:0 92 | main(device, args.total_epochs, args.save_every, args.batch_size) --------------------------------------------------------------------------------