├── .hydra ├── config.yaml ├── hydra.yaml └── overrides.yaml ├── .vscode └── launch.json ├── README.md ├── main.log ├── minGPT-ddp ├── README.md ├── mingpt │ ├── .hydra │ │ ├── config.yaml │ │ ├── hydra.yaml │ │ └── overrides.yaml │ ├── __pycache__ │ │ ├── char_dataset.cpython-38.pyc │ │ ├── model.cpython-38.pyc │ │ └── trainer.cpython-38.pyc │ ├── char_dataset.py │ ├── data │ │ └── input.txt │ ├── gpt2_train_cfg.yaml │ ├── main.log │ ├── main.py │ ├── model.py │ └── trainer.py └── requirements.txt ├── multi_gpu.py ├── multi_gpu_torchrun.py └── single_gpu.py /.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt/data/input.txt 3 | block_size: 128 4 | train_split: 0.9 5 | truncate: 0.02 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: true 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | -------------------------------------------------------------------------------- /.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | sweep: 5 | dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} 6 | subdir: ${hydra.job.num} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | params: null 13 | help: 14 | app_name: ${hydra.job.name} 15 | header: '${hydra.help.app_name} is powered by Hydra. 16 | 17 | ' 18 | footer: 'Powered by Hydra (https://hydra.cc) 19 | 20 | Use --hydra-help to view Hydra specific help 21 | 22 | ' 23 | template: '${hydra.help.header} 24 | 25 | == Configuration groups == 26 | 27 | Compose your configuration from those groups (group=option) 28 | 29 | 30 | $APP_CONFIG_GROUPS 31 | 32 | 33 | == Config == 34 | 35 | Override anything in the config (foo.bar=value) 36 | 37 | 38 | $CONFIG 39 | 40 | 41 | ${hydra.help.footer} 42 | 43 | ' 44 | hydra_help: 45 | template: 'Hydra (${hydra.runtime.version}) 46 | 47 | See https://hydra.cc for more info. 48 | 49 | 50 | == Flags == 51 | 52 | $FLAGS_HELP 53 | 54 | 55 | == Configuration groups == 56 | 57 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 58 | to command line) 59 | 60 | 61 | $HYDRA_CONFIG_GROUPS 62 | 63 | 64 | Use ''--cfg hydra'' to Show the Hydra config. 65 | 66 | ' 67 | hydra_help: ??? 
68 | hydra_logging: 69 | version: 1 70 | formatters: 71 | simple: 72 | format: '[%(asctime)s][HYDRA] %(message)s' 73 | handlers: 74 | console: 75 | class: logging.StreamHandler 76 | formatter: simple 77 | stream: ext://sys.stdout 78 | root: 79 | level: INFO 80 | handlers: 81 | - console 82 | loggers: 83 | logging_example: 84 | level: DEBUG 85 | disable_existing_loggers: false 86 | job_logging: 87 | version: 1 88 | formatters: 89 | simple: 90 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 91 | handlers: 92 | console: 93 | class: logging.StreamHandler 94 | formatter: simple 95 | stream: ext://sys.stdout 96 | file: 97 | class: logging.FileHandler 98 | formatter: simple 99 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 100 | root: 101 | level: INFO 102 | handlers: 103 | - console 104 | - file 105 | disable_existing_loggers: false 106 | env: {} 107 | mode: RUN 108 | searchpath: [] 109 | callbacks: {} 110 | output_subdir: .hydra 111 | overrides: 112 | hydra: 113 | - hydra.mode=RUN 114 | task: [] 115 | job: 116 | name: main 117 | chdir: null 118 | override_dirname: '' 119 | id: ??? 120 | num: ??? 121 | config_name: gpt2_train_cfg 122 | env_set: {} 123 | env_copy: [] 124 | config: 125 | override_dirname: 126 | kv_sep: '=' 127 | item_sep: ',' 128 | exclude_keys: [] 129 | runtime: 130 | version: 1.3.2 131 | version_base: '1.3' 132 | cwd: /home/tim/桌面/git/ddp-tutorial-series 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | output_dir: /home/tim/桌面/git/ddp-tutorial-series 144 | choices: 145 | hydra/env: default 146 | hydra/callbacks: null 147 | hydra/job_logging: default 148 | hydra/hydra_logging: default 149 | hydra/hydra_help: default 150 | hydra/help: default 151 | hydra/sweeper: basic 152 | hydra/launcher: basic 153 | hydra/output: default 154 | verbose: false 155 | -------------------------------------------------------------------------------- /.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: torchrun", 6 | "type": "python", 7 | "request": "launch", 8 | // 设置 program 的路径为 torchrun 脚本对应的绝对路径 9 | "program": "/usr/local/anaconda3/envs/torch-2.0.1-cu117-py39/lib/python3.9/site-packages/torch/distributed/run.py", 10 | // 设置 torchrun 命令的参数 11 | "args":[ 12 | "--standalone", 13 | "--nproc_per_node=gpu", 14 | "multi_gpu_torchrun.py" 15 | ], 16 | "console": "integratedTerminal", 17 | "justMyCode": true, 18 | "env": { 19 | "CUDA_VISIBLE_DEVICES": "5, 6" 20 | }, 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://api.gitsponsors.com/api/badge/link?p=KoJzzoI5V0U5cxCo3lEj9srdIMTw7IoiFPToGomZRFp9HNMVxC2tGRy4n5Chm6M03jA9RjbezjCIKyoQVFxp7yN3+IexpNGKeaLHWoqwrp/6C6BjFgQf7A9QnfnJcs9D) 2 | # ddp-tutorial-series 3 | Follow the [pytorch official tutorial](https://pytorch.org/tutorials/beginner/ddp_series_intro.html?utm_source=youtube&utm_medium=organic_social&utm_campaign=tutorial) to 
learn how to use `nn.parallel.DistributedDataParallel` to speed up training 4 | 5 | # distributed-pytorch 6 | 7 | Code for the DDP tutorial series at https://pytorch.org/tutorials/beginner/ddp_series_intro.html 8 | 9 | Each code file extends upon the previous one. The series starts with a non-distributed script that runs on a single GPU and incrementally updates to end with multinode training on a Slurm cluster. 10 | 11 | ## Files 12 | 13 | - [single_gpu.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/single_gpu.py): Non-distributed training script 14 | - [multigpu.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py): DDP on a single node 15 | - [multigpu_torchrun.py](https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py): DDP on a single node using Torchrun 16 | - minGPT-ddp: training a GPT-like model (from the minGPT repo [https://github.com/karpathy/minGPT](https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbGo2ZjQtMzFlQ2pJQmV6VV9yenFpdmlXVGItd3xBQ3Jtc0tueWdxVFZsYTNPRXFTSW5xejhUajZ1OVYydjNraENoZzNka05ZLWMtZXJkM1VjaFd5cENUMld0TEc5N3VkRFV2bzM2aWdvWVRjTU01TmFfZE9mdXVBTFczWDJZMnU2TjA4Z0tCd25LX2sxOFJLMWtsMA&q=https%3A%2F%2Fgithub.com%2Fkarpathy%2FminGPT&v=XFsFDGKZHh4)) with DDP. 17 | 18 | 19 | 20 | ## 我的笔记 21 | 22 | - [Pytorch 多卡并行(1)—— 原理简介和 DDP 并行实践](https://blog.csdn.net/wxc971231/article/details/132816104) 23 | 24 | - [Pytorch 多卡并行(2)—— 使用 torchrun 进行容错处理](https://blog.csdn.net/wxc971231/article/details/132827787) 25 | 26 | - [Pytorch 多卡并行(3)—— 使用 DDP 加速 minGPT 训练](https://blog.csdn.net/wxc971231/article/details/132829661) 27 | 28 | -------------------------------------------------------------------------------- /main.log: -------------------------------------------------------------------------------- 1 | [2023-09-12 18:52:52,922][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 2 | [2023-09-12 18:52:52,924][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 3 | [2023-09-12 18:53:56,746][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 4 | [2023-09-12 18:53:56,747][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 5 | [2023-09-12 18:55:25,877][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 6 | [2023-09-12 18:55:25,878][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 7 | [2023-09-12 18:56:10,506][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 8 | [2023-09-12 18:56:10,507][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 9 | [2023-09-12 18:56:14,516][root][INFO] - Reducer buckets have been rebuilt in this iteration. 10 | [2023-09-12 18:59:06,955][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 11 | [2023-09-12 18:59:06,956][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 12 | [2023-09-12 18:59:11,087][root][INFO] - Reducer buckets have been rebuilt in this iteration. 
13 | -------------------------------------------------------------------------------- /minGPT-ddp/README.md: -------------------------------------------------------------------------------- 1 | # minGPT-DDP 2 | 3 | Code accompanying the tutorial at https://pytorch.org/tutorials/intermediate/ddp_series_minGPT.html for training a GPT-like model with Distributed Data Parallel (DDP) in PyTorch. 4 | 5 | Files marked with an asterisk (*) are adapted from the minGPT repo (https://github.com/karpathy/minGPT). 6 | 7 | - [trainer.py](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/trainer.py) includes the Trainer class that runs the distributed training iterations on the model with the provided dataset. 8 | - [model.py *](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/model.py) defines the model architecture. 9 | - [char_dataset.py *](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/char_dataset.py) contains the `Dataset`class for a character-level dataset. 10 | - [gpt2_train_cfg.yaml](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/gpt2_train_cfg.yaml) contains the configurations for data, model, optimizer and training run. 11 | - [main.py](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/main.py) is the entry point to the trainig job. It sets up the DDP process group, reads all the configurations and runs the training job. 12 | - [slurm/](https://github.com/pytorch/examples/blob/main/distributed/minGPT-ddp/mingpt/slurm) contains files for setting up an AWS cluster and the slurm script to run multinode training. -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: ./data/input.txt 3 | block_size: 128 4 | train_split: 0.9 5 | truncate: 0.02 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: true 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | sweep: 5 | dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} 6 | subdir: ${hydra.job.num} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | params: null 13 | help: 14 | app_name: ${hydra.job.name} 15 | header: '${hydra.help.app_name} is powered by Hydra. 16 | 17 | ' 18 | footer: 'Powered by Hydra (https://hydra.cc) 19 | 20 | Use --hydra-help to view Hydra specific help 21 | 22 | ' 23 | template: '${hydra.help.header} 24 | 25 | == Configuration groups == 26 | 27 | Compose your configuration from those groups (group=option) 28 | 29 | 30 | $APP_CONFIG_GROUPS 31 | 32 | 33 | == Config == 34 | 35 | Override anything in the config (foo.bar=value) 36 | 37 | 38 | $CONFIG 39 | 40 | 41 | ${hydra.help.footer} 42 | 43 | ' 44 | hydra_help: 45 | template: 'Hydra (${hydra.runtime.version}) 46 | 47 | See https://hydra.cc for more info. 
48 | 49 | 50 | == Flags == 51 | 52 | $FLAGS_HELP 53 | 54 | 55 | == Configuration groups == 56 | 57 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 58 | to command line) 59 | 60 | 61 | $HYDRA_CONFIG_GROUPS 62 | 63 | 64 | Use ''--cfg hydra'' to Show the Hydra config. 65 | 66 | ' 67 | hydra_help: ??? 68 | hydra_logging: 69 | version: 1 70 | formatters: 71 | simple: 72 | format: '[%(asctime)s][HYDRA] %(message)s' 73 | handlers: 74 | console: 75 | class: logging.StreamHandler 76 | formatter: simple 77 | stream: ext://sys.stdout 78 | root: 79 | level: INFO 80 | handlers: 81 | - console 82 | loggers: 83 | logging_example: 84 | level: DEBUG 85 | disable_existing_loggers: false 86 | job_logging: 87 | version: 1 88 | formatters: 89 | simple: 90 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 91 | handlers: 92 | console: 93 | class: logging.StreamHandler 94 | formatter: simple 95 | stream: ext://sys.stdout 96 | file: 97 | class: logging.FileHandler 98 | formatter: simple 99 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 100 | root: 101 | level: INFO 102 | handlers: 103 | - console 104 | - file 105 | disable_existing_loggers: false 106 | env: {} 107 | mode: RUN 108 | searchpath: [] 109 | callbacks: {} 110 | output_subdir: .hydra 111 | overrides: 112 | hydra: 113 | - hydra.mode=RUN 114 | task: [] 115 | job: 116 | name: main 117 | chdir: null 118 | override_dirname: '' 119 | id: ??? 120 | num: ??? 121 | config_name: gpt2_train_cfg 122 | env_set: {} 123 | env_copy: [] 124 | config: 125 | override_dirname: 126 | kv_sep: '=' 127 | item_sep: ',' 128 | exclude_keys: [] 129 | runtime: 130 | version: 1.3.2 131 | version_base: '1.3' 132 | cwd: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | output_dir: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt 144 | choices: 145 | hydra/env: default 146 | hydra/callbacks: null 147 | hydra/job_logging: default 148 | hydra/hydra_logging: default 149 | hydra/hydra_help: default 150 | hydra/help: default 151 | hydra/sweeper: basic 152 | hydra/launcher: basic 153 | hydra/output: default 154 | verbose: false 155 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/char_dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/char_dataset.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/__pycache__/trainer.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wxc971231/ddp-tutorial-series/ef810fbeae202cba704fce220ca07a445575f28b/minGPT-ddp/mingpt/__pycache__/trainer.cpython-38.pyc -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/char_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import fsspec 4 | from dataclasses import dataclass 5 | 6 | """ 7 | Adapted from https://github.com/karpathy/minGPT/blob/master/projects/chargpt/chargpt.py 8 | """ 9 | 10 | @dataclass 11 | class DataConfig: 12 | path: str = None 13 | block_size: int = None # 输入序列长度 14 | train_split: float = None # 训练集和测试集划分 15 | truncate: float = 1.0 # 用于训练的数据占全体数据的比例 16 | 17 | class CharDataset(Dataset): 18 | 19 | def __init__(self, data_cfg: DataConfig): #data_path: str, block_size): 20 | # 加载所需比例的数据 21 | data = fsspec.open(data_cfg.path).open().read().decode('utf-8') 22 | data = data[ : int(len(data) * data_cfg.truncate)] 23 | 24 | # Set 去重,转 list 后排序得到数据集中的唯一字符列表作为词表 25 | chars = sorted(list(set(data))) 26 | data_size, vocab_size = len(data), len(chars) 27 | print('Data has %d characters, %d unique.' % (data_size, vocab_size)) 28 | 29 | # 得到字符和词表索引之间的双射 30 | self.stoi = {ch: i for i, ch in enumerate(chars)} # 字符 -> 词表索引 31 | self.itos = {i: ch for i, ch in enumerate(chars)} # 词表索引 -> 字符 32 | 33 | self.block_size = data_cfg.block_size 34 | self.vocab_size = vocab_size 35 | self.data = data 36 | 37 | def __len__(self): 38 | return len(self.data) - self.block_size 39 | 40 | def __getitem__(self, idx): 41 | # grab a chunk of (block_size + 1) characters from the data 42 | chunk = self.data[idx:idx + self.block_size + 1] 43 | 44 | # encode every character to an integer 45 | dix = [self.stoi[s] for s in chunk] 46 | x = torch.tensor(dix[:-1], dtype=torch.long) 47 | y = torch.tensor(dix[1:], dtype=torch.long) 48 | return x, y 49 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/gpt2_train_cfg.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: /home/tim/桌面/git/ddp-tutorial-series/minGPT-ddp/mingpt/data/input.txt 3 | block_size: 128 # 输入序列长度 4 | train_split: 0.9 # 训练集和测试集划分 5 | truncate: 0.02 # 只用5%的数据进行训练 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: True 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | 22 | hydra: 23 | run: 24 | dir: ./ 25 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/main.log: -------------------------------------------------------------------------------- 1 | [2023-09-11 17:00:40,135][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 2 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 3 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 
4 | [2023-09-11 17:00:40,136][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 5 | [2023-09-11 17:00:43,432][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 6 | [2023-09-11 17:00:43,433][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 7 | [2023-09-11 18:01:07,588][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 8 | [2023-09-11 18:01:07,596][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 9 | [2023-09-11 18:01:07,596][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 10 | [2023-09-11 18:01:07,599][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 11 | [2023-09-11 18:01:11,054][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 12 | [2023-09-11 18:01:11,054][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 13 | [2023-09-11 18:02:31,321][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 14 | [2023-09-11 18:02:31,321][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 15 | [2023-09-11 18:02:31,322][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 16 | [2023-09-11 18:02:31,331][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 17 | [2023-09-11 18:02:34,753][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 18 | [2023-09-11 18:02:34,753][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 19 | [2023-09-11 18:06:18,569][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 20 | [2023-09-11 18:06:18,578][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 21 | [2023-09-11 18:06:18,578][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 22 | [2023-09-11 18:06:18,580][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 23 | [2023-09-11 18:06:21,444][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 24 | [2023-09-11 18:06:21,446][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 25 | [2023-09-11 18:08:44,508][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 26 | [2023-09-11 18:08:44,517][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 27 | [2023-09-11 18:08:44,517][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 28 | [2023-09-11 18:08:44,518][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 
29 | [2023-09-11 18:08:47,320][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 30 | [2023-09-11 18:08:47,320][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 31 | [2023-09-11 18:09:30,301][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 32 | [2023-09-11 18:09:30,306][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 33 | [2023-09-11 18:09:30,307][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 34 | [2023-09-11 18:09:30,311][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 35 | [2023-09-11 18:09:33,182][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 36 | [2023-09-11 18:09:33,182][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 37 | [2023-09-11 18:31:37,555][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 38 | [2023-09-11 18:31:37,555][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 39 | [2023-09-11 18:31:37,556][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 40 | [2023-09-11 18:31:37,556][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 41 | [2023-09-11 18:31:40,494][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 42 | [2023-09-11 18:31:40,495][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 43 | [2023-09-11 18:32:07,745][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 44 | [2023-09-11 18:32:07,755][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 45 | [2023-09-11 18:32:07,755][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 46 | [2023-09-11 18:32:07,756][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 47 | [2023-09-11 18:32:10,667][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 48 | [2023-09-11 18:32:10,667][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 49 | [2023-09-11 18:42:41,048][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 50 | [2023-09-11 18:42:41,048][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 51 | [2023-09-11 18:42:41,049][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 52 | [2023-09-11 18:42:41,058][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 53 | [2023-09-11 18:42:44,410][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 
54 | [2023-09-11 18:42:44,411][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 55 | [2023-09-11 18:44:57,544][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 56 | [2023-09-11 18:44:57,548][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 57 | [2023-09-11 18:44:57,548][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 58 | [2023-09-11 18:44:57,555][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 59 | [2023-09-11 18:45:01,088][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 60 | [2023-09-11 18:45:01,088][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 61 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 62 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 63 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1 64 | [2023-09-11 18:49:33,785][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. 65 | [2023-09-11 18:49:36,698][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 66 | [2023-09-11 18:49:36,698][torch.nn.parallel.distributed][INFO] - Reducer buckets have been rebuilt in this iteration. 67 | [2023-09-12 18:29:46,692][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 68 | [2023-09-12 18:29:46,692][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 69 | [2023-09-12 18:32:47,875][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 70 | [2023-09-12 18:32:47,875][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 71 | [2023-09-12 18:34:07,827][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0 72 | [2023-09-12 18:34:07,827][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes. 73 | [2023-09-12 18:34:11,181][root][INFO] - Reducer buckets have been rebuilt in this iteration. 
74 | -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import random_split 4 | from torch.distributed import init_process_group, destroy_process_group 5 | from model import GPT, GPTConfig, OptimizerConfig, create_optimizer 6 | from trainer import Trainer, TrainerConfig 7 | from char_dataset import CharDataset, DataConfig 8 | from omegaconf import DictConfig 9 | import hydra 10 | 11 | 12 | def ddp_setup(): 13 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 localhost 14 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 15 | init_process_group(backend="nccl") 16 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 17 | 18 | def get_train_objs(gpt_cfg: GPTConfig, opt_cfg: OptimizerConfig, data_cfg: DataConfig): 19 | dataset = CharDataset(data_cfg) 20 | train_len = int(len(dataset) * data_cfg.train_split) 21 | train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len]) 22 | 23 | gpt_cfg.vocab_size = dataset.vocab_size 24 | gpt_cfg.block_size = dataset.block_size 25 | model = GPT(gpt_cfg) 26 | optimizer = create_optimizer(model, opt_cfg) 27 | 28 | return model, optimizer, train_set, test_set 29 | 30 | @hydra.main(version_base=None, config_path=".", config_name="gpt2_train_cfg") 31 | def main(cfg: DictConfig): 32 | # 初始化进程池 33 | ddp_setup() 34 | 35 | # 从 yaml 文件读取超参数 36 | gpt_cfg = GPTConfig(**cfg['gpt_config']) 37 | opt_cfg = OptimizerConfig(**cfg['optimizer_config']) 38 | data_cfg = DataConfig(**cfg['data_config']) 39 | trainer_cfg = TrainerConfig(**cfg['trainer_config']) 40 | 41 | # 创建训练对象 42 | model, optimizer, train_data, test_data = get_train_objs(gpt_cfg, opt_cfg, data_cfg) 43 | trainer = Trainer(trainer_cfg, model, optimizer, train_data, test_data) 44 | 45 | # 开始训练 46 | trainer.train() 47 | 48 | # 训练完成后,销毁进程池 49 | destroy_process_group() 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | 55 | ''' 56 | 运行命令: 57 | CUDA_VISIBLE_DEVICES=1,2 torchrun --standalone --nproc_per_node=gpu main.py 58 | ''' -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full definition of a GPT Language Model, all of it in this single file. 3 | Adapted from https://github.com/karpathy/minGPT/blob/master/mingpt/model.py 4 | """ 5 | 6 | from dataclasses import dataclass 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | @dataclass 13 | class GPTConfig: 14 | model_type: str = 'gpt2' 15 | # model configurations 16 | n_layer: int = None 17 | n_head: int = None 18 | n_embd: int = None 19 | # openai's values for gpt2 20 | vocab_size: int = 50257 21 | block_size: int = 1024 22 | # dropout hyperparameters 23 | embd_pdrop: float = 0.1 24 | resid_pdrop: float = 0.1 25 | attn_pdrop: float = 0.1 26 | 27 | @dataclass 28 | class OptimizerConfig: 29 | learning_rate: float = 3e-4 30 | weight_decay: float = 0.1 31 | 32 | class MultiheadAttentionLayer(nn.Module): 33 | """ 34 | A multi-head masked self-attention layer with a projection at the end. 
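    The causal mask below is registered with register_buffer, so it moves with the
    module across devices and is saved in the state dict without being trained,
    while attention itself is delegated to torch.nn.MultiheadAttention with
    batch_first=True, i.e. inputs of shape (batch, seq_len, n_embd). Keep in mind
    that MultiheadAttention treats a float attn_mask additively (it is added to the
    attention scores) and a bool mask as marking disallowed positions, so the dtype
    of the registered mask determines how the masking behaves.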
35 | """ 36 | 37 | def __init__(self, config, device="cpu", dtype=torch.float32): 38 | super().__init__() 39 | assert config.n_embd % config.n_head == 0 40 | self.resid_drop = nn.Dropout(config.resid_pdrop) 41 | 42 | # output projection 43 | self.c_proj = nn.Linear(config.n_embd, config.n_embd, device=device, dtype=dtype) 44 | 45 | # Causal mask。注意这个mask是通过 self.register_buffer 方法登记的 46 | # 这样登记过的张量可以求梯度也可以随模型在 CPU/GPU 之间移动,但是不进行参数优化 47 | self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)) 48 | .view(1, 1, config.block_size, config.block_size)) 49 | 50 | self.attn = torch.nn.MultiheadAttention( 51 | embed_dim=config.n_embd, 52 | num_heads=config.n_head, 53 | dropout=config.attn_pdrop, 54 | batch_first=True, 55 | device=device, 56 | dtype=dtype 57 | ) 58 | 59 | def forward(self, x): 60 | _, seq_size, _ = x.size() # batch size, sequence length, embedding dimensionality (n_embd) 61 | y = self.attn(x, x, x, attn_mask=self.mask[0, 0, :seq_size, :seq_size])[0] 62 | y = self.resid_drop(self.c_proj(y)) 63 | return y 64 | 65 | class Block(nn.Module): 66 | """ an unassuming Transformer block """ 67 | def __init__(self, config: GPTConfig): 68 | super().__init__() 69 | self.ln1 = nn.LayerNorm(config.n_embd) 70 | self.ln2 = nn.LayerNorm(config.n_embd) 71 | self.attn = MultiheadAttentionLayer(config) 72 | self.mlp = nn.Sequential( 73 | nn.Linear(config.n_embd, 4 * config.n_embd), 74 | nn.GELU(), 75 | nn.Linear(4 * config.n_embd, config.n_embd), 76 | nn.Dropout(config.resid_pdrop), 77 | ) 78 | 79 | def forward(self, x): 80 | x = x + self.attn(self.ln1(x)) 81 | x = x + self.mlp(self.ln2(x)) 82 | return x 83 | 84 | class EmbeddingStem(nn.Module): 85 | def __init__(self, config: GPTConfig, device="cpu", dtype=torch.float32): 86 | super().__init__() 87 | self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd, device=device, dtype=dtype) 88 | self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd, device=device, dtype=dtype)) 89 | self.drop = nn.Dropout(config.embd_pdrop) 90 | self.block_size = config.block_size 91 | 92 | def reset_parameters(self): 93 | self.tok_emb.reset_parameters() # 将 nn.Embedding 层参数初始化为正态分布采样 94 | 95 | def forward(self, idx): 96 | b, t = idx.size() 97 | assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}" 98 | 99 | token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) embedding vector 100 | position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) position vector 101 | return self.drop(token_embeddings + position_embeddings) 102 | 103 | class GPT(nn.Module): 104 | """ GPT Language Model """ 105 | 106 | def __init__(self, config: GPTConfig): 107 | super().__init__() 108 | self.block_size = config.block_size 109 | config = self._set_model_config(config) 110 | 111 | # input embedding stem 112 | self.emb_stem = EmbeddingStem(config) 113 | # transformer 114 | self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)]) 115 | # decoder head 116 | self.ln_f = nn.LayerNorm(config.n_embd) 117 | self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) 118 | 119 | # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper 120 | self.apply(self._init_weights) 121 | for pn, p in self.named_parameters(): 122 | if pn.endswith('c_proj.weight'): 123 | p.data.normal_(mean=0.0, std=0.02/math.sqrt(2 * config.n_layer)) 124 | 125 | # report number of parameters (note we don't count 
the decoder parameters in lm_head) 126 | n_params = sum(p.numel() for p in self.blocks.parameters()) 127 | print("number of parameters: %.2fM" % (n_params/1e6,)) 128 | 129 | def _set_model_config(self, config): 130 | type_given = config.model_type is not None 131 | params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None]) 132 | # assert type_given ^ params_given # exactly one of these (XOR) 133 | if type_given and not params_given: 134 | # translate from model_type to detailed configuration 135 | config.__dict__.update({ 136 | # names follow the huggingface naming conventions 137 | # GPT-1 138 | 'openai-gpt': dict(n_layer=12, n_head=12, n_embd=768), # 117M params 139 | # GPT-2 configs 140 | 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params 141 | 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params 142 | 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params 143 | 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params 144 | # Gophers 145 | 'gopher-44m': dict(n_layer=8, n_head=16, n_embd=512), 146 | # (there are a number more...) 147 | # I made these tiny models up 148 | 'gpt-mini': dict(n_layer=6, n_head=6, n_embd=192), 149 | 'gpt-micro': dict(n_layer=4, n_head=4, n_embd=128), 150 | 'gpt-nano': dict(n_layer=3, n_head=3, n_embd=48), 151 | }[config.model_type]) 152 | return config 153 | 154 | def _init_weights(self, module): 155 | if isinstance(module, (nn.Linear, nn.Embedding)): 156 | module.weight.data.normal_(mean=0.0, std=0.02) 157 | if isinstance(module, nn.Linear) and module.bias is not None: 158 | module.bias.data.zero_() 159 | elif isinstance(module, nn.LayerNorm): 160 | module.bias.data.zero_() 161 | module.weight.data.fill_(1.0) 162 | 163 | def forward(self, idx, targets=None): 164 | x = self.emb_stem(idx) 165 | x = self.blocks(x) 166 | x = self.ln_f(x) 167 | logits = self.head(x) 168 | 169 | # if we are given some desired targets also calculate the loss 170 | loss = None 171 | if targets is not None: 172 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) 173 | 174 | return logits, loss 175 | 176 | @torch.no_grad() 177 | def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None): 178 | """ 179 | Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete 180 | the sequence max_new_tokens times, feeding the predictions back into the model each time. 181 | Most likely you'll want to make sure to be in model.eval() mode of operation for this. 
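        A minimal usage sketch (hypothetical: `dataset` is assumed to be a CharDataset,
        `device` a CUDA device, and `model` an unwrapped GPT instance; none of these are
        defined in this file, and the prompt characters must appear in the training data):

            context = torch.tensor([[dataset.stoi[c] for c in "O God, O God!"]],
                                   dtype=torch.long, device=device)
            out = model.generate(context, max_new_tokens=200, do_sample=True, top_k=10)
            print(''.join(dataset.itos[int(i)] for i in out[0]))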
182 | """ 183 | for _ in range(max_new_tokens): 184 | # if the sequence context is growing too long we must crop it at block_size 185 | idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:] 186 | # forward the model to get the logits for the index in the sequence 187 | logits, _ = self(idx_cond) 188 | # pluck the logits at the final step and scale by desired temperature 189 | logits = logits[:, -1, :] / temperature 190 | # optionally crop the logits to only the top k options 191 | if top_k is not None: 192 | v, _ = torch.topk(logits, top_k) 193 | logits[logits < v[:, [-1]]] = -float('Inf') 194 | # apply softmax to convert logits to (normalized) probabilities 195 | probs = F.softmax(logits, dim=-1) 196 | # either sample from the distribution or take the most likely element 197 | if do_sample: 198 | idx_next = torch.multinomial(probs, num_samples=1) 199 | else: 200 | _, idx_next = torch.topk(probs, k=1, dim=-1) 201 | # append sampled index to the running sequence and continue 202 | idx = torch.cat((idx, idx_next), dim=1) 203 | 204 | return idx 205 | 206 | 207 | def create_optimizer(model: torch.nn.Module, opt_config: OptimizerConfig): 208 | """ 209 | This long function is unfortunately doing something very simple and is being very defensive: 210 | We are separating out all parameters of the model into two buckets: those that will experience 211 | weight decay for regularization and those that won't (biases, and layernorm/embedding weights). 212 | We are then returning the PyTorch optimizer object. 213 | """ 214 | 215 | # separate out all parameters to those that will and won't experience regularizing weight decay 216 | decay = set() 217 | no_decay = set() 218 | whitelist_weight_modules = (torch.nn.Linear, ) 219 | blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) 220 | for mn, m in model.named_modules(): 221 | for pn, p in m.named_parameters(): 222 | fpn = '%s.%s' % (mn, pn) if mn else pn # full param name 223 | # random note: because named_modules and named_parameters are recursive 224 | # we will see the same tensors p many many times. but doing it this way 225 | # allows us to know which parent module any tensor p belongs to... 226 | if pn.endswith('bias'): 227 | # all biases will not be decayed 228 | no_decay.add(fpn) 229 | elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules): 230 | # weights of whitelist modules will be weight decayed 231 | decay.add(fpn) 232 | elif pn.endswith('in_proj_weight'): 233 | # MHA projection layer 234 | decay.add(fpn) 235 | elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules): 236 | # weights of blacklist modules will NOT be weight decayed 237 | no_decay.add(fpn) 238 | elif pn.endswith('pos_emb'): 239 | # positional embedding shouldn't be decayed 240 | no_decay.add(fpn) 241 | 242 | # validate that we considered every parameter 243 | param_dict = {pn: p for pn, p in model.named_parameters()} 244 | inter_params = decay & no_decay 245 | union_params = decay | no_decay 246 | assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) 247 | assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" 
\ 248 | % (str(param_dict.keys() - union_params), ) 249 | 250 | # create the pytorch optimizer object 251 | optim_groups = [ 252 | {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": opt_config.weight_decay}, 253 | {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}, 254 | ] 255 | optimizer = torch.optim.AdamW(optim_groups, lr=opt_config.learning_rate, betas=(0.9, 0.95)) 256 | return optimizer -------------------------------------------------------------------------------- /minGPT-ddp/mingpt/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network, 3 | so nothing in this file really has anything to do with GPT specifically. 4 | """ 5 | 6 | from dataclasses import dataclass, asdict 7 | from collections import OrderedDict 8 | from typing import Optional, Any, Dict 9 | import os 10 | import torch 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.nn.parallel import DistributedDataParallel as DDP 13 | from torch.utils.data.distributed import DistributedSampler 14 | import fsspec 15 | import torch.distributed as dist 16 | 17 | @dataclass 18 | class TrainerConfig: 19 | max_epochs: int = None 20 | batch_size: int = None 21 | data_loader_workers: int = None 22 | grad_norm_clip: float = None 23 | snapshot_path: Optional[str] = None 24 | save_every: int = None 25 | use_amp: bool = None 26 | 27 | @dataclass 28 | class Snapshot: 29 | model_state: 'OrderedDict[str, torch.Tensor]' 30 | optimizer_state: Dict[str, Any] 31 | finished_epoch: int 32 | 33 | class Trainer: 34 | def __init__(self, trainer_config: TrainerConfig, model, optimizer, train_dataset, test_dataset=None): 35 | self.config = trainer_config 36 | # set torchrun variables 37 | self.local_rank = int(os.environ["LOCAL_RANK"]) # 在所有node的所有进程中当前GPU进程的rank 38 | self.global_rank = int(os.environ["RANK"]) # 在当前node中当前GPU进程的rank 39 | 40 | # data stuff 41 | self.train_dataset = train_dataset 42 | self.train_loader = self._prepare_dataloader(train_dataset) 43 | self.test_loader = self._prepare_dataloader(test_dataset) if test_dataset else None 44 | 45 | # initialize train states 46 | self.epochs_run = 0 47 | self.model = model.to(self.local_rank) 48 | self.optimizer = optimizer 49 | self.save_every = self.config.save_every 50 | 51 | # load snapshot if available. only necessary on the first node. 52 | if self.config.snapshot_path is None: 53 | self.config.snapshot_path = "snapshot.pt" 54 | self._load_snapshot() 55 | 56 | # wrap with DDP. this step will synch model across all the processes. 
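        # At construction, DDP broadcasts the parameters and buffers of the rank-0
        # process to every other rank, so all replicas start from identical weights,
        # and it registers autograd hooks that all-reduce gradients during backward().
        # The underlying module remains reachable as self.model.module, which is what
        # _save_snapshot uses to strip the wrapper before saving.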
57 | self.model = DDP(self.model, device_ids=[self.local_rank]) 58 | 59 | # torch.cuda.amp.GradScaler 是一个用于自动混合精度训练的 PyTorch 工具,它可以帮助加速模型训练并减少显存使用量 60 | # 具体来说,GradScaler 可以将梯度缩放到较小的范围,以避免数值下溢或溢出的问题,同时保持足够的精度以避免模型的性能下降 61 | if self.config.use_amp: 62 | self.scaler = torch.cuda.amp.GradScaler() 63 | 64 | def _prepare_dataloader(self, dataset: Dataset): 65 | return DataLoader( 66 | dataset, 67 | batch_size=self.config.batch_size, 68 | pin_memory=True, 69 | shuffle=False, 70 | num_workers=self.config.data_loader_workers, 71 | sampler=DistributedSampler(dataset) # 这个 sampler 自动将数据分块后送个各个 GPU,它能避免数据重叠 72 | ) 73 | 74 | def _load_snapshot(self): 75 | try: 76 | snapshot = fsspec.open(self.config.snapshot_path) # fsspec 为各种后端存储系统提供统一的 Python 接口,可以用相同的语法打开本地、AWS S3 和 GCS 等各种云存储平台的文件 77 | with snapshot as f: 78 | snapshot_data = torch.load(f, map_location="cpu") 79 | except FileNotFoundError: 80 | print("Snapshot not found. Training model from scratch") 81 | return 82 | 83 | snapshot = Snapshot(**snapshot_data) 84 | self.model.load_state_dict(snapshot.model_state) 85 | self.optimizer.load_state_dict(snapshot.optimizer_state) 86 | self.epochs_run = snapshot.finished_epoch 87 | print(f"Resuming training from snapshot at Epoch {self.epochs_run}") 88 | 89 | def _save_snapshot(self, epoch): 90 | # capture snapshot 91 | model = self.model 92 | raw_model = model.module if hasattr(model, "module") else model 93 | snapshot = Snapshot( 94 | model_state=raw_model.state_dict(), 95 | optimizer_state=self.optimizer.state_dict(), 96 | finished_epoch=epoch 97 | ) 98 | # save snapshot 99 | snapshot = asdict(snapshot) 100 | torch.save(snapshot, self.config.snapshot_path) 101 | print(f"Snapshot saved at epoch {epoch}") 102 | 103 | def _run_batch(self, source, targets, train: bool = True) -> float: 104 | with torch.set_grad_enabled(train), torch.cuda.amp.autocast(dtype=torch.float16, enabled=(self.config.use_amp)): 105 | _, loss = self.model(source, targets) 106 | 107 | if train: 108 | self.optimizer.zero_grad(set_to_none=True) 109 | if self.config.use_amp: 110 | self.scaler.scale(loss).backward() 111 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm_clip) 112 | self.scaler.step(self.optimizer) 113 | self.scaler.update() 114 | else: 115 | loss.backward() 116 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm_clip) 117 | self.optimizer.step() 118 | 119 | #return loss.item() 120 | return loss 121 | 122 | def _run_epoch(self, epoch: int, dataloader: DataLoader, train: bool = True): 123 | dataloader.sampler.set_epoch(epoch) 124 | for iter, (source, targets) in enumerate(dataloader): 125 | step_type = "Train" if train else "Eval" 126 | source = source.to(self.local_rank) 127 | targets = targets.to(self.local_rank) 128 | batch_loss = self._run_batch(source, targets, train) 129 | if iter % 100 == 0: 130 | #print(f"[GPU{self.global_rank}] Epoch {epoch} | Iter {iter} | {step_type} Loss {batch_loss.item():.5f}") 131 | if train: 132 | print(f"[GPU{self.global_rank}] Epoch {epoch} | Iter {iter} | {step_type} Loss {batch_loss.item():.5f}") 133 | else: 134 | eval_loss_list = [torch.zeros_like(batch_loss) for _ in range(int(os.environ['WORLD_SIZE']))] 135 | dist.gather( 136 | batch_loss, 137 | eval_loss_list if self.local_rank == 0 else None, 138 | dst=0 139 | ) 140 | if self.local_rank == 0: 141 | for i, loss in enumerate(eval_loss_list): 142 | print(f"[GPU{i}] Epoch {epoch} | Iter {iter} | {step_type} Loss {loss.item():.5f}") 143 | 144 | def train(self): 145 | for epoch in 
range(self.epochs_run, self.config.max_epochs): 146 | epoch += 1 147 | 148 | # train for one epoch 149 | self._run_epoch(epoch, self.train_loader, train=True) 150 | 151 | # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 snapshot 以免重复保存 152 | if self.local_rank == 0 and epoch % self.save_every == 0: 153 | self._save_snapshot(epoch) 154 | 155 | # eval run 156 | if self.test_loader: 157 | self._run_epoch(epoch, self.test_loader, train=False) 158 | -------------------------------------------------------------------------------- /minGPT-ddp/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.11.0 2 | fsspec 3 | boto3 4 | hydra-core 5 | requests 6 | aiohttp -------------------------------------------------------------------------------- /multi_gpu.py: -------------------------------------------------------------------------------- 1 | # 使用 DistributedDataParallel 进行单机多卡训练 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.utils.data import Dataset, DataLoader 5 | import os 6 | 7 | # 对 python 多进程的一个 pytorch 包装 8 | import torch.multiprocessing as mp 9 | 10 | # 这个 sampler 可以把采样的数据分散到各个 CPU 上 11 | from torch.utils.data.distributed import DistributedSampler 12 | 13 | # 实现分布式数据并行的核心类 14 | from torch.nn.parallel import DistributedDataParallel as DDP 15 | 16 | # DDP 在每个 GPU 上运行一个进程,其中都有一套完全相同的 Trainer 副本(包括model和optimizer) 17 | # 各个进程之间通过一个进程池进行通信,这两个方法来初始化和销毁进程池 18 | from torch.distributed import init_process_group, destroy_process_group 19 | 20 | 21 | def ddp_setup(rank, world_size): 22 | """ 23 | setup the distribution process group 24 | 25 | Args: 26 | rank: Unique identifier of each process 27 | world_size: Total number of processes 28 | """ 29 | # MASTER Node(运行 rank0 进程,多机多卡时的主机)用来协调各个 Node 的所有进程之间的通信 30 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 localhost 31 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 32 | init_process_group( 33 | backend="nccl", # Nvidia CUDA CPU 用这个 "nccl" 34 | rank=rank, 35 | world_size=world_size 36 | ) 37 | torch.cuda.set_device(rank) 38 | 39 | class Trainer: 40 | def __init__( 41 | self, 42 | model: torch.nn.Module, 43 | train_data: DataLoader, 44 | optimizer: torch.optim.Optimizer, 45 | gpu_id: int, 46 | save_every: int, 47 | ) -> None: 48 | self.gpu_id = gpu_id 49 | self.model = model.to(gpu_id) 50 | self.train_data = train_data 51 | self.optimizer = optimizer 52 | self.save_every = save_every # 指定保存 ckpt 的周期 53 | self.model = DDP(model, device_ids=[gpu_id]) # model 要用 DDP 包装一下 54 | 55 | def _run_batch(self, source, targets): 56 | self.optimizer.zero_grad() 57 | output = self.model(source) 58 | loss = F.cross_entropy(output, targets) 59 | loss.backward() 60 | self.optimizer.step() 61 | 62 | def _run_epoch(self, epoch): 63 | b_sz = len(next(iter(self.train_data))[0]) 64 | print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}") 65 | self.train_data.sampler.set_epoch(epoch) # 在各个 epoch 入口调用 DistributedSampler 的 set_epoch 方法是很重要的,这样才能打乱每个 epoch 的样本顺序 66 | for source, targets in self.train_data: 67 | source = source.to(self.gpu_id) 68 | targets = targets.to(self.gpu_id) 69 | self._run_batch(source, targets) 70 | 71 | def _save_checkpoint(self, epoch): 72 | ckp = self.model.module.state_dict() # 由于多了一层 DDP 包装,通过 .module 获取原始参数 73 | PATH = "checkpoint.pt" 74 | torch.save(ckp, PATH) 75 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 76 | 77 | def train(self, max_epochs: int): 78 | for epoch in range(max_epochs): 79 | self._run_epoch(epoch) 80 
| # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 ckpt 以免重复保存 81 | if self.gpu_id == 0 and epoch % self.save_every == 0: 82 | self._save_checkpoint(epoch) 83 | 84 | class MyTrainDataset(Dataset): 85 | def __init__(self, size): 86 | self.size = size 87 | self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] 88 | 89 | def __len__(self): 90 | return self.size 91 | 92 | def __getitem__(self, index): 93 | return self.data[index] 94 | 95 | def load_train_objs(): 96 | train_set = MyTrainDataset(2048) # load your dataset 97 | model = torch.nn.Linear(20, 1) # load your model 98 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 99 | return train_set, model, optimizer 100 | 101 | 102 | def prepare_dataloader(dataset: Dataset, batch_size: int): 103 | return DataLoader( 104 | dataset, 105 | batch_size=batch_size, 106 | pin_memory=True, 107 | shuffle=False, # 设置了新的 sampler,参数 shuffle 要设置为 False 108 | sampler=DistributedSampler(dataset) # 这个 sampler 自动将数据分块后送个各个 GPU,它能避免数据重叠 109 | ) 110 | 111 | 112 | def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int): 113 | # 初始化进程池 114 | ddp_setup(rank, world_size) 115 | 116 | # 进行训练 117 | dataset, model, optimizer = load_train_objs() 118 | train_data = prepare_dataloader(dataset, batch_size) 119 | trainer = Trainer(model, train_data, optimizer, rank, save_every) 120 | trainer.train(total_epochs) 121 | 122 | # 销毁进程池 123 | destroy_process_group() 124 | 125 | 126 | if __name__ == "__main__": 127 | import argparse 128 | parser = argparse.ArgumentParser(description='simple distributed training job') 129 | parser.add_argument('--total-epochs', type=int, default=50, help='Total epochs to train the model') 130 | parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot') 131 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 132 | args = parser.parse_args() 133 | 134 | world_size = torch.cuda.device_count() 135 | 136 | # 利用 mp.spawn,在整个 distribution group 的 nprocs 个 GPU 上生成进程来执行 fn 方法,并能设置要传入 fn 的参数 args 137 | # 注意不需要 fn 的 rank 参数,它由 mp.spawn 自动分配 138 | mp.spawn( 139 | fn=main, 140 | args=(world_size, args.save_every, args.total_epochs, args.batch_size), 141 | nprocs=world_size 142 | ) -------------------------------------------------------------------------------- /multi_gpu_torchrun.py: -------------------------------------------------------------------------------- 1 | # 使用 DistributedDataParallel 进行单机多卡训练的基础上,使用 torchrun 进行容错处理,增强程序稳定性 2 | # torchrun 允许我们在训练过程中按一定保存 snapshots,其中应当包含当前 epoch、模型参数(ckpt)、优化器参数、lr调度器参数等恢复训练所需的全部参数 3 | # 一旦程序出错退出,torchrun 会自动从最近 snapshots 重启所有进程 4 | # 除了增强稳定性外,torchrun 还会自动完成所有环境变量设置和进程分配工作,所以不再需要手动设置 rank 或用 mp.spawn 生成并分配进程 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.utils.data import Dataset, DataLoader 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | # 对 python 多进程的一个 pytorch 包装 14 | import torch.multiprocessing as mp 15 | 16 | # 这个 sampler 可以把采样的数据分散到各个 CPU 上 17 | from torch.utils.data.distributed import DistributedSampler 18 | 19 | # 实现分布式数据并行的核心类 20 | from torch.nn.parallel import DistributedDataParallel as DDP 21 | 22 | # DDP 在每个 GPU 上运行一个进程,其中都有一套完全相同的 Trainer 副本(包括model和optimizer) 23 | # 各个进程之间通过一个进程池进行通信,这两个方法来初始化和销毁进程池 24 | from torch.distributed import init_process_group, destroy_process_group 25 | 26 | 27 | def ddp_setup(): 28 | # torchrun 会处理环境变量以及 rank & world_size 设置 29 | os.environ["MASTER_ADDR"] = "localhost" # 由于这里是单机实验所以直接写 
localhost 30 | os.environ["MASTER_PORT"] = "12355" # 任意空闲端口 31 | init_process_group(backend="nccl") 32 | torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) 33 | 34 | class Trainer: 35 | def __init__( 36 | self, 37 | model: torch.nn.Module, 38 | train_data: DataLoader, 39 | optimizer: torch.optim.Optimizer, 40 | save_every: int, 41 | snapshot_path: str, # 保存 snapshots 的位置 42 | ) -> None: 43 | self.gpu_id = int(os.environ['LOCAL_RANK']) # torchrun 会自动设置这个环境变量指出当前进程的 rank 44 | self.world_size = int(os.environ['WORLD_SIZE']) 45 | self.model = model.to(self.gpu_id) 46 | self.train_data = train_data 47 | self.optimizer = optimizer 48 | self.save_every = save_every # 指定保存 snapshots 的周期 49 | self.epochs_run = 0 # 存储将要保存在 snapshots 中的 epoch num 信息 50 | self.snapshot_path = snapshot_path 51 | 52 | # 若存在 snapshots 则加载,这样重复运行指令就能自动继续训练了 53 | if os.path.exists(snapshot_path): 54 | print('loading snapshot') 55 | self._load_snapshot(snapshot_path) 56 | 57 | self.model = DDP(self.model, device_ids=[self.gpu_id]) # model 要用 DDP 包装一下 58 | 59 | def _load_snapshot(self, snapshot_path): 60 | ''' 加载 snapshot 并重启训练 ''' 61 | loc = f"cuda:{self.gpu_id}" 62 | snapshot = torch.load(snapshot_path, map_location=loc) 63 | self.model.load_state_dict(snapshot["MODEL_STATE"]) 64 | self.epochs_run = snapshot["EPOCHS_RUN"] 65 | print(f"Resuming training from snapshot at Epoch {self.epochs_run}") 66 | 67 | def _run_batch(self, source, targets): 68 | self.optimizer.zero_grad() 69 | output = self.model(source) 70 | loss = torch.mean(F.mse_loss(output, targets)) 71 | loss.backward() 72 | self.optimizer.step() 73 | return loss.item() 74 | 75 | def _run_epoch(self, epoch): 76 | epoch_losses = [] 77 | self.train_data.sampler.set_epoch(epoch) # 设置 epoch 保证多 GPU 上数据不重叠 78 | for source, targets in self.train_data: 79 | source = source.to(self.gpu_id) 80 | targets = targets.to(self.gpu_id) 81 | loss = self._run_batch(source, targets) 82 | epoch_losses.append(loss) 83 | return np.mean(epoch_losses) 84 | 85 | def _save_snapshot(self, epoch): 86 | # 在 snapshot 中保存恢复训练所必须的参数 87 | snapshot = { 88 | "MODEL_STATE": self.model.module.state_dict(), # 由于多了一层 DDP 包装,通过 .module 获取原始参数 89 | "EPOCHS_RUN": epoch, 90 | } 91 | torch.save(snapshot, self.snapshot_path) 92 | #print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}") 93 | 94 | def train(self, max_epochs: int): 95 | # 现在从 self.epochs_run 开始训练,统一重启的情况 96 | with tqdm(total=max_epochs, desc=f"[GPU{self.gpu_id}] Training", position=self.gpu_id, initial=self.epochs_run) as pbar: 97 | for epoch in range(self.epochs_run + 1, max_epochs + 1): 98 | epoch_loss = self._run_epoch(epoch) 99 | 100 | # 各个 GPU 上都在跑一样的训练进程,这里指定 rank0 进程保存 snapshot 以免重复保存 101 | if self.gpu_id == 0 and epoch % self.save_every == 0: 102 | self._save_snapshot(epoch) 103 | 104 | pbar.set_postfix({'epoch': epoch, 'loss':'{:.2f}'.format(epoch_loss)}) 105 | pbar.update() 106 | 107 | class MyTrainDataset(Dataset): 108 | def __init__(self, size): 109 | self.size = size 110 | 111 | # Simple Linear Regression problem 112 | input_dim = 2 113 | output_dim = 1 114 | true_w = torch.Tensor([-2, 3.4]).view(input_dim, output_dim) 115 | true_b = 4.2 116 | 117 | features = torch.randn(size=(size, input_dim), dtype=torch.float32) 118 | labels = torch.mm(features,true_w) + true_b 119 | labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32) 120 | 121 | self.data = [(features[i], labels[i]) for i in range(size)] 122 | 123 | def __len__(self): 124 | return self.size 125 | 126 | def 
107 | class MyTrainDataset(Dataset):
108 |     def __init__(self, size):
109 |         self.size = size
110 | 
111 |         # Simple Linear Regression problem
112 |         input_dim = 2
113 |         output_dim = 1
114 |         true_w = torch.Tensor([-2, 3.4]).view(input_dim, output_dim)
115 |         true_b = 4.2
116 | 
117 |         features = torch.randn(size=(size, input_dim), dtype=torch.float32)
118 |         labels = torch.mm(features, true_w) + true_b
119 |         labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32)
120 | 
121 |         self.data = [(features[i], labels[i]) for i in range(size)]
122 | 
123 |     def __len__(self):
124 |         return self.size
125 | 
126 |     def __getitem__(self, index):
127 |         return self.data[index]
128 | 
129 | def load_train_objs():
130 |     train_set = MyTrainDataset(2048)  # load your dataset
131 |     model = torch.nn.Linear(2, 1)  # load your model
132 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
133 |     return train_set, model, optimizer
134 | 
135 | def prepare_dataloader(dataset: Dataset, batch_size: int):
136 |     return DataLoader(
137 |         dataset,
138 |         batch_size=batch_size,
139 |         pin_memory=True,
140 |         shuffle=False,  # must be False because a custom sampler is supplied
141 |         sampler=DistributedSampler(dataset)  # shards the data across the GPUs with no overlap
142 |     )
143 | 
144 | def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str = "snapshot.pt"):
145 |     # initialize the process group
146 |     ddp_setup()
147 | 
148 |     # run training
149 |     dataset, model, optimizer = load_train_objs()
150 |     train_data = prepare_dataloader(dataset, batch_size)
151 |     trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
152 |     trainer.train(total_epochs)
153 | 
154 |     # destroy the process group
155 |     destroy_process_group()
156 | 
157 | if __name__ == "__main__":
158 |     import argparse
159 |     parser = argparse.ArgumentParser(description='simple distributed training job')
160 |     parser.add_argument('--total-epochs', type=int, default=100, help='Total epochs to train the model')
161 |     parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot')
162 |     parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
163 |     args = parser.parse_args()
164 | 
165 |     # torchrun now launches one process per GPU and runs main in each of them, so mp.spawn is no longer needed
166 |     main(args.save_every, args.total_epochs, args.batch_size)
167 | 
168 | '''
169 | Run with:
170 |     torchrun --standalone --nproc_per_node=gpu multi_gpu_torchrun.py
171 | 
172 | Arguments:
173 |     --standalone            single-node run
174 |     --nproc_per_node=gpu    use all visible GPUs; a number n can be given instead, which uses the first n GPUs
175 | 
176 | Environment variables available inside the processes:
177 |     os.environ['RANK']        global rank of the current process across all nodes
178 |     os.environ['LOCAL_RANK']  rank of the current process on this node
179 |     os.environ['WORLD_SIZE']  total number of processes (GPUs)
180 | 
181 | Restrict which GPUs the program can see (and thus run on specific GPUs) with CUDA_VISIBLE_DEVICES:
182 |     CUDA_VISIBLE_DEVICES=0,3 torchrun --standalone --nproc_per_node=gpu multi_gpu_torchrun.py
183 | 
184 | '''
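Each rank's progress bar above reports only the mean loss over that rank's own data shard. If one global number is wanted, the per-rank value can be averaged over the process group. A small sketch (the helper name global_mean is ours, not part of the script), usable anywhere after init_process_group:

import torch
import torch.distributed as dist

def global_mean(value: float, device) -> float:
    t = torch.tensor([value], dtype=torch.float32, device=device)  # NCCL requires the tensor to live on the GPU
    dist.all_reduce(t, op=dist.ReduceOp.SUM)                       # sum the per-rank values in place
    return (t / dist.get_world_size()).item()

# e.g. at the end of Trainer._run_epoch:  return global_mean(np.mean(epoch_losses), self.gpu_id)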
"checkpoint.pt" 39 | torch.save(ckp, PATH) 40 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 41 | 42 | def train(self, max_epochs: int): 43 | for epoch in range(max_epochs): 44 | self._run_epoch(epoch) 45 | if epoch % self.save_every == 0: 46 | self._save_checkpoint(epoch) 47 | 48 | class MyTrainDataset(Dataset): 49 | def __init__(self, size): 50 | self.size = size 51 | self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] 52 | 53 | def __len__(self): 54 | return self.size 55 | 56 | def __getitem__(self, index): 57 | return self.data[index] 58 | 59 | 60 | 61 | def load_train_objs(): 62 | train_set = MyTrainDataset(2048) # load your dataset 63 | model = torch.nn.Linear(20, 1) # load your model 64 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 65 | return train_set, model, optimizer 66 | 67 | def prepare_dataloader(dataset: Dataset, batch_size: int): 68 | return DataLoader( 69 | dataset, 70 | batch_size=batch_size, 71 | pin_memory=True, 72 | shuffle=True 73 | ) 74 | 75 | def main(device, total_epochs, save_every, batch_size): 76 | dataset, model, optimizer = load_train_objs() 77 | train_data = prepare_dataloader(dataset, batch_size) 78 | trainer = Trainer(model, train_data, optimizer, device, save_every) 79 | trainer.train(total_epochs) 80 | 81 | 82 | 83 | if __name__ == "__main__": 84 | import argparse 85 | parser = argparse.ArgumentParser(description='simple distributed training job') 86 | parser.add_argument('--total-epochs', type=int, default=50, help='Total epochs to train the model') 87 | parser.add_argument('--save-every', type=int, default=10, help='How often to save a snapshot') 88 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 89 | args = parser.parse_args() 90 | 91 | device = 0 # shorthand for cuda:0 92 | main(device, args.total_epochs, args.save_every, args.batch_size) --------------------------------------------------------------------------------