├── .gitignore ├── LICENSE ├── README.md ├── arguments ├── inference_args.py └── training_args.py ├── config └── dense_model.json ├── ddp_inference_gather.py ├── ds_config ├── README.md └── zero2.json ├── inference.py ├── inference_deepspeed.py ├── models ├── README.md ├── __init__.py ├── dense_model │ ├── __init__.py │ ├── dataloader.py │ ├── datamodule.py │ ├── drop_scheduler.py │ └── model.py └── rnn_model │ ├── __init__.py │ └── model.py ├── pip_install_deepspeed.sh ├── requirements.txt ├── scripts ├── README.md ├── run_inference.sh ├── run_inference_deepspeed.sh ├── run_train_cpu.sh ├── run_train_deepspeed.sh ├── run_train_gpu.sh └── run_train_gpu_ddp.sh ├── train.py └── utils ├── README.md ├── __init__.py ├── comfy.py └── config_loader.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb log 132 | wandb/ 133 | 134 | # datasets 135 | *.pt 136 | deepspeed_all_result.txt 137 | model_outputs/ 138 | lightning_logs/ 139 | distributed_result/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 YooSungHyun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NEW Release 20230312!!! 2 | 3 | update dropout **Dropout Reduces Underfitting** 4 | 5 | [paper](https://arxiv.org/abs/2303.01500) 6 | 7 | [git](https://github.com/facebookresearch/dropout) 8 | 9 | ## Usage 10 | 11 | CHECK PLZ `run_train_gpu.sh` and `./models/dense_model/` 12 | 13 | ### 1. make drop_scheduler.py 14 | 15 | ### 2. 
make dropout scheduling function your each model classes 16 | 17 | **Example** 18 | 19 | ```python 20 | # 1) update dropout function 21 | def update_dropout(self, drop_rate): 22 | self.drop_rate = drop_rate 23 | for module in self.modules(): 24 | if isinstance(module, nn.Dropout): 25 | module.p = drop_rate 26 | 27 | # 2) we have to count each step (for calculate schedulering global steps) 28 | def on_train_start(self): 29 | from models.dense_model.drop_scheduler import drop_scheduler 30 | 31 | self.drop_scheduler = {} 32 | if self.args.dropout_p > 0.0: 33 | self.drop_scheduler["do"] = drop_scheduler( 34 | self.args.dropout_p, 35 | self.args.max_epochs, 36 | self.trainer.num_training_batches, 37 | self.args.cutoff_epoch, 38 | self.args.drop_mode, 39 | self.args.drop_schedule, 40 | ) 41 | print( 42 | "on_train_start :: Min DO = %.7f, Max DO = %.7f" 43 | % (min(self.drop_scheduler["do"]), max(self.drop_scheduler["do"])) 44 | ) 45 | 46 | # 3) Finally, you can scheduling dropout prob in your training_step 47 | if "do" in self.drop_scheduler: 48 | dropout_p = self.drop_scheduler["do"][self.trainer.global_step] 49 | self.update_dropout(dropout_p) 50 | self.log("dropout_p", dropout_p, sync_dist=(self.device != "cpu")) 51 | ``` 52 | 53 | ### 3. we have to input dropout schedule setting in training_arg 54 | 55 | ### 4. replace run script 56 | 57 | ``` bash 58 | --dropout_p=0.1 \ 59 | --cutoff_epoch=1 \ 60 | --drop_mode=standard \ 61 | --drop_schedule=constant 62 | ``` 63 | 64 | 65 | 66 | ### If you want to use, normal style dropout, input dropout_p and `drop_mode=standard` (default) and `drop_schedule=constant` (default) 67 | you can check your dropout scheduling process in wandb 68 | 69 | ![image](https://user-images.githubusercontent.com/34292279/224522943-37e5779a-3f40-44c0-8aec-5a13f84ff47c.png) 70 | 71 | 72 | # pytorch-lightning-template 73 | 74 | very simple but, write down is boring
75 | boring boilerplate code, rolled up for you ⚡
76 | **If you need a function or anything else, please open an issue (in English or Korean). I will reply and implement it ASAP!!**
77 | 78 | - DataModule more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html) 79 | - Model more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html) 80 | - Inference more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/deploy/production_intermediate.html) 81 | - WanDB with lightning more detail 82 | - [Weight & Bias Dev Guide 1](https://wandb.ai/wandb_fc/korean/reports/Weights-Biases-Pytorch-Lightning---VmlldzozNzAxOTg) 83 | - [Weight & Bias Dev Guide 2](https://docs.wandb.ai/guides/integrations/lightning) 84 | 85 | # WanDB 86 | 87 | https://docs.wandb.ai/v/ko/quickstart 88 | 89 | # Training Detail 90 | 91 | - Using DDP, Not DP or CPU
92 | If you want to use DP or CPU instead, change the relevant arguments or the Python script
93 | See more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/accelerators/gpu_intermediate.html) 94 | - Optimizer: AdamW 95 | - LearningRate Scheduler: OneCycleLR 96 | - see more detail: [PyTorch Dev Guide](https://pytorch.org/docs/stable/optim.html) 97 | - Monitoring Tool: WanDB 98 | 99 | # Pytorch-Lightning Life Cycle 100 | 101 | ## Training 102 | 103 | 1. train.py(main) -> argparse 104 | - using [simple_parsing library](https://github.com/lebrice/SimpleParsing) looks like HFArgumentParser 105 | - Trainer Argument placed with `pl.Trainer.add_argparse_args` (automatic define argparse) 106 | 2. def] WandbLogger, set seed(os, random, np, torch, torch.cuda) 107 | 3. def] CustomDataModule (`LightningDataModule`) 108 | - You Not have to using `LightningDataModule`. but, if you implement that in 'LightningModule', source code is looked mess 109 | - DataModule important `prepare_data` and `setup` 110 | - `prepare_data` is only run on cpu and not multi processing (Warning, if you using distributed learning, this place's variable is not share) 111 | - I recommand, It just using data download or datasets save 112 | - `setup` is run on gpu or cpu and distributed. using map or dataload or something! 113 | - `setup` can have stage(fit (train), test, predict) 114 | - DataModule can have each stage's dataloader 115 | - using default or someting 116 | - Dataset can define this section or making each python script and just import & using! 117 | 4. def] CustomNet (`LightningModule`) 118 | - each step and step_end or epoch, epoch_end 119 | - i think using just training_step, validation_step, validation_epoch_end is simple and best 120 | - training_step -> forward -> configure_optimizers 121 | - when count in each validation step (each batch step validation) -> validation_epoch_end (all batch result gather) -> log (on wandb) 122 | 5. wandb logger additional setting 123 | 6. checkpoint setting 124 | - monitor name is same on your each step's log name 125 | 7. learning_rate monitor setting 126 | 8. ddp strategy modify 127 | - if your dataset is so big to ddp, timeout parameter change like that 128 | - huggingface is so hard to make it. but lightning is feel free 129 | 9. make trainer to your arg 130 | 10. training run and model save! 131 | 132 | ### Training Script Usage 133 | 134 | 1. cd your project root(./pytorch-lightning-template) 135 | 136 | ``` 137 | # Don't Script RUN in your scripts FOLDER!!!!! CHK PLZ!!!!!!! 138 | bash scripts/run_train_~~~.sh 139 | ``` 140 | 141 | ## Inference 142 | 143 | 1. inference.py(main) -> argparse 144 | 2. set seed 145 | 3. model load (second param is your model init param) 146 | 4. simply torch inference & END! 147 | 148 | ### Inference Script Usage 149 | 150 | 1. cd your project root(./pytorch-lightning-template) 151 | 152 | ``` 153 | # Don't Script RUN in your scripts FOLDER!!!!! CHK PLZ!!!!!!! 154 | bash scripts/run_inference~~~.sh 155 | ``` 156 | 157 | # (Optional) Install DeepSpeed 158 | 159 | 1. 
run pip_install_deepspeed.sh 160 | 161 | ``` 162 | bash pip_install_deepspeed.sh 163 | ``` 164 | -------------------------------------------------------------------------------- /arguments/inference_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class InferenceArguments: 6 | """Help string for this group of command-line arguments""" 7 | 8 | seed: int = None # all seed 9 | local_rank: int = None # ddp local rank 10 | model_path: str = "model_outputs" # target pytorch lightning model dir 11 | config_path: str = "model_outputs" # target pytorch lightning model dir 12 | per_device_test_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for evaluation. 13 | model_select: str = "linear" # linear or rnn 14 | truncated_bptt_steps: int = 1 # TBPTT step size 15 | valid_on_cpu: bool = False # If you want to run validation_step on cpu -> true 16 | -------------------------------------------------------------------------------- /arguments/training_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import simple_parsing as sp 3 | 4 | 5 | @dataclass 6 | class TrainingArguments: 7 | """Help string for this group of command-line arguments""" 8 | 9 | seed: int = None # all seed 10 | local_rank: int = None # ddp local rank 11 | data_dir: str = "datasets" # target pytorch lightning data dirs 12 | ratio: float = 0.2 # train/valid split ratio 13 | output_dir: str = "model_outputs" # model output path 14 | config_path: str = "config/dense_model.json" 15 | num_workers: int = None # how many proc map? 16 | learning_rate: float = 0.001 # learning rate 17 | warmup_ratio: float = 0.2 # learning rate scheduler warmup ratio per EPOCH 18 | max_lr: float = 0.01 # lr_scheduler max learning rate 19 | div_factor: int = 25 # initial_lr = max_lr/div_factor 20 | final_div_factor: int = 1e4 # (max_lr/div_factor)*final_div_factor is final lr 21 | weight_decay: float = 0.0001 # weigth decay 22 | per_device_train_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for training. 23 | per_device_eval_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for evaluation. 24 | valid_on_cpu: bool = False # If you want to run validation_step on cpu -> true 25 | model_select: str = "linear" # linear or rnn 26 | truncated_bptt_steps: int = 1 # TBPTT step size 27 | deepspeed_config: str = "ds_config/zero2.json" 28 | dropout_p: float = 0.0 # Drop path rate (default: 0.0) 29 | cutoff_epoch: int = 0 # if drop_mode is early / late, this is the epoch where dropout ends / starts 30 | drop_mode: str = sp.field(default="standard", choices=["standard", "early", "late"]) # drop mode 31 | drop_schedule: str = sp.field( 32 | default="constant", choices=["constant", "linear"] 33 | ) # drop schedule for early dropout / s.d. 
only 34 | -------------------------------------------------------------------------------- /config/dense_model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "input_dense_dim": 512, 4 | "output_dense_dim": 256 5 | } 6 | } -------------------------------------------------------------------------------- /ddp_inference_gather.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | gpu_all_result = list() 5 | gpu0_result = torch.load("distributed_result/predictions_0.pt") 6 | for item in gpu0_result: 7 | gpu_all_result.extend(item) 8 | gpu1_result = torch.load("distributed_result/predictions_1.pt") 9 | for item in gpu1_result: 10 | gpu_all_result.extend(item) 11 | gpu2_result = torch.load("distributed_result/predictions_2.pt") 12 | for item in gpu2_result: 13 | gpu_all_result.extend(item) 14 | gpu3_result = torch.load("distributed_result/predictions_3.pt") 15 | for item in gpu3_result: 16 | gpu_all_result.extend(item) 17 | gpu_all_result = sorted(gpu_all_result, key=lambda x: x[0]) 18 | all_result = list() 19 | for item in gpu_all_result: 20 | # item is (index, torch([int])) 21 | all_result.append(item[1].detach()[0]) 22 | test = torch.stack(all_result) 23 | # test is torch([all result of int]) 24 | np.savetxt("deepspeed_all_result.txt", test.numpy()) 25 | -------------------------------------------------------------------------------- /ds_config/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Config 2 | 3 | Maybe, In pytorch-lightning**⚡**, You can not use `AUTO config value`. (Not Like HuggingFace, so uncomfortable) 4 | 5 | -------------------------------------------------------------------------------- /ds_config/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 5e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 5e8, 17 | "contiguous_gradients": true, 18 | "cpu_offload": false 19 | }, 20 | "steps_per_print": 2000, 21 | "wall_clock_breakdown": false 22 | } -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from models.dense_model.model import CustomNet 4 | from models.rnn_model.model import LSTMModel 5 | from utils.compy import dataclass_to_namespace 6 | from arguments.inference_args import InferenceArguments 7 | from simple_parsing import ArgumentParser 8 | 9 | 10 | def main(hparams): 11 | pl.seed_everything(hparams.seed) 12 | 13 | if hparams.model_select == "linear": 14 | model = CustomNet.load_from_checkpoint(hparams.model_path, args=hparams) 15 | features = torch.randn(1, 512) 16 | else: 17 | model = LSTMModel.load_from_checkpoint(hparams.model_path, args=hparams) 18 | features = torch.randn(200, 1) 19 | model.eval() 20 | with torch.no_grad(): 21 | logits = model(features) 22 | print(logits) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = ArgumentParser() 27 | parser = pl.Trainer.add_argparse_args(parser) 28 | 
parser.add_arguments(InferenceArguments, dest="inference_args") 29 | args = parser.parse_args() 30 | args = dataclass_to_namespace(args, "inference_args") 31 | main(args) 32 | -------------------------------------------------------------------------------- /inference_deepspeed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pytorch_lightning as pl 4 | from utils.compy import dataclass_to_namespace 5 | from arguments.inference_args import InferenceArguments 6 | from simple_parsing import ArgumentParser 7 | from models.dense_model.model import CustomNet 8 | from models.dense_model.datamodule import CustomDataset 9 | from models.rnn_model.model import LSTMModel 10 | from pytorch_lightning.callbacks import BasePredictionWriter 11 | from torch.utils.data import DataLoader, TensorDataset 12 | 13 | 14 | class CustomWriter(BasePredictionWriter): 15 | def __init__(self, output_dir, write_interval): 16 | super().__init__(write_interval) 17 | self.output_dir = output_dir 18 | 19 | def write_on_epoch_end(self, trainer, pl_module, predictions, batch_indices): 20 | result_list = list() 21 | for batch_index, pred in list(zip(batch_indices[0], predictions[0])): 22 | result_list.append(list(zip(batch_index, pred))) 23 | torch.save(result_list, os.path.join(self.output_dir, f"predictions_{trainer.global_rank}.pt")) 24 | 25 | 26 | def read_json(fname): 27 | from pathlib import Path 28 | import json 29 | 30 | fname = Path(fname) 31 | with fname.open("rt") as handle: 32 | return json.load(handle) 33 | 34 | 35 | def on_load_checkpoint(checkpoint): 36 | state_dict = {k.partition("_forward_module.")[2]: checkpoint[k] for k in checkpoint.keys()} 37 | checkpoint["state_dict"] = state_dict 38 | return checkpoint 39 | 40 | 41 | def main(hparams): 42 | pl.seed_everything(hparams.seed) 43 | device = torch.device("cuda") 44 | os.makedirs("distributed_result", exist_ok=True) 45 | if hparams.model_select == "linear": 46 | model = CustomNet(hparams) 47 | checkpoint = torch.load(hparams.model_path, map_location=device) 48 | checkpoint = on_load_checkpoint(checkpoint) 49 | model.load_state_dict(checkpoint["state_dict"]) 50 | features = torch.randn(10, 512) 51 | temp_label = torch.randn(10, 1) 52 | infer_datasets = CustomDataset(features, temp_label) 53 | infer_loader = DataLoader( 54 | dataset=infer_datasets, batch_size=hparams.per_device_test_batch_size, num_workers=4, pin_memory=True 55 | ) 56 | else: 57 | model = LSTMModel(hparams) 58 | checkpoint = torch.load(hparams.model_path, map_location=device) 59 | checkpoint = on_load_checkpoint(checkpoint) 60 | model.load_state_dict(checkpoint["state_dict"]) 61 | features = torch.randn(200, 1) 62 | temp_label = torch.randn(200, 1) 63 | infer_datasets = TensorDataset(features, temp_label) 64 | infer_loader = DataLoader( 65 | dataset=infer_datasets, batch_size=hparams.per_device_test_batch_size, num_workers=4, pin_memory=True 66 | ) 67 | 68 | pred_writer = CustomWriter(output_dir="distributed_result", write_interval="epoch") 69 | hparams.callbacks = [pred_writer] 70 | trainer = pl.Trainer.from_argparse_args(hparams) 71 | 72 | trainer.predict(model, infer_loader, return_predictions=False) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = ArgumentParser() 77 | parser = pl.Trainer.add_argparse_args(parser) 78 | parser.add_arguments(InferenceArguments, dest="inference_args") 79 | args = parser.parse_args() 80 | args = dataclass_to_namespace(args, "inference_args") 81 | main(args) 82 | 
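Note: `CustomWriter.write_on_epoch_end` above saves one `predictions_{rank}.pt` per process, and `ddp_inference_gather.py` merges exactly four of them by hard-coding ranks 0-3. The snippet below is a minimal, illustrative sketch (not part of the repo) of that same gather step, written to be rank-count-agnostic via `glob`; the file-name pattern and the `(index, tensor)` pair layout come from `write_on_epoch_end`, everything else is an assumption.

```python
# Illustrative, rank-count-agnostic variant of ddp_inference_gather.py.
import glob

import numpy as np
import torch

pairs = []  # flat list of (global sample index, prediction tensor) tuples
for path in sorted(glob.glob("distributed_result/predictions_*.pt")):
    for chunk in torch.load(path):  # each chunk is a list of (index, tensor) tuples
        pairs.extend(chunk)

pairs.sort(key=lambda pair: pair[0])  # restore the original dataset order across ranks
all_preds = torch.stack([pred.detach()[0] for _, pred in pairs])
np.savetxt("deepspeed_all_result.txt", all_preds.numpy())
```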
-------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | # Example models 2 | 3 | We have 2 concept model 4 | 5 | ## 1. dense model 6 | 7 | **I use all of pytorch-lightning⚡'s class as much as i can** 8 | 9 | Custom Dataloader, Custom Dataset, LightningDataModule and also you can use Custom DataSampler (if you have one...) 10 | 11 | If you want to perfect set of lightning template. PLZ watch this model and paste it! 12 | 13 | ## 2. rnn model 14 | 15 | **Just Look Simply as much as i can** 16 | 17 | I just use torch style format. 18 | 19 | If you have to make something very quickly and simply, PLZ watch this model and paste it! 20 | 21 | ALSO, you want to use pytorch-lightning⚡ style **BPTT** in rnn model, watch this too -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/__init__.py -------------------------------------------------------------------------------- /models/dense_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/dense_model/__init__.py -------------------------------------------------------------------------------- /models/dense_model/dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class CustomDataLoader(torch.utils.data.DataLoader): 5 | def __init__(self, *args, **kwargs): 6 | """ 7 | Creates a data loader for AudioDatasets. 
8 | """ 9 | super(CustomDataLoader, self).__init__(*args, **kwargs) 10 | self.collate_fn = self._collate_fn 11 | 12 | def _collate_fn(self, batch): 13 | features = [s["features"] for s in batch] 14 | feature_lengths = [s["features"].size(0) for s in batch] 15 | labels = [s["label"] for s in batch] 16 | label_lengths = [len(s["label"]) for s in batch] 17 | 18 | features = torch.FloatTensor(features) 19 | labels = torch.LongTensor(labels) 20 | feature_lengths = torch.IntTensor(feature_lengths) 21 | label_lengths = torch.IntTensor(label_lengths) 22 | 23 | return features, labels, feature_lengths, label_lengths 24 | -------------------------------------------------------------------------------- /models/dense_model/datamodule.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pytorch_lightning as pl 4 | from argparse import Namespace 5 | from sklearn.model_selection import train_test_split 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | 9 | class CustomDataset(Dataset): 10 | def __init__(self, x, y): 11 | self.x = torch.FloatTensor(x) 12 | self.y = torch.FloatTensor(y) 13 | 14 | def __len__(self): 15 | return len(self.x) 16 | 17 | def __getitem__(self, idx): 18 | return (self.x[idx], self.y[idx], len(self.x[idx]), len(self.y[idx])) 19 | 20 | 21 | class CustomDataModule(pl.LightningDataModule): 22 | def __init__(self, args: Namespace): 23 | super().__init__() 24 | self.data_dir = args.data_dir 25 | self.ratio = args.ratio 26 | self.per_device_train_batch_size = args.per_device_train_batch_size 27 | self.per_device_eval_batch_size = args.per_device_eval_batch_size 28 | self.num_workers = args.num_workers 29 | 30 | def prepare_data(self): 31 | if not os.path.isfile("./valid.pt"): 32 | features = torch.randn(2000, 512) 33 | labels = torch.randn(2000, 1) 34 | train_x, valid_x, train_y, valid_y = train_test_split(features, labels, test_size=self.ratio) 35 | train_datasets = CustomDataset(train_x, train_y) 36 | valid_datasets = CustomDataset(valid_x, valid_y) 37 | torch.save(train_datasets, "./train.pt") 38 | torch.save(valid_datasets, "./valid.pt") 39 | else: 40 | pass 41 | 42 | def setup(self, stage: str): 43 | if stage == "fit": 44 | self.train_datasets = torch.load("./train.pt") 45 | self.valid_datasets = torch.load("./valid.pt") 46 | if stage == "test": 47 | pass 48 | 49 | def train_dataloader(self): 50 | # TODO: If you want to use custom sampler and loader follow like this 51 | # from transformers.trainer_pt_utils import DistributedLengthGroupedSampler 52 | # train_sampler = DistributedLengthGroupedSampler( 53 | # batch_size=self.per_device_eval_batch_size, 54 | # dataset=self.train_datasets, 55 | # model_input_name="features", 56 | # lengths=self.train_datasets["feature_lenghths"], 57 | # ) 58 | # return CustomDataLoader( 59 | # dataset=self.train_datasets, 60 | # batch_size=self.per_device_train_batch_size, 61 | # sampler=train_sampler, 62 | # num_workers=self.num_workers, 63 | # pin_memory=True, 64 | # ) 65 | 66 | # TODO: If you want to use default loader 67 | return DataLoader( 68 | dataset=self.train_datasets, 69 | batch_size=self.per_device_train_batch_size, 70 | num_workers=self.num_workers, 71 | pin_memory=True, 72 | ) 73 | 74 | def val_dataloader(self): 75 | return DataLoader( 76 | dataset=self.valid_datasets, 77 | batch_size=self.per_device_train_batch_size, 78 | num_workers=self.num_workers, 79 | pin_memory=True, 80 | ) 81 | 82 | def test_dataloader(self): 83 | pass 84 | 
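The `CustomDataModule` above builds a random regression dataset in `prepare_data` and loads it in `setup`, so it can also be driven by hand for a quick sanity check outside `pl.Trainer`. The snippet below is an illustrative sketch (not part of the repo): it assumes the repository root is on `PYTHONPATH` and fakes the argparse namespace with only the fields the constructor actually reads.

```python
# Illustrative sketch: exercising CustomDataModule without a Trainer.
from argparse import Namespace

from models.dense_model.datamodule import CustomDataModule

args = Namespace(
    data_dir="datasets",             # mirrors the defaults in arguments/training_args.py
    ratio=0.2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_workers=0,
)
dm = CustomDataModule(args)
dm.prepare_data()                    # writes ./train.pt and ./valid.pt once
dm.setup(stage="fit")                # loads the saved CustomDataset objects
features, labels, feature_lengths, label_lengths = next(iter(dm.train_dataloader()))
print(features.shape, labels.shape)  # torch.Size([64, 512]) torch.Size([64, 1])
```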
-------------------------------------------------------------------------------- /models/dense_model/drop_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def drop_scheduler(drop_rate, epochs, niter_per_ep, cutoff_epoch=0, mode="standard", schedule="constant"): 5 | assert mode in ["standard", "early", "late"] 6 | if mode == "standard": 7 | return np.full(epochs * niter_per_ep, drop_rate) 8 | 9 | early_iters = cutoff_epoch * niter_per_ep 10 | late_iters = (epochs - cutoff_epoch) * niter_per_ep 11 | 12 | if mode == "early": 13 | assert schedule in ["constant", "linear"] 14 | if schedule == "constant": 15 | early_schedule = np.full(early_iters, drop_rate) 16 | elif schedule == "linear": 17 | early_schedule = np.linspace(drop_rate, 0, early_iters) 18 | final_schedule = np.concatenate((early_schedule, np.full(late_iters, 0))) 19 | 20 | elif mode == "late": 21 | assert schedule in ["constant"] 22 | early_schedule = np.full(early_iters, 0) 23 | final_schedule = np.concatenate((early_schedule, np.full(late_iters, drop_rate))) 24 | 25 | assert len(final_schedule) == epochs * niter_per_ep 26 | return final_schedule 27 | -------------------------------------------------------------------------------- /models/dense_model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from torchmetrics import MeanSquaredError 4 | from torch import nn 5 | from utils.config_loader import load_config 6 | 7 | 8 | class CustomNet(pl.LightningModule): 9 | def __init__(self, args): 10 | super().__init__() 11 | self.args = args 12 | self.drop_scheduler = None 13 | self.drop_rate = args.dropout_p 14 | config_cls = load_config(args.config_path) 15 | self.loss_func = MeanSquaredError(compute_on_cpu=args.valid_on_cpu) 16 | # TODO: Write down your network 17 | self.dense_batch_fc_tanh = nn.Sequential( 18 | nn.Linear(config_cls.model.input_dense_dim, config_cls.model.output_dense_dim), 19 | nn.BatchNorm1d(config_cls.model.output_dense_dim), 20 | nn.Tanh(), 21 | nn.Dropout(self.drop_rate), 22 | nn.Linear(config_cls.model.output_dense_dim, (config_cls.model.output_dense_dim // 2)), 23 | nn.BatchNorm1d((config_cls.model.output_dense_dim // 2)), 24 | nn.Tanh(), 25 | nn.Dropout(self.drop_rate), 26 | ) 27 | self.fc = nn.Linear(config_cls.model.output_dense_dim // 2, 1) 28 | 29 | def update_dropout(self, drop_rate): 30 | self.drop_rate = drop_rate 31 | for module in self.modules(): 32 | if isinstance(module, nn.Dropout): 33 | module.p = drop_rate 34 | 35 | def forward(self, features): 36 | outputs = self.dense_batch_fc_tanh(features) 37 | logits = self.fc(outputs) 38 | return logits 39 | 40 | def on_train_start(self): 41 | from models.dense_model.drop_scheduler import drop_scheduler 42 | 43 | self.drop_scheduler = {} 44 | if self.args.dropout_p > 0.0: 45 | self.drop_scheduler["do"] = drop_scheduler( 46 | self.args.dropout_p, 47 | self.args.max_epochs, 48 | self.trainer.num_training_batches, 49 | self.args.cutoff_epoch, 50 | self.args.drop_mode, 51 | self.args.drop_schedule, 52 | ) 53 | print( 54 | "on_train_start :: Min DO = %.7f, Max DO = %.7f" 55 | % (min(self.drop_scheduler["do"]), max(self.drop_scheduler["do"])) 56 | ) 57 | 58 | def training_step(self, batch, batch_idx): 59 | features, labels, feature_lengths, label_lengths = batch 60 | if "do" in self.drop_scheduler: 61 | dropout_p = self.drop_scheduler["do"][self.trainer.global_step] 62 | 
self.update_dropout(dropout_p) 63 | self.log("dropout_p", dropout_p, sync_dist=(self.device != "cpu")) 64 | logits = self(features) 65 | loss = self.loss_func(logits, labels) 66 | self.log("train_loss", loss, sync_dist=(self.device != "cpu")) 67 | return {"loss": loss} 68 | 69 | def validation_step(self, batch, batch_idx): 70 | # lightning do sanity eval step first before going training_step. for check your mistake. 71 | # I always make mistake on validation logic, so this is good 72 | # If don't use check this url. https://github.com/Lightning-AI/lightning/issues/2295 73 | features, labels, feature_lengths, label_lengths = batch 74 | if self.args.valid_on_cpu: 75 | features = features.cpu() 76 | labels = labels.cpu() 77 | feature_lengths = feature_lengths.cpu() 78 | label_lengths = label_lengths.cpu() 79 | self.cpu() 80 | logits = self(features) 81 | loss = self.loss_func(logits, labels) 82 | 83 | return {"loss": loss} 84 | 85 | def validation_epoch_end(self, validation_step_outputs): 86 | loss_mean = torch.tensor([x["loss"] for x in validation_step_outputs], device=self.device).mean() 87 | 88 | # sync_dist use follow this url 89 | # if using torchmetrics -> https://torchmetrics.readthedocs.io/en/stable/ 90 | # if not using torchmetrics -> https://github.com/Lightning-AI/lightning/discussions/6501 91 | if self.args.valid_on_cpu: 92 | # if ddp, each machine output must gather. and lightning can gather only on-gpu items 93 | self.log("val_loss", loss_mean.cuda(), sync_dist=True) 94 | # model have to training_step on cuda 95 | self.cuda() 96 | else: 97 | self.log("val_loss", loss_mean, sync_dist=(self.device != "cpu")) 98 | # self.log_dict(metrics, sync_dist=(self.device != "cpu")) 99 | 100 | def predict_step(self, batch, batch_idx): 101 | features, labels, feature_lengths, label_lengths = batch 102 | logits = self(features) 103 | return logits 104 | 105 | def configure_optimizers(self): 106 | optimizer = torch.optim.AdamW( 107 | [{"params": [p for p in self.parameters()], "name": "OneCycleLR"}], 108 | lr=self.args.learning_rate, 109 | weight_decay=self.args.weight_decay, 110 | ) 111 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 112 | optimizer, 113 | max_lr=self.args.max_lr, 114 | total_steps=self.trainer.estimated_stepping_batches, 115 | pct_start=self.args.warmup_ratio, 116 | epochs=self.trainer.max_epochs, 117 | final_div_factor=self.args.final_div_factor, 118 | ) 119 | lr_scheduler = {"interval": "step", "scheduler": scheduler, "name": "AdamW"} 120 | return {"optimizer": optimizer, "lr_scheduler": lr_scheduler} 121 | -------------------------------------------------------------------------------- /models/rnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/rnn_model/__init__.py -------------------------------------------------------------------------------- /models/rnn_model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchmetrics import MeanSquaredError 3 | from torch.utils.data import DataLoader, TensorDataset 4 | from pytorch_lightning import LightningModule 5 | 6 | 7 | class LSTMModel(LightningModule): 8 | """LSTM sequence-to-sequence model for testing TBPTT with automatic optimization.""" 9 | 10 | def __init__(self, args): 11 | super().__init__() 12 | self.args = args 13 | self.input_size = 1 14 | self.hidden_size = 8 15 | self.lstm = 
torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True) 16 | self.lstm2 = torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True) 17 | self.linear = torch.nn.Linear(self.hidden_size * 2, 1) 18 | self.loss_func = MeanSquaredError(compute_on_cpu=self.args.valid_on_cpu) 19 | self.truncated_bptt_steps = self.args.truncated_bptt_steps 20 | self.automatic_optimization = True 21 | 22 | def configure_optimizers(self): 23 | optimizer = torch.optim.AdamW( 24 | [{"params": [p for p in self.parameters()], "name": "OneCycleLR"}], 25 | lr=self.args.learning_rate, 26 | weight_decay=self.args.weight_decay, 27 | ) 28 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 29 | optimizer, 30 | max_lr=self.args.max_lr, 31 | total_steps=self.trainer.estimated_stepping_batches, 32 | pct_start=self.args.warmup_ratio, 33 | epochs=self.trainer.max_epochs, 34 | final_div_factor=self.args.final_div_factor, 35 | ) 36 | lr_scheduler = {"interval": "step", "scheduler": scheduler, "name": "AdamW"} 37 | return {"optimizer": optimizer, "lr_scheduler": lr_scheduler} 38 | 39 | def forward(self, x, hiddens=None): 40 | if hiddens is not None: 41 | hiddens1, hiddens2 = hiddens 42 | else: 43 | hiddens1 = None 44 | hiddens2 = None 45 | self.lstm.flatten_parameters() 46 | lstm_last, hiddens1 = self.lstm(x, hiddens1) 47 | self.lstm2.flatten_parameters() 48 | lstm2_last, hiddens2 = self.lstm2(x, hiddens2) 49 | concat_lstm = torch.concat([lstm_last, lstm2_last], dim=-1) 50 | logits = self.linear(concat_lstm) 51 | return logits, hiddens1, hiddens2 52 | 53 | def training_step(self, batch, batch_idx, hiddens): 54 | # batch_idx: Original step indices, Not TBPTT index (1 step == 1 batch) 55 | # hiddens: TBPTT use backwards each sequence using this data 56 | 57 | # On tbptt, backpropagation is used CHUNK by long sequence. when if using 200 sequence and 100 step chunk, 58 | # training_step is needed 2 step for 1 batch (1 step: 0~99, 2 step: 100~199) 59 | # very cleverly, we just using hiddens parameter, lightning's tbptt not connected new batch's hiddens to past one 60 | x, y = batch 61 | logits, hiddens1, hiddens2 = self(x, hiddens) 62 | loss = self.loss_func(logits, y) 63 | self.log("train_loss", loss, sync_dist=(self.device != "cpu")) 64 | # look this discussion for tbptt experiment (https://github.com/Lightning-AI/lightning/discussions/15643) 65 | return {"loss": loss, "hiddens": (hiddens1, hiddens2)} 66 | 67 | def validation_step(self, batch, batch_idx): 68 | x, y = batch 69 | if self.args.valid_on_cpu: 70 | x = x.cpu() 71 | y = y.cpu() 72 | self.cpu() 73 | self.lstm.flatten_parameters() 74 | lstm_last, _ = self.lstm(x) 75 | self.lstm2.flatten_parameters() 76 | lstm2_last, _ = self.lstm2(x) 77 | concat_lstm = torch.concat([lstm_last, lstm2_last], dim=-1) 78 | logits = self.linear(concat_lstm) 79 | return {"pred": logits, "labels": y} 80 | 81 | def validation_epoch_end(self, validation_step_outputs): 82 | for out in validation_step_outputs: 83 | loss = self.loss_func(out["pred"], out["labels"]) 84 | if self.args.valid_on_cpu: 85 | # if ddp, each machine output must gather. 
and lightning can gather only on-gpu items 86 | self.log("val_loss", loss.cuda(), sync_dist=True) 87 | # model have to training_step on cuda 88 | self.cuda() 89 | else: 90 | self.log("val_loss", loss, sync_dist=(self.device != "cpu")) 91 | 92 | def predict_step(self, batch, batch_idx): 93 | x, y = batch 94 | logits = self(x) 95 | return logits 96 | 97 | def train_dataloader(self): 98 | dataset = TensorDataset(torch.rand(2000, 200, self.input_size), torch.rand(2000, 200, self.input_size)) 99 | return DataLoader( 100 | dataset=dataset, num_workers=self.args.num_workers, batch_size=self.args.per_device_train_batch_size 101 | ) 102 | 103 | def val_dataloader(self): 104 | dataset = TensorDataset(torch.rand(2000, 200, self.input_size), torch.rand(2000, 200, self.input_size)) 105 | return DataLoader( 106 | dataset=dataset, num_workers=self.args.num_workers, batch_size=self.args.per_device_eval_batch_size 107 | ) 108 | -------------------------------------------------------------------------------- /pip_install_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_ARCH_LIST=`CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"` 3 | arch="`echo $CUDA_ARCH_LIST | cut -c2`.`echo $CUDA_ARCH_LIST | cut -c5`" 4 | 5 | git clone https://github.com/microsoft/DeepSpeed/ 6 | cd DeepSpeed 7 | rm -rf build 8 | TORCH_CUDA_ARCH_LIST=$arch DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 pip install . \ 9 | --global-option="build_ext" --global-option="-j8" --no-cache -v \ 10 | --disable-pip-version-check 2>&1 | tee build.log -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | pytorch-lightning==1.9.4 3 | pandas 4 | numpy 5 | setproctitle 6 | wandb 7 | flake8 8 | black 9 | tqdm 10 | scikit-learn 11 | simple_parsing==0.1.0 -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Warning Use DeepSpeed 2 | 3 | DeepSpeed **ONLY** can be ran on **Multi GPU** 4 | 5 | 6 | 7 | # Warning Use Data parallel or Distributed Data Parallel on Torchmetrics 8 | 9 | https://torchmetrics.readthedocs.io/en/stable/pages/overview.html#metrics-in-dataparallel-dp-mode 10 | 11 | https://torchmetrics.readthedocs.io/en/stable/pages/overview.html#metrics-in-distributed-data-parallel-ddp-mode 12 | 13 | 14 | 15 | This `run_train_gpu.sh` is **ONLY** can be ran on **SINGLE GPU!** 16 | 17 | **If you want to use Multi GPU DP, you have to write additional gather source code on evaluation_step_end** 18 | 19 | 20 | 21 | `run_train_gpu_ddp.sh` is ran well, **but I'm not sure on metric gather is correctly** 22 | 23 | -------------------------------------------------------------------------------- /scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS=3 3 | 4 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 5 | python3 inference.py \ 6 | --model_path="model_outputs/lightning-template-epoch=02-val_loss=0.0842-v2.ckpt" \ 7 | --config_path="config/dense_model.json" \ 8 | --seed=42 \ 9 | --accelerator=gpu \ 10 | --devices=1 \ 11 | --auto_select_gpus=true \ 12 | --model_select=rnn \ 13 | --truncated_bptt_steps=2 -------------------------------------------------------------------------------- 
/scripts/run_inference_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | export OMP_NUM_THREADS=8 4 | export CUDA_LAUNCH_BLOCKING=1 5 | export WANDB_DISABLED=false 6 | export TOKENIZERS_PARALLELISM=false 7 | 8 | 9 | model_path=model_outputs/lightning-template-epoch=00-val_loss=0.0846.ckpt 10 | python3 $model_path/zero_to_fp32.py $model_path $model_path/checkpoint/model.bin 11 | 12 | if [ $? -eq "0" ]; then 13 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 14 | python3 -m torch.distributed.launch --nproc_per_node=4 inference_deepspeed.py \ 15 | --seed=42 \ 16 | --model_path=$model_path/checkpoint/model.bin \ 17 | --config_path=config/dense_model.json \ 18 | --accelerator=gpu \ 19 | --strategy=ddp \ 20 | --devices=4 \ 21 | --num_nodes=1 \ 22 | --model_select=rnn \ 23 | --truncated_bptt_steps=2 24 | fi 25 | 26 | if [ $? -eq "0" ]; then 27 | python3 ddp_inference_gather.py 28 | fi -------------------------------------------------------------------------------- /scripts/run_train_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python3 train.py \ 3 | --output_dir="model_outputs/" \ 4 | --data_dir="" \ 5 | --config_path="config/dense_model.json" \ 6 | --seed=42 \ 7 | --accelerator=cpu \ 8 | --num_workers=12 \ 9 | --per_device_train_batch_size=64 \ 10 | --per_device_eval_batch_size=64 \ 11 | --val_check_interval=0.25 \ 12 | --accumulate_grad_batches=1 \ 13 | --max_epochs=3 \ 14 | --log_every_n_steps=100 \ 15 | --auto_scale_batch_size=false \ 16 | --learning_rate=0.00005 \ 17 | --max_lr=0.0001 \ 18 | --weight_decay=0.0001 \ 19 | --warmup_ratio=0.2 \ 20 | --ratio=0.2 \ 21 | --div_factor=10 \ 22 | --final_div_factor=10 \ 23 | --model_select=linear 24 | -------------------------------------------------------------------------------- /scripts/run_train_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | export OMP_NUM_THREADS=8 4 | export CUDA_LAUNCH_BLOCKING=1 5 | export WANDB_DISABLED=false 6 | export TOKENIZERS_PARALLELISM=false 7 | export LOCAL_RANK=0 8 | 9 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 10 | python3 -m torch.distributed.launch --nnodes=1 --nproc_per_node=4 ./train.py \ 11 | --output_dir="model_outputs/" \ 12 | --data_dir="" \ 13 | --config_path="config/dense_model.json" \ 14 | --seed=42 \ 15 | --num_workers=12 \ 16 | --per_device_train_batch_size=64 \ 17 | --per_device_eval_batch_size=64 \ 18 | --val_check_interval=0.25 \ 19 | --accumulate_grad_batches=1 \ 20 | --max_epochs=3 \ 21 | --log_every_n_steps=1 \ 22 | --accelerator=gpu \ 23 | --strategy=deepspeed_stage_2 \ 24 | --num_nodes=1 \ 25 | --replace_sampler_ddp=false \ 26 | --devices=4 \ 27 | --auto_select_gpus=true \ 28 | --auto_scale_batch_size=false \ 29 | --learning_rate=0.00005 \ 30 | --max_lr=0.0001 \ 31 | --weight_decay=0.0001 \ 32 | --warmup_ratio=0.2 \ 33 | --ratio=0.2 \ 34 | --div_factor=10 \ 35 | --final_div_factor=10 \ 36 | --valid_on_cpu=false \ 37 | --model_select=rnn \ 38 | --truncated_bptt_steps=2 \ 39 | --deepspeed_config=ds_config/zero2.json 40 | -------------------------------------------------------------------------------- /scripts/run_train_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS=0 3 | 4 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 5 | python3 ./train.py \ 6 | --output_dir="../models/" \ 7 | --data_dir="" \ 8 | 
--config_path="config/dense_model.json" \ 9 | --seed=42 \ 10 | --num_workers=12 \ 11 | --per_device_train_batch_size=64 \ 12 | --per_device_eval_batch_size=64 \ 13 | --val_check_interval=0.25 \ 14 | --accumulate_grad_batches=1 \ 15 | --max_epochs=3 \ 16 | --log_every_n_steps=1 \ 17 | --accelerator=gpu \ 18 | --replace_sampler_ddp=false \ 19 | --devices=1 \ 20 | --auto_scale_batch_size=false \ 21 | --learning_rate=0.00005 \ 22 | --max_lr=0.0001 \ 23 | --weight_decay=0.0001 \ 24 | --warmup_ratio=0.2 \ 25 | --ratio=0.2 \ 26 | --div_factor=10 \ 27 | --final_div_factor=10 \ 28 | --valid_on_cpu=false \ 29 | --model_select=linear \ 30 | --dropout_p=0.1 \ 31 | --cutoff_epoch=1 \ 32 | --drop_mode=standard \ 33 | --drop_schedule=constant 34 | -------------------------------------------------------------------------------- /scripts/run_train_gpu_ddp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | 4 | OMP_NUM_THREADS=8 \ 5 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 6 | python3 -m torch.distributed.launch --nnodes=1 --nproc_per_node=4 ./train.py \ 7 | --output_dir="model_outputs/" \ 8 | --data_dir="" \ 9 | --seed=42 \ 10 | --num_workers=12 \ 11 | --per_device_train_batch_size=64 \ 12 | --per_device_eval_batch_size=64 \ 13 | --val_check_interval=0.25 \ 14 | --accumulate_grad_batches=1 \ 15 | --max_epochs=3 \ 16 | --log_every_n_steps=1 \ 17 | --accelerator=gpu \ 18 | --strategy=ddp \ 19 | --num_nodes=1 \ 20 | --replace_sampler_ddp=false \ 21 | --devices=4 \ 22 | --auto_scale_batch_size=false \ 23 | --learning_rate=0.00005 \ 24 | --max_lr=0.0001 \ 25 | --weight_decay=0.0001 \ 26 | --warmup_ratio=0.2 \ 27 | --ratio=0.2 \ 28 | --div_factor=10 \ 29 | --final_div_factor=10 \ 30 | --valid_on_cpu=false \ 31 | --model_select=rnn \ 32 | --truncated_bptt_steps=1 33 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pytorch_lightning as pl 4 | from pytorch_lightning.strategies import DDPStrategy 5 | from datetime import timedelta 6 | from pytorch_lightning.callbacks import ModelCheckpoint 7 | from pytorch_lightning.loggers import WandbLogger 8 | from pytorch_lightning.callbacks import LearningRateMonitor 9 | from models.dense_model.model import CustomNet 10 | from models.dense_model.datamodule import CustomDataModule 11 | from models.rnn_model.model import LSTMModel 12 | from simple_parsing import ArgumentParser 13 | from arguments.training_args import TrainingArguments 14 | from utils.comfy import dataclass_to_namespace 15 | 16 | 17 | def main(hparams): 18 | wandb_logger = WandbLogger(project="lightning-template", name="default", save_dir="./") 19 | pl.seed_everything(hparams.seed) 20 | os.makedirs(hparams.output_dir, exist_ok=True) 21 | hparams.logger = wandb_logger 22 | 23 | checkpoint_callback = ModelCheckpoint( 24 | dirpath=hparams.output_dir, 25 | save_top_k=3, 26 | mode="min", 27 | monitor="val_loss", 28 | filename="lightning-template-{epoch:02d}-{val_loss:.4f}", 29 | ) 30 | lr_monitor = LearningRateMonitor(logging_interval="step") 31 | hparams.callbacks = [checkpoint_callback, lr_monitor] 32 | 33 | if hparams.accelerator == "cpu" and hparams.valid_on_cpu is True: 34 | print("If you run on cpu, valid must go on cpu, It set automatically") 35 | hparams.valid_on_cpu = False 36 | elif hparams.strategy == "ddp": 37 | hparams.strategy = DDPStrategy(timeout=timedelta(days=30)) 38 | 
elif hparams.strategy == "deepspeed_stage_2": 39 | if hparams.deepspeed_config is not None: 40 | from pytorch_lightning.strategies import DeepSpeedStrategy 41 | 42 | hparams.strategy = DeepSpeedStrategy(config=hparams.deepspeed_config) 43 | elif hparams.accelerator != "cpu" and (hparams.strategy is not None and "deepspeed" in hparams.strategy): 44 | raise NotImplementedError("If you want to another deepspeed option and config, PLZ IMPLEMENT FIRST!!") 45 | trainer = pl.Trainer.from_argparse_args(hparams) 46 | 47 | if hparams.model_select == "linear": 48 | datamodule = CustomDataModule(hparams) 49 | model = CustomNet(hparams) 50 | wandb_logger.watch(model, log="all") 51 | trainer.fit(model, datamodule=datamodule) 52 | """ TODO If use config like dict follow this line 53 | but, model param is duplicated area between training param and model param 54 | I want to get training param on run script argument, so I can not use it 55 | """ 56 | # config_cls = load_config(hparams.config_dir) 57 | # config = config_to_dict(config_cls) 58 | # with open(os.path.join(hparams.output_dir, "config.json"), "w") as f: 59 | # json.dump(config, f, ensure_ascii=False, indent=4) 60 | else: 61 | model = LSTMModel(hparams) 62 | wandb_logger.watch(model, log="all") 63 | trainer.fit(model) 64 | # TODO If finetuning follow this line 65 | # PreTrainedLightningModule.load_state_dict( 66 | # torch.load( 67 | # "", 68 | # map_location="cuda", 69 | # ), 70 | # strict=False, 71 | # ) 72 | checkpoint_callback.best_model_path 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = ArgumentParser() 77 | parser = pl.Trainer.add_argparse_args(parser) 78 | parser.add_arguments(TrainingArguments, dest="training_args") 79 | args = parser.parse_args() 80 | args = dataclass_to_namespace(args, "training_args") 81 | main(args) 82 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # config_loader.py 2 | 3 | It is looked very useful, just like HuggingFace model config loader. But I feel something uncomportable. 4 | 5 | Because, When if you use `pl.LightningModule` on your model code, you have to write `optimizing_step` on your same class. 
6 | 7 | I'm enjoying modify training argument in scripts(not model config), but if you used `pl.LightningModule` and `config_loader.py` , maybe you have to load 2 argument in class `__init__` about `training script argument` and `model_config` class -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/utils/__init__.py -------------------------------------------------------------------------------- /utils/comfy.py: -------------------------------------------------------------------------------- 1 | def dataclass_to_namespace(args, args_name): 2 | # Dataclass arg to python namespace 3 | if args.__contains__(args_name): 4 | for key, value in args.__getattribute__(args_name).__dict__.items(): 5 | args.__setattr__(key, value) 6 | args.__delattr__(args_name) 7 | return args 8 | -------------------------------------------------------------------------------- /utils/config_loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ConfigLoader: 5 | def __init__(self, **kwargs): 6 | for k, v in kwargs.items(): 7 | if type(v) == dict: 8 | v = ConfigLoader(**v) # use recursively 9 | self[k] = v 10 | 11 | def keys(self): 12 | return self.__dict__.keys() 13 | 14 | def items(self): 15 | return self.__dict__.items() 16 | 17 | def values(self): 18 | return self.__dict__.values() 19 | 20 | def __len__(self): 21 | return len(self.__dict__) 22 | 23 | def __getitem__(self, key): 24 | return getattr(self, key) 25 | 26 | def __setitem__(self, key, value): 27 | return setattr(self, key, value) 28 | 29 | def __contains__(self, key): 30 | return key in self.__dict__ 31 | 32 | def __repr__(self): 33 | return self.__dict__.__repr__() 34 | 35 | 36 | def config_to_dict(config: ConfigLoader): 37 | result = dict() 38 | for k, v in config.items(): 39 | if type(v) == ConfigLoader: 40 | result[k] = config_to_dict(v) 41 | else: 42 | result[k] = v 43 | return result 44 | 45 | 46 | def load_config(config_filename: str): 47 | with open(config_filename, "r") as f: 48 | data = f.read() 49 | config = json.loads(data) 50 | hparams = ConfigLoader(**config) 51 | return hparams 52 | --------------------------------------------------------------------------------
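For reference, the sketch below (not part of the repo) shows how `load_config` and `config_to_dict` from `utils/config_loader.py` behave with the bundled `config/dense_model.json`; it assumes the repository root is the working directory.

```python
# Illustrative sketch: loading the bundled JSON config with ConfigLoader.
from utils.config_loader import config_to_dict, load_config

cfg = load_config("config/dense_model.json")
print(cfg.model.input_dense_dim)         # 512 -- nested dicts become attribute access
print(cfg["model"]["output_dense_dim"])  # 256 -- item access works as well
print(config_to_dict(cfg))               # back to a plain dict, e.g. for json.dump
```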