├── .gitignore ├── LICENSE ├── README.md ├── arguments ├── inference_args.py └── training_args.py ├── config └── dense_model.json ├── ddp_inference_gather.py ├── ds_config ├── README.md └── zero2.json ├── inference.py ├── inference_deepspeed.py ├── models ├── README.md ├── __init__.py ├── dense_model │ ├── __init__.py │ ├── dataloader.py │ ├── datamodule.py │ ├── drop_scheduler.py │ └── model.py └── rnn_model │ ├── __init__.py │ └── model.py ├── pip_install_deepspeed.sh ├── requirements.txt ├── scripts ├── README.md ├── run_inference.sh ├── run_inference_deepspeed.sh ├── run_train_cpu.sh ├── run_train_deepspeed.sh ├── run_train_gpu.sh └── run_train_gpu_ddp.sh ├── train.py └── utils ├── README.md ├── __init__.py ├── comfy.py └── config_loader.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # wandb log 132 | wandb/ 133 | 134 | # datasets 135 | *.pt 136 | deepspeed_all_result.txt 137 | model_outputs/ 138 | lightning_logs/ 139 | distributed_result/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 YooSungHyun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NEW Release 20230312!!! 2 | 3 | update dropout **Dropout Reduces Underfitting** 4 | 5 | [paper](https://arxiv.org/abs/2303.01500) 6 | 7 | [git](https://github.com/facebookresearch/dropout) 8 | 9 | ## Usage 10 | 11 | CHECK PLZ `run_train_gpu.sh` and `./models/dense_model/` 12 | 13 | ### 1. make drop_scheduler.py 14 | 15 | ### 2. 
make dropout scheduling function your each model classes 16 | 17 | **Example** 18 | 19 | ```python 20 | # 1) update dropout function 21 | def update_dropout(self, drop_rate): 22 | self.drop_rate = drop_rate 23 | for module in self.modules(): 24 | if isinstance(module, nn.Dropout): 25 | module.p = drop_rate 26 | 27 | # 2) we have to count each step (for calculate schedulering global steps) 28 | def on_train_start(self): 29 | from models.dense_model.drop_scheduler import drop_scheduler 30 | 31 | self.drop_scheduler = {} 32 | if self.args.dropout_p > 0.0: 33 | self.drop_scheduler["do"] = drop_scheduler( 34 | self.args.dropout_p, 35 | self.args.max_epochs, 36 | self.trainer.num_training_batches, 37 | self.args.cutoff_epoch, 38 | self.args.drop_mode, 39 | self.args.drop_schedule, 40 | ) 41 | print( 42 | "on_train_start :: Min DO = %.7f, Max DO = %.7f" 43 | % (min(self.drop_scheduler["do"]), max(self.drop_scheduler["do"])) 44 | ) 45 | 46 | # 3) Finally, you can scheduling dropout prob in your training_step 47 | if "do" in self.drop_scheduler: 48 | dropout_p = self.drop_scheduler["do"][self.trainer.global_step] 49 | self.update_dropout(dropout_p) 50 | self.log("dropout_p", dropout_p, sync_dist=(self.device != "cpu")) 51 | ``` 52 | 53 | ### 3. we have to input dropout schedule setting in training_arg 54 | 55 | ### 4. replace run script 56 | 57 | ``` bash 58 | --dropout_p=0.1 \ 59 | --cutoff_epoch=1 \ 60 | --drop_mode=standard \ 61 | --drop_schedule=constant 62 | ``` 63 | 64 | 65 | 66 | ### If you want to use, normal style dropout, input dropout_p and `drop_mode=standard` (default) and `drop_schedule=constant` (default) 67 | you can check your dropout scheduling process in wandb 68 | 69 | ![image](https://user-images.githubusercontent.com/34292279/224522943-37e5779a-3f40-44c0-8aec-5a13f84ff47c.png) 70 | 71 | 72 | # pytorch-lightning-template 73 | 74 | very simple but, write down is boring
75 | boring boilerplate code, rolled up for you ⚡
76 | **If you need a function or anything else, please open an issue (in English or Korean). I will reply and implement it ASAP!!**
77 | 78 | - DataModule more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html) 79 | - Model more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html) 80 | - Inference more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/deploy/production_intermediate.html) 81 | - WanDB with lightning more detail 82 | - [Weight & Bias Dev Guide 1](https://wandb.ai/wandb_fc/korean/reports/Weights-Biases-Pytorch-Lightning---VmlldzozNzAxOTg) 83 | - [Weight & Bias Dev Guide 2](https://docs.wandb.ai/guides/integrations/lightning) 84 | 85 | # WanDB 86 | 87 | https://docs.wandb.ai/v/ko/quickstart 88 | 89 | # Training Detail 90 | 91 | - Using DDP, Not DP or CPU
92 | If you want to use DP or CPU instead, change the relevant arguments or the Python script
93 | See more detail: [PyTorch-Lightning Dev Guide](https://pytorch-lightning.readthedocs.io/en/stable/accelerators/gpu_intermediate.html) 94 | - Optimizer: AdamW 95 | - LearningRate Scheduler: OneCycleLR 96 | - see more detail: [PyTorch Dev Guide](https://pytorch.org/docs/stable/optim.html) 97 | - Monitoring Tool: WanDB 98 | 99 | # Pytorch-Lightning Life Cycle 100 | 101 | ## Training 102 | 103 | 1. train.py(main) -> argparse 104 | - using [simple_parsing library](https://github.com/lebrice/SimpleParsing) looks like HFArgumentParser 105 | - Trainer Argument placed with `pl.Trainer.add_argparse_args` (automatic define argparse) 106 | 2. def] WandbLogger, set seed(os, random, np, torch, torch.cuda) 107 | 3. def] CustomDataModule (`LightningDataModule`) 108 | - You Not have to using `LightningDataModule`. but, if you implement that in 'LightningModule', source code is looked mess 109 | - DataModule important `prepare_data` and `setup` 110 | - `prepare_data` is only run on cpu and not multi processing (Warning, if you using distributed learning, this place's variable is not share) 111 | - I recommand, It just using data download or datasets save 112 | - `setup` is run on gpu or cpu and distributed. using map or dataload or something! 113 | - `setup` can have stage(fit (train), test, predict) 114 | - DataModule can have each stage's dataloader 115 | - using default or someting 116 | - Dataset can define this section or making each python script and just import & using! 117 | 4. def] CustomNet (`LightningModule`) 118 | - each step and step_end or epoch, epoch_end 119 | - i think using just training_step, validation_step, validation_epoch_end is simple and best 120 | - training_step -> forward -> configure_optimizers 121 | - when count in each validation step (each batch step validation) -> validation_epoch_end (all batch result gather) -> log (on wandb) 122 | 5. wandb logger additional setting 123 | 6. checkpoint setting 124 | - monitor name is same on your each step's log name 125 | 7. learning_rate monitor setting 126 | 8. ddp strategy modify 127 | - if your dataset is so big to ddp, timeout parameter change like that 128 | - huggingface is so hard to make it. but lightning is feel free 129 | 9. make trainer to your arg 130 | 10. training run and model save! 131 | 132 | ### Training Script Usage 133 | 134 | 1. cd your project root(./pytorch-lightning-template) 135 | 136 | ``` 137 | # Don't Script RUN in your scripts FOLDER!!!!! CHK PLZ!!!!!!! 138 | bash scripts/run_train_~~~.sh 139 | ``` 140 | 141 | ## Inference 142 | 143 | 1. inference.py(main) -> argparse 144 | 2. set seed 145 | 3. model load (second param is your model init param) 146 | 4. simply torch inference & END! 147 | 148 | ### Inference Script Usage 149 | 150 | 1. cd your project root(./pytorch-lightning-template) 151 | 152 | ``` 153 | # Don't Script RUN in your scripts FOLDER!!!!! CHK PLZ!!!!!!! 154 | bash scripts/run_inference~~~.sh 155 | ``` 156 | 157 | # (Optional) Install DeepSpeed 158 | 159 | 1. 
run pip_install_deepspeed.sh 160 | 161 | ``` 162 | bash pip_install_deepspeed.sh 163 | ``` 164 | -------------------------------------------------------------------------------- /arguments/inference_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class InferenceArguments: 6 | """Help string for this group of command-line arguments""" 7 | 8 | seed: int = None # all seed 9 | local_rank: int = None # ddp local rank 10 | model_path: str = "model_outputs" # target pytorch lightning model dir 11 | config_path: str = "model_outputs" # target pytorch lightning model dir 12 | per_device_test_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for evaluation. 13 | model_select: str = "linear" # linear or rnn 14 | truncated_bptt_steps: int = 1 # TBPTT step size 15 | valid_on_cpu: bool = False # If you want to run validation_step on cpu -> true 16 | -------------------------------------------------------------------------------- /arguments/training_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import simple_parsing as sp 3 | 4 | 5 | @dataclass 6 | class TrainingArguments: 7 | """Help string for this group of command-line arguments""" 8 | 9 | seed: int = None # all seed 10 | local_rank: int = None # ddp local rank 11 | data_dir: str = "datasets" # target pytorch lightning data dirs 12 | ratio: float = 0.2 # train/valid split ratio 13 | output_dir: str = "model_outputs" # model output path 14 | config_path: str = "config/dense_model.json" 15 | num_workers: int = None # how many proc map? 16 | learning_rate: float = 0.001 # learning rate 17 | warmup_ratio: float = 0.2 # learning rate scheduler warmup ratio per EPOCH 18 | max_lr: float = 0.01 # lr_scheduler max learning rate 19 | div_factor: int = 25 # initial_lr = max_lr/div_factor 20 | final_div_factor: int = 1e4 # (max_lr/div_factor)*final_div_factor is final lr 21 | weight_decay: float = 0.0001 # weigth decay 22 | per_device_train_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for training. 23 | per_device_eval_batch_size: int = 1 # The batch size per GPU/TPU core/CPU for evaluation. 24 | valid_on_cpu: bool = False # If you want to run validation_step on cpu -> true 25 | model_select: str = "linear" # linear or rnn 26 | truncated_bptt_steps: int = 1 # TBPTT step size 27 | deepspeed_config: str = "ds_config/zero2.json" 28 | dropout_p: float = 0.0 # Drop path rate (default: 0.0) 29 | cutoff_epoch: int = 0 # if drop_mode is early / late, this is the epoch where dropout ends / starts 30 | drop_mode: str = sp.field(default="standard", choices=["standard", "early", "late"]) # drop mode 31 | drop_schedule: str = sp.field( 32 | default="constant", choices=["constant", "linear"] 33 | ) # drop schedule for early dropout / s.d. 
only 34 | -------------------------------------------------------------------------------- /config/dense_model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "input_dense_dim": 512, 4 | "output_dense_dim": 256 5 | } 6 | } -------------------------------------------------------------------------------- /ddp_inference_gather.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | gpu_all_result = list() 5 | gpu0_result = torch.load("distributed_result/predictions_0.pt") 6 | for item in gpu0_result: 7 | gpu_all_result.extend(item) 8 | gpu1_result = torch.load("distributed_result/predictions_1.pt") 9 | for item in gpu1_result: 10 | gpu_all_result.extend(item) 11 | gpu2_result = torch.load("distributed_result/predictions_2.pt") 12 | for item in gpu2_result: 13 | gpu_all_result.extend(item) 14 | gpu3_result = torch.load("distributed_result/predictions_3.pt") 15 | for item in gpu3_result: 16 | gpu_all_result.extend(item) 17 | gpu_all_result = sorted(gpu_all_result, key=lambda x: x[0]) 18 | all_result = list() 19 | for item in gpu_all_result: 20 | # item is (index, torch([int])) 21 | all_result.append(item[1].detach()[0]) 22 | test = torch.stack(all_result) 23 | # test is torch([all result of int]) 24 | np.savetxt("deepspeed_all_result.txt", test.numpy()) 25 | -------------------------------------------------------------------------------- /ds_config/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Config 2 | 3 | Maybe, In pytorch-lightning**⚡**, You can not use `AUTO config value`. (Not Like HuggingFace, so uncomfortable) 4 | 5 | -------------------------------------------------------------------------------- /ds_config/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": false, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 5e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 5e8, 17 | "contiguous_gradients": true, 18 | "cpu_offload": false 19 | }, 20 | "steps_per_print": 2000, 21 | "wall_clock_breakdown": false 22 | } -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from models.dense_model.model import CustomNet 4 | from models.rnn_model.model import LSTMModel 5 | from utils.compy import dataclass_to_namespace 6 | from arguments.inference_args import InferenceArguments 7 | from simple_parsing import ArgumentParser 8 | 9 | 10 | def main(hparams): 11 | pl.seed_everything(hparams.seed) 12 | 13 | if hparams.model_select == "linear": 14 | model = CustomNet.load_from_checkpoint(hparams.model_path, args=hparams) 15 | features = torch.randn(1, 512) 16 | else: 17 | model = LSTMModel.load_from_checkpoint(hparams.model_path, args=hparams) 18 | features = torch.randn(200, 1) 19 | model.eval() 20 | with torch.no_grad(): 21 | logits = model(features) 22 | print(logits) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = ArgumentParser() 27 | parser = pl.Trainer.add_argparse_args(parser) 28 | 
parser.add_arguments(InferenceArguments, dest="inference_args") 29 | args = parser.parse_args() 30 | args = dataclass_to_namespace(args, "inference_args") 31 | main(args) 32 | -------------------------------------------------------------------------------- /inference_deepspeed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pytorch_lightning as pl 4 | from utils.compy import dataclass_to_namespace 5 | from arguments.inference_args import InferenceArguments 6 | from simple_parsing import ArgumentParser 7 | from models.dense_model.model import CustomNet 8 | from models.dense_model.datamodule import CustomDataset 9 | from models.rnn_model.model import LSTMModel 10 | from pytorch_lightning.callbacks import BasePredictionWriter 11 | from torch.utils.data import DataLoader, TensorDataset 12 | 13 | 14 | class CustomWriter(BasePredictionWriter): 15 | def __init__(self, output_dir, write_interval): 16 | super().__init__(write_interval) 17 | self.output_dir = output_dir 18 | 19 | def write_on_epoch_end(self, trainer, pl_module, predictions, batch_indices): 20 | result_list = list() 21 | for batch_index, pred in list(zip(batch_indices[0], predictions[0])): 22 | result_list.append(list(zip(batch_index, pred))) 23 | torch.save(result_list, os.path.join(self.output_dir, f"predictions_{trainer.global_rank}.pt")) 24 | 25 | 26 | def read_json(fname): 27 | from pathlib import Path 28 | import json 29 | 30 | fname = Path(fname) 31 | with fname.open("rt") as handle: 32 | return json.load(handle) 33 | 34 | 35 | def on_load_checkpoint(checkpoint): 36 | state_dict = {k.partition("_forward_module.")[2]: checkpoint[k] for k in checkpoint.keys()} 37 | checkpoint["state_dict"] = state_dict 38 | return checkpoint 39 | 40 | 41 | def main(hparams): 42 | pl.seed_everything(hparams.seed) 43 | device = torch.device("cuda") 44 | os.makedirs("distributed_result", exist_ok=True) 45 | if hparams.model_select == "linear": 46 | model = CustomNet(hparams) 47 | checkpoint = torch.load(hparams.model_path, map_location=device) 48 | checkpoint = on_load_checkpoint(checkpoint) 49 | model.load_state_dict(checkpoint["state_dict"]) 50 | features = torch.randn(10, 512) 51 | temp_label = torch.randn(10, 1) 52 | infer_datasets = CustomDataset(features, temp_label) 53 | infer_loader = DataLoader( 54 | dataset=infer_datasets, batch_size=hparams.per_device_test_batch_size, num_workers=4, pin_memory=True 55 | ) 56 | else: 57 | model = LSTMModel(hparams) 58 | checkpoint = torch.load(hparams.model_path, map_location=device) 59 | checkpoint = on_load_checkpoint(checkpoint) 60 | model.load_state_dict(checkpoint["state_dict"]) 61 | features = torch.randn(200, 1) 62 | temp_label = torch.randn(200, 1) 63 | infer_datasets = TensorDataset(features, temp_label) 64 | infer_loader = DataLoader( 65 | dataset=infer_datasets, batch_size=hparams.per_device_test_batch_size, num_workers=4, pin_memory=True 66 | ) 67 | 68 | pred_writer = CustomWriter(output_dir="distributed_result", write_interval="epoch") 69 | hparams.callbacks = [pred_writer] 70 | trainer = pl.Trainer.from_argparse_args(hparams) 71 | 72 | trainer.predict(model, infer_loader, return_predictions=False) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = ArgumentParser() 77 | parser = pl.Trainer.add_argparse_args(parser) 78 | parser.add_arguments(InferenceArguments, dest="inference_args") 79 | args = parser.parse_args() 80 | args = dataclass_to_namespace(args, "inference_args") 81 | main(args) 82 | 
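Note: `CustomWriter.write_on_epoch_end` above saves one `predictions_{rank}.pt` per process, and `ddp_inference_gather.py` merges exactly four of them by hard-coding ranks 0-3. The snippet below is a minimal, illustrative sketch (not part of the repo) of that same gather step, written to be rank-count-agnostic via `glob`; the file-name pattern and the `(index, tensor)` pair layout come from `write_on_epoch_end`, everything else is an assumption.

```python
# Illustrative, rank-count-agnostic variant of ddp_inference_gather.py.
import glob

import numpy as np
import torch

pairs = []  # flat list of (global sample index, prediction tensor) tuples
for path in sorted(glob.glob("distributed_result/predictions_*.pt")):
    for chunk in torch.load(path):  # each chunk is a list of (index, tensor) tuples
        pairs.extend(chunk)

pairs.sort(key=lambda pair: pair[0])  # restore the original dataset order across ranks
all_preds = torch.stack([pred.detach()[0] for _, pred in pairs])
np.savetxt("deepspeed_all_result.txt", all_preds.numpy())
```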
-------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | # Example models 2 | 3 | We have 2 concept model 4 | 5 | ## 1. dense model 6 | 7 | **I use all of pytorch-lightning⚡'s class as much as i can** 8 | 9 | Custom Dataloader, Custom Dataset, LightningDataModule and also you can use Custom DataSampler (if you have one...) 10 | 11 | If you want to perfect set of lightning template. PLZ watch this model and paste it! 12 | 13 | ## 2. rnn model 14 | 15 | **Just Look Simply as much as i can** 16 | 17 | I just use torch style format. 18 | 19 | If you have to make something very quickly and simply, PLZ watch this model and paste it! 20 | 21 | ALSO, you want to use pytorch-lightning⚡ style **BPTT** in rnn model, watch this too -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/__init__.py -------------------------------------------------------------------------------- /models/dense_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/dense_model/__init__.py -------------------------------------------------------------------------------- /models/dense_model/dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class CustomDataLoader(torch.utils.data.DataLoader): 5 | def __init__(self, *args, **kwargs): 6 | """ 7 | Creates a data loader for AudioDatasets. 
8 | """ 9 | super(CustomDataLoader, self).__init__(*args, **kwargs) 10 | self.collate_fn = self._collate_fn 11 | 12 | def _collate_fn(self, batch): 13 | features = [s["features"] for s in batch] 14 | feature_lengths = [s["features"].size(0) for s in batch] 15 | labels = [s["label"] for s in batch] 16 | label_lengths = [len(s["label"]) for s in batch] 17 | 18 | features = torch.FloatTensor(features) 19 | labels = torch.LongTensor(labels) 20 | feature_lengths = torch.IntTensor(feature_lengths) 21 | label_lengths = torch.IntTensor(label_lengths) 22 | 23 | return features, labels, feature_lengths, label_lengths 24 | -------------------------------------------------------------------------------- /models/dense_model/datamodule.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pytorch_lightning as pl 4 | from argparse import Namespace 5 | from sklearn.model_selection import train_test_split 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | 9 | class CustomDataset(Dataset): 10 | def __init__(self, x, y): 11 | self.x = torch.FloatTensor(x) 12 | self.y = torch.FloatTensor(y) 13 | 14 | def __len__(self): 15 | return len(self.x) 16 | 17 | def __getitem__(self, idx): 18 | return (self.x[idx], self.y[idx], len(self.x[idx]), len(self.y[idx])) 19 | 20 | 21 | class CustomDataModule(pl.LightningDataModule): 22 | def __init__(self, args: Namespace): 23 | super().__init__() 24 | self.data_dir = args.data_dir 25 | self.ratio = args.ratio 26 | self.per_device_train_batch_size = args.per_device_train_batch_size 27 | self.per_device_eval_batch_size = args.per_device_eval_batch_size 28 | self.num_workers = args.num_workers 29 | 30 | def prepare_data(self): 31 | if not os.path.isfile("./valid.pt"): 32 | features = torch.randn(2000, 512) 33 | labels = torch.randn(2000, 1) 34 | train_x, valid_x, train_y, valid_y = train_test_split(features, labels, test_size=self.ratio) 35 | train_datasets = CustomDataset(train_x, train_y) 36 | valid_datasets = CustomDataset(valid_x, valid_y) 37 | torch.save(train_datasets, "./train.pt") 38 | torch.save(valid_datasets, "./valid.pt") 39 | else: 40 | pass 41 | 42 | def setup(self, stage: str): 43 | if stage == "fit": 44 | self.train_datasets = torch.load("./train.pt") 45 | self.valid_datasets = torch.load("./valid.pt") 46 | if stage == "test": 47 | pass 48 | 49 | def train_dataloader(self): 50 | # TODO: If you want to use custom sampler and loader follow like this 51 | # from transformers.trainer_pt_utils import DistributedLengthGroupedSampler 52 | # train_sampler = DistributedLengthGroupedSampler( 53 | # batch_size=self.per_device_eval_batch_size, 54 | # dataset=self.train_datasets, 55 | # model_input_name="features", 56 | # lengths=self.train_datasets["feature_lenghths"], 57 | # ) 58 | # return CustomDataLoader( 59 | # dataset=self.train_datasets, 60 | # batch_size=self.per_device_train_batch_size, 61 | # sampler=train_sampler, 62 | # num_workers=self.num_workers, 63 | # pin_memory=True, 64 | # ) 65 | 66 | # TODO: If you want to use default loader 67 | return DataLoader( 68 | dataset=self.train_datasets, 69 | batch_size=self.per_device_train_batch_size, 70 | num_workers=self.num_workers, 71 | pin_memory=True, 72 | ) 73 | 74 | def val_dataloader(self): 75 | return DataLoader( 76 | dataset=self.valid_datasets, 77 | batch_size=self.per_device_train_batch_size, 78 | num_workers=self.num_workers, 79 | pin_memory=True, 80 | ) 81 | 82 | def test_dataloader(self): 83 | pass 84 | 
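The `CustomDataModule` above builds a random regression dataset in `prepare_data` and loads it in `setup`, so it can also be driven by hand for a quick sanity check outside `pl.Trainer`. The snippet below is an illustrative sketch (not part of the repo): it assumes the repository root is on `PYTHONPATH` and fakes the argparse namespace with only the fields the constructor actually reads.

```python
# Illustrative sketch: exercising CustomDataModule without a Trainer.
from argparse import Namespace

from models.dense_model.datamodule import CustomDataModule

args = Namespace(
    data_dir="datasets",             # mirrors the defaults in arguments/training_args.py
    ratio=0.2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_workers=0,
)
dm = CustomDataModule(args)
dm.prepare_data()                    # writes ./train.pt and ./valid.pt once
dm.setup(stage="fit")                # loads the saved CustomDataset objects
features, labels, feature_lengths, label_lengths = next(iter(dm.train_dataloader()))
print(features.shape, labels.shape)  # torch.Size([64, 512]) torch.Size([64, 1])
```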
-------------------------------------------------------------------------------- /models/dense_model/drop_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def drop_scheduler(drop_rate, epochs, niter_per_ep, cutoff_epoch=0, mode="standard", schedule="constant"): 5 | assert mode in ["standard", "early", "late"] 6 | if mode == "standard": 7 | return np.full(epochs * niter_per_ep, drop_rate) 8 | 9 | early_iters = cutoff_epoch * niter_per_ep 10 | late_iters = (epochs - cutoff_epoch) * niter_per_ep 11 | 12 | if mode == "early": 13 | assert schedule in ["constant", "linear"] 14 | if schedule == "constant": 15 | early_schedule = np.full(early_iters, drop_rate) 16 | elif schedule == "linear": 17 | early_schedule = np.linspace(drop_rate, 0, early_iters) 18 | final_schedule = np.concatenate((early_schedule, np.full(late_iters, 0))) 19 | 20 | elif mode == "late": 21 | assert schedule in ["constant"] 22 | early_schedule = np.full(early_iters, 0) 23 | final_schedule = np.concatenate((early_schedule, np.full(late_iters, drop_rate))) 24 | 25 | assert len(final_schedule) == epochs * niter_per_ep 26 | return final_schedule 27 | -------------------------------------------------------------------------------- /models/dense_model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from torchmetrics import MeanSquaredError 4 | from torch import nn 5 | from utils.config_loader import load_config 6 | 7 | 8 | class CustomNet(pl.LightningModule): 9 | def __init__(self, args): 10 | super().__init__() 11 | self.args = args 12 | self.drop_scheduler = None 13 | self.drop_rate = args.dropout_p 14 | config_cls = load_config(args.config_path) 15 | self.loss_func = MeanSquaredError(compute_on_cpu=args.valid_on_cpu) 16 | # TODO: Write down your network 17 | self.dense_batch_fc_tanh = nn.Sequential( 18 | nn.Linear(config_cls.model.input_dense_dim, config_cls.model.output_dense_dim), 19 | nn.BatchNorm1d(config_cls.model.output_dense_dim), 20 | nn.Tanh(), 21 | nn.Dropout(self.drop_rate), 22 | nn.Linear(config_cls.model.output_dense_dim, (config_cls.model.output_dense_dim // 2)), 23 | nn.BatchNorm1d((config_cls.model.output_dense_dim // 2)), 24 | nn.Tanh(), 25 | nn.Dropout(self.drop_rate), 26 | ) 27 | self.fc = nn.Linear(config_cls.model.output_dense_dim // 2, 1) 28 | 29 | def update_dropout(self, drop_rate): 30 | self.drop_rate = drop_rate 31 | for module in self.modules(): 32 | if isinstance(module, nn.Dropout): 33 | module.p = drop_rate 34 | 35 | def forward(self, features): 36 | outputs = self.dense_batch_fc_tanh(features) 37 | logits = self.fc(outputs) 38 | return logits 39 | 40 | def on_train_start(self): 41 | from models.dense_model.drop_scheduler import drop_scheduler 42 | 43 | self.drop_scheduler = {} 44 | if self.args.dropout_p > 0.0: 45 | self.drop_scheduler["do"] = drop_scheduler( 46 | self.args.dropout_p, 47 | self.args.max_epochs, 48 | self.trainer.num_training_batches, 49 | self.args.cutoff_epoch, 50 | self.args.drop_mode, 51 | self.args.drop_schedule, 52 | ) 53 | print( 54 | "on_train_start :: Min DO = %.7f, Max DO = %.7f" 55 | % (min(self.drop_scheduler["do"]), max(self.drop_scheduler["do"])) 56 | ) 57 | 58 | def training_step(self, batch, batch_idx): 59 | features, labels, feature_lengths, label_lengths = batch 60 | if "do" in self.drop_scheduler: 61 | dropout_p = self.drop_scheduler["do"][self.trainer.global_step] 62 | 
self.update_dropout(dropout_p) 63 | self.log("dropout_p", dropout_p, sync_dist=(self.device != "cpu")) 64 | logits = self(features) 65 | loss = self.loss_func(logits, labels) 66 | self.log("train_loss", loss, sync_dist=(self.device != "cpu")) 67 | return {"loss": loss} 68 | 69 | def validation_step(self, batch, batch_idx): 70 | # lightning do sanity eval step first before going training_step. for check your mistake. 71 | # I always make mistake on validation logic, so this is good 72 | # If don't use check this url. https://github.com/Lightning-AI/lightning/issues/2295 73 | features, labels, feature_lengths, label_lengths = batch 74 | if self.args.valid_on_cpu: 75 | features = features.cpu() 76 | labels = labels.cpu() 77 | feature_lengths = feature_lengths.cpu() 78 | label_lengths = label_lengths.cpu() 79 | self.cpu() 80 | logits = self(features) 81 | loss = self.loss_func(logits, labels) 82 | 83 | return {"loss": loss} 84 | 85 | def validation_epoch_end(self, validation_step_outputs): 86 | loss_mean = torch.tensor([x["loss"] for x in validation_step_outputs], device=self.device).mean() 87 | 88 | # sync_dist use follow this url 89 | # if using torchmetrics -> https://torchmetrics.readthedocs.io/en/stable/ 90 | # if not using torchmetrics -> https://github.com/Lightning-AI/lightning/discussions/6501 91 | if self.args.valid_on_cpu: 92 | # if ddp, each machine output must gather. and lightning can gather only on-gpu items 93 | self.log("val_loss", loss_mean.cuda(), sync_dist=True) 94 | # model have to training_step on cuda 95 | self.cuda() 96 | else: 97 | self.log("val_loss", loss_mean, sync_dist=(self.device != "cpu")) 98 | # self.log_dict(metrics, sync_dist=(self.device != "cpu")) 99 | 100 | def predict_step(self, batch, batch_idx): 101 | features, labels, feature_lengths, label_lengths = batch 102 | logits = self(features) 103 | return logits 104 | 105 | def configure_optimizers(self): 106 | optimizer = torch.optim.AdamW( 107 | [{"params": [p for p in self.parameters()], "name": "OneCycleLR"}], 108 | lr=self.args.learning_rate, 109 | weight_decay=self.args.weight_decay, 110 | ) 111 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 112 | optimizer, 113 | max_lr=self.args.max_lr, 114 | total_steps=self.trainer.estimated_stepping_batches, 115 | pct_start=self.args.warmup_ratio, 116 | epochs=self.trainer.max_epochs, 117 | final_div_factor=self.args.final_div_factor, 118 | ) 119 | lr_scheduler = {"interval": "step", "scheduler": scheduler, "name": "AdamW"} 120 | return {"optimizer": optimizer, "lr_scheduler": lr_scheduler} 121 | -------------------------------------------------------------------------------- /models/rnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/models/rnn_model/__init__.py -------------------------------------------------------------------------------- /models/rnn_model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchmetrics import MeanSquaredError 3 | from torch.utils.data import DataLoader, TensorDataset 4 | from pytorch_lightning import LightningModule 5 | 6 | 7 | class LSTMModel(LightningModule): 8 | """LSTM sequence-to-sequence model for testing TBPTT with automatic optimization.""" 9 | 10 | def __init__(self, args): 11 | super().__init__() 12 | self.args = args 13 | self.input_size = 1 14 | self.hidden_size = 8 15 | self.lstm = 
torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True) 16 | self.lstm2 = torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True) 17 | self.linear = torch.nn.Linear(self.hidden_size * 2, 1) 18 | self.loss_func = MeanSquaredError(compute_on_cpu=self.args.valid_on_cpu) 19 | self.truncated_bptt_steps = self.args.truncated_bptt_steps 20 | self.automatic_optimization = True 21 | 22 | def configure_optimizers(self): 23 | optimizer = torch.optim.AdamW( 24 | [{"params": [p for p in self.parameters()], "name": "OneCycleLR"}], 25 | lr=self.args.learning_rate, 26 | weight_decay=self.args.weight_decay, 27 | ) 28 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 29 | optimizer, 30 | max_lr=self.args.max_lr, 31 | total_steps=self.trainer.estimated_stepping_batches, 32 | pct_start=self.args.warmup_ratio, 33 | epochs=self.trainer.max_epochs, 34 | final_div_factor=self.args.final_div_factor, 35 | ) 36 | lr_scheduler = {"interval": "step", "scheduler": scheduler, "name": "AdamW"} 37 | return {"optimizer": optimizer, "lr_scheduler": lr_scheduler} 38 | 39 | def forward(self, x, hiddens=None): 40 | if hiddens is not None: 41 | hiddens1, hiddens2 = hiddens 42 | else: 43 | hiddens1 = None 44 | hiddens2 = None 45 | self.lstm.flatten_parameters() 46 | lstm_last, hiddens1 = self.lstm(x, hiddens1) 47 | self.lstm2.flatten_parameters() 48 | lstm2_last, hiddens2 = self.lstm2(x, hiddens2) 49 | concat_lstm = torch.concat([lstm_last, lstm2_last], dim=-1) 50 | logits = self.linear(concat_lstm) 51 | return logits, hiddens1, hiddens2 52 | 53 | def training_step(self, batch, batch_idx, hiddens): 54 | # batch_idx: Original step indices, Not TBPTT index (1 step == 1 batch) 55 | # hiddens: TBPTT use backwards each sequence using this data 56 | 57 | # On tbptt, backpropagation is used CHUNK by long sequence. when if using 200 sequence and 100 step chunk, 58 | # training_step is needed 2 step for 1 batch (1 step: 0~99, 2 step: 100~199) 59 | # very cleverly, we just using hiddens parameter, lightning's tbptt not connected new batch's hiddens to past one 60 | x, y = batch 61 | logits, hiddens1, hiddens2 = self(x, hiddens) 62 | loss = self.loss_func(logits, y) 63 | self.log("train_loss", loss, sync_dist=(self.device != "cpu")) 64 | # look this discussion for tbptt experiment (https://github.com/Lightning-AI/lightning/discussions/15643) 65 | return {"loss": loss, "hiddens": (hiddens1, hiddens2)} 66 | 67 | def validation_step(self, batch, batch_idx): 68 | x, y = batch 69 | if self.args.valid_on_cpu: 70 | x = x.cpu() 71 | y = y.cpu() 72 | self.cpu() 73 | self.lstm.flatten_parameters() 74 | lstm_last, _ = self.lstm(x) 75 | self.lstm2.flatten_parameters() 76 | lstm2_last, _ = self.lstm2(x) 77 | concat_lstm = torch.concat([lstm_last, lstm2_last], dim=-1) 78 | logits = self.linear(concat_lstm) 79 | return {"pred": logits, "labels": y} 80 | 81 | def validation_epoch_end(self, validation_step_outputs): 82 | for out in validation_step_outputs: 83 | loss = self.loss_func(out["pred"], out["labels"]) 84 | if self.args.valid_on_cpu: 85 | # if ddp, each machine output must gather. 
and lightning can gather only on-gpu items 86 | self.log("val_loss", loss.cuda(), sync_dist=True) 87 | # model have to training_step on cuda 88 | self.cuda() 89 | else: 90 | self.log("val_loss", loss, sync_dist=(self.device != "cpu")) 91 | 92 | def predict_step(self, batch, batch_idx): 93 | x, y = batch 94 | logits = self(x) 95 | return logits 96 | 97 | def train_dataloader(self): 98 | dataset = TensorDataset(torch.rand(2000, 200, self.input_size), torch.rand(2000, 200, self.input_size)) 99 | return DataLoader( 100 | dataset=dataset, num_workers=self.args.num_workers, batch_size=self.args.per_device_train_batch_size 101 | ) 102 | 103 | def val_dataloader(self): 104 | dataset = TensorDataset(torch.rand(2000, 200, self.input_size), torch.rand(2000, 200, self.input_size)) 105 | return DataLoader( 106 | dataset=dataset, num_workers=self.args.num_workers, batch_size=self.args.per_device_eval_batch_size 107 | ) 108 | -------------------------------------------------------------------------------- /pip_install_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_ARCH_LIST=`CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"` 3 | arch="`echo $CUDA_ARCH_LIST | cut -c2`.`echo $CUDA_ARCH_LIST | cut -c5`" 4 | 5 | git clone https://github.com/microsoft/DeepSpeed/ 6 | cd DeepSpeed 7 | rm -rf build 8 | TORCH_CUDA_ARCH_LIST=$arch DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 pip install . \ 9 | --global-option="build_ext" --global-option="-j8" --no-cache -v \ 10 | --disable-pip-version-check 2>&1 | tee build.log -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | pytorch-lightning==1.9.4 3 | pandas 4 | numpy 5 | setproctitle 6 | wandb 7 | flake8 8 | black 9 | tqdm 10 | scikit-learn 11 | simple_parsing==0.1.0 -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Warning Use DeepSpeed 2 | 3 | DeepSpeed **ONLY** can be ran on **Multi GPU** 4 | 5 | 6 | 7 | # Warning Use Data parallel or Distributed Data Parallel on Torchmetrics 8 | 9 | https://torchmetrics.readthedocs.io/en/stable/pages/overview.html#metrics-in-dataparallel-dp-mode 10 | 11 | https://torchmetrics.readthedocs.io/en/stable/pages/overview.html#metrics-in-distributed-data-parallel-ddp-mode 12 | 13 | 14 | 15 | This `run_train_gpu.sh` is **ONLY** can be ran on **SINGLE GPU!** 16 | 17 | **If you want to use Multi GPU DP, you have to write additional gather source code on evaluation_step_end** 18 | 19 | 20 | 21 | `run_train_gpu_ddp.sh` is ran well, **but I'm not sure on metric gather is correctly** 22 | 23 | -------------------------------------------------------------------------------- /scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS=3 3 | 4 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 5 | python3 inference.py \ 6 | --model_path="model_outputs/lightning-template-epoch=02-val_loss=0.0842-v2.ckpt" \ 7 | --config_path="config/dense_model.json" \ 8 | --seed=42 \ 9 | --accelerator=gpu \ 10 | --devices=1 \ 11 | --auto_select_gpus=true \ 12 | --model_select=rnn \ 13 | --truncated_bptt_steps=2 -------------------------------------------------------------------------------- 
/scripts/run_inference_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | export OMP_NUM_THREADS=8 4 | export CUDA_LAUNCH_BLOCKING=1 5 | export WANDB_DISABLED=false 6 | export TOKENIZERS_PARALLELISM=false 7 | 8 | 9 | model_path=model_outputs/lightning-template-epoch=00-val_loss=0.0846.ckpt 10 | python3 $model_path/zero_to_fp32.py $model_path $model_path/checkpoint/model.bin 11 | 12 | if [ $? -eq "0" ]; then 13 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 14 | python3 -m torch.distributed.launch --nproc_per_node=4 inference_deepspeed.py \ 15 | --seed=42 \ 16 | --model_path=$model_path/checkpoint/model.bin \ 17 | --config_path=config/dense_model.json \ 18 | --accelerator=gpu \ 19 | --strategy=ddp \ 20 | --devices=4 \ 21 | --num_nodes=1 \ 22 | --model_select=rnn \ 23 | --truncated_bptt_steps=2 24 | fi 25 | 26 | if [ $? -eq "0" ]; then 27 | python3 ddp_inference_gather.py 28 | fi -------------------------------------------------------------------------------- /scripts/run_train_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python3 train.py \ 3 | --output_dir="model_outputs/" \ 4 | --data_dir="" \ 5 | --config_path="config/dense_model.json" \ 6 | --seed=42 \ 7 | --accelerator=cpu \ 8 | --num_workers=12 \ 9 | --per_device_train_batch_size=64 \ 10 | --per_device_eval_batch_size=64 \ 11 | --val_check_interval=0.25 \ 12 | --accumulate_grad_batches=1 \ 13 | --max_epochs=3 \ 14 | --log_every_n_steps=100 \ 15 | --auto_scale_batch_size=false \ 16 | --learning_rate=0.00005 \ 17 | --max_lr=0.0001 \ 18 | --weight_decay=0.0001 \ 19 | --warmup_ratio=0.2 \ 20 | --ratio=0.2 \ 21 | --div_factor=10 \ 22 | --final_div_factor=10 \ 23 | --model_select=linear 24 | -------------------------------------------------------------------------------- /scripts/run_train_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | export OMP_NUM_THREADS=8 4 | export CUDA_LAUNCH_BLOCKING=1 5 | export WANDB_DISABLED=false 6 | export TOKENIZERS_PARALLELISM=false 7 | export LOCAL_RANK=0 8 | 9 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 10 | python3 -m torch.distributed.launch --nnodes=1 --nproc_per_node=4 ./train.py \ 11 | --output_dir="model_outputs/" \ 12 | --data_dir="" \ 13 | --config_path="config/dense_model.json" \ 14 | --seed=42 \ 15 | --num_workers=12 \ 16 | --per_device_train_batch_size=64 \ 17 | --per_device_eval_batch_size=64 \ 18 | --val_check_interval=0.25 \ 19 | --accumulate_grad_batches=1 \ 20 | --max_epochs=3 \ 21 | --log_every_n_steps=1 \ 22 | --accelerator=gpu \ 23 | --strategy=deepspeed_stage_2 \ 24 | --num_nodes=1 \ 25 | --replace_sampler_ddp=false \ 26 | --devices=4 \ 27 | --auto_select_gpus=true \ 28 | --auto_scale_batch_size=false \ 29 | --learning_rate=0.00005 \ 30 | --max_lr=0.0001 \ 31 | --weight_decay=0.0001 \ 32 | --warmup_ratio=0.2 \ 33 | --ratio=0.2 \ 34 | --div_factor=10 \ 35 | --final_div_factor=10 \ 36 | --valid_on_cpu=false \ 37 | --model_select=rnn \ 38 | --truncated_bptt_steps=2 \ 39 | --deepspeed_config=ds_config/zero2.json 40 | -------------------------------------------------------------------------------- /scripts/run_train_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS=0 3 | 4 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 5 | python3 ./train.py \ 6 | --output_dir="../models/" \ 7 | --data_dir="" \ 8 | 
--config_path="config/dense_model.json" \ 9 | --seed=42 \ 10 | --num_workers=12 \ 11 | --per_device_train_batch_size=64 \ 12 | --per_device_eval_batch_size=64 \ 13 | --val_check_interval=0.25 \ 14 | --accumulate_grad_batches=1 \ 15 | --max_epochs=3 \ 16 | --log_every_n_steps=1 \ 17 | --accelerator=gpu \ 18 | --replace_sampler_ddp=false \ 19 | --devices=1 \ 20 | --auto_scale_batch_size=false \ 21 | --learning_rate=0.00005 \ 22 | --max_lr=0.0001 \ 23 | --weight_decay=0.0001 \ 24 | --warmup_ratio=0.2 \ 25 | --ratio=0.2 \ 26 | --div_factor=10 \ 27 | --final_div_factor=10 \ 28 | --valid_on_cpu=false \ 29 | --model_select=linear \ 30 | --dropout_p=0.1 \ 31 | --cutoff_epoch=1 \ 32 | --drop_mode=standard \ 33 | --drop_schedule=constant 34 | -------------------------------------------------------------------------------- /scripts/run_train_gpu_ddp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GPU_IDS="0,1,2,3" 3 | 4 | OMP_NUM_THREADS=8 \ 5 | CUDA_VISIBLE_DEVICES=$GPU_IDS \ 6 | python3 -m torch.distributed.launch --nnodes=1 --nproc_per_node=4 ./train.py \ 7 | --output_dir="model_outputs/" \ 8 | --data_dir="" \ 9 | --seed=42 \ 10 | --num_workers=12 \ 11 | --per_device_train_batch_size=64 \ 12 | --per_device_eval_batch_size=64 \ 13 | --val_check_interval=0.25 \ 14 | --accumulate_grad_batches=1 \ 15 | --max_epochs=3 \ 16 | --log_every_n_steps=1 \ 17 | --accelerator=gpu \ 18 | --strategy=ddp \ 19 | --num_nodes=1 \ 20 | --replace_sampler_ddp=false \ 21 | --devices=4 \ 22 | --auto_scale_batch_size=false \ 23 | --learning_rate=0.00005 \ 24 | --max_lr=0.0001 \ 25 | --weight_decay=0.0001 \ 26 | --warmup_ratio=0.2 \ 27 | --ratio=0.2 \ 28 | --div_factor=10 \ 29 | --final_div_factor=10 \ 30 | --valid_on_cpu=false \ 31 | --model_select=rnn \ 32 | --truncated_bptt_steps=1 33 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pytorch_lightning as pl 4 | from pytorch_lightning.strategies import DDPStrategy 5 | from datetime import timedelta 6 | from pytorch_lightning.callbacks import ModelCheckpoint 7 | from pytorch_lightning.loggers import WandbLogger 8 | from pytorch_lightning.callbacks import LearningRateMonitor 9 | from models.dense_model.model import CustomNet 10 | from models.dense_model.datamodule import CustomDataModule 11 | from models.rnn_model.model import LSTMModel 12 | from simple_parsing import ArgumentParser 13 | from arguments.training_args import TrainingArguments 14 | from utils.comfy import dataclass_to_namespace 15 | 16 | 17 | def main(hparams): 18 | wandb_logger = WandbLogger(project="lightning-template", name="default", save_dir="./") 19 | pl.seed_everything(hparams.seed) 20 | os.makedirs(hparams.output_dir, exist_ok=True) 21 | hparams.logger = wandb_logger 22 | 23 | checkpoint_callback = ModelCheckpoint( 24 | dirpath=hparams.output_dir, 25 | save_top_k=3, 26 | mode="min", 27 | monitor="val_loss", 28 | filename="lightning-template-{epoch:02d}-{val_loss:.4f}", 29 | ) 30 | lr_monitor = LearningRateMonitor(logging_interval="step") 31 | hparams.callbacks = [checkpoint_callback, lr_monitor] 32 | 33 | if hparams.accelerator == "cpu" and hparams.valid_on_cpu is True: 34 | print("If you run on cpu, valid must go on cpu, It set automatically") 35 | hparams.valid_on_cpu = False 36 | elif hparams.strategy == "ddp": 37 | hparams.strategy = DDPStrategy(timeout=timedelta(days=30)) 38 | 
elif hparams.strategy == "deepspeed_stage_2": 39 | if hparams.deepspeed_config is not None: 40 | from pytorch_lightning.strategies import DeepSpeedStrategy 41 | 42 | hparams.strategy = DeepSpeedStrategy(config=hparams.deepspeed_config) 43 | elif hparams.accelerator != "cpu" and (hparams.strategy is not None and "deepspeed" in hparams.strategy): 44 | raise NotImplementedError("If you want to another deepspeed option and config, PLZ IMPLEMENT FIRST!!") 45 | trainer = pl.Trainer.from_argparse_args(hparams) 46 | 47 | if hparams.model_select == "linear": 48 | datamodule = CustomDataModule(hparams) 49 | model = CustomNet(hparams) 50 | wandb_logger.watch(model, log="all") 51 | trainer.fit(model, datamodule=datamodule) 52 | """ TODO If use config like dict follow this line 53 | but, model param is duplicated area between training param and model param 54 | I want to get training param on run script argument, so I can not use it 55 | """ 56 | # config_cls = load_config(hparams.config_dir) 57 | # config = config_to_dict(config_cls) 58 | # with open(os.path.join(hparams.output_dir, "config.json"), "w") as f: 59 | # json.dump(config, f, ensure_ascii=False, indent=4) 60 | else: 61 | model = LSTMModel(hparams) 62 | wandb_logger.watch(model, log="all") 63 | trainer.fit(model) 64 | # TODO If finetuning follow this line 65 | # PreTrainedLightningModule.load_state_dict( 66 | # torch.load( 67 | # "", 68 | # map_location="cuda", 69 | # ), 70 | # strict=False, 71 | # ) 72 | checkpoint_callback.best_model_path 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = ArgumentParser() 77 | parser = pl.Trainer.add_argparse_args(parser) 78 | parser.add_arguments(TrainingArguments, dest="training_args") 79 | args = parser.parse_args() 80 | args = dataclass_to_namespace(args, "training_args") 81 | main(args) 82 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # config_loader.py 2 | 3 | It is looked very useful, just like HuggingFace model config loader. But I feel something uncomportable. 4 | 5 | Because, When if you use `pl.LightningModule` on your model code, you have to write `optimizing_step` on your same class. 
6 | 7 | I'm enjoying modify training argument in scripts(not model config), but if you used `pl.LightningModule` and `config_loader.py` , maybe you have to load 2 argument in class `__init__` about `training script argument` and `model_config` class -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YooSungHyun/pytorch-lightning-template/9db69c15a179e33b65a98a6605be3b759ccec84e/utils/__init__.py -------------------------------------------------------------------------------- /utils/comfy.py: -------------------------------------------------------------------------------- 1 | def dataclass_to_namespace(args, args_name): 2 | # Dataclass arg to python namespace 3 | if args.__contains__(args_name): 4 | for key, value in args.__getattribute__(args_name).__dict__.items(): 5 | args.__setattr__(key, value) 6 | args.__delattr__(args_name) 7 | return args 8 | -------------------------------------------------------------------------------- /utils/config_loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ConfigLoader: 5 | def __init__(self, **kwargs): 6 | for k, v in kwargs.items(): 7 | if type(v) == dict: 8 | v = ConfigLoader(**v) # use recursively 9 | self[k] = v 10 | 11 | def keys(self): 12 | return self.__dict__.keys() 13 | 14 | def items(self): 15 | return self.__dict__.items() 16 | 17 | def values(self): 18 | return self.__dict__.values() 19 | 20 | def __len__(self): 21 | return len(self.__dict__) 22 | 23 | def __getitem__(self, key): 24 | return getattr(self, key) 25 | 26 | def __setitem__(self, key, value): 27 | return setattr(self, key, value) 28 | 29 | def __contains__(self, key): 30 | return key in self.__dict__ 31 | 32 | def __repr__(self): 33 | return self.__dict__.__repr__() 34 | 35 | 36 | def config_to_dict(config: ConfigLoader): 37 | result = dict() 38 | for k, v in config.items(): 39 | if type(v) == ConfigLoader: 40 | result[k] = config_to_dict(v) 41 | else: 42 | result[k] = v 43 | return result 44 | 45 | 46 | def load_config(config_filename: str): 47 | with open(config_filename, "r") as f: 48 | data = f.read() 49 | config = json.loads(data) 50 | hparams = ConfigLoader(**config) 51 | return hparams 52 | --------------------------------------------------------------------------------
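For reference, the sketch below (not part of the repo) shows how `load_config` and `config_to_dict` from `utils/config_loader.py` behave with the bundled `config/dense_model.json`; it assumes the repository root is the working directory.

```python
# Illustrative sketch: loading the bundled JSON config with ConfigLoader.
from utils.config_loader import config_to_dict, load_config

cfg = load_config("config/dense_model.json")
print(cfg.model.input_dense_dim)         # 512 -- nested dicts become attribute access
print(cfg["model"]["output_dense_dim"])  # 256 -- item access works as well
print(config_to_dict(cfg))               # back to a plain dict, e.g. for json.dump
```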