├── deep_learning_template ├── __init__.py ├── loss │ ├── __init__.py │ └── build.py ├── metric │ ├── __init__.py │ ├── accuracy.py │ └── build.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── trainer │ ├── __init__.py │ ├── build.py │ ├── utils.py │ └── base.py ├── data │ ├── samplers │ │ ├── __init__.py │ │ ├── build.py │ │ ├── iteration_based_batch_sampler.py │ │ ├── image_size_batch_sampler.py │ │ └── ordered_distributed_sampler.py │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── cifar10.py │ │ ├── roi_z_ds.py │ │ └── mnist.py │ ├── collators │ │ ├── default_batch_collator.py │ │ └── build.py │ ├── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py │ └── build.py ├── evaluators │ ├── __init__.py │ └── build.py ├── config │ ├── __init__.py │ ├── paths_catalog.py │ └── defaults.py ├── modeling │ └── models │ │ ├── __init__.py │ │ ├── models.py │ │ ├── lenet5.py │ │ └── resnet.py ├── csrc │ ├── cpu │ │ └── vision.h │ ├── vision.cpp │ ├── cuda │ │ ├── vision.h │ │ └── SigmoidFocalLoss_cuda.cu │ └── SigmoidFocalLoss.h ├── utils │ ├── os_utils.py │ ├── miscellaneous.py │ ├── imports.py │ ├── tb_utils.py │ ├── logger.py │ ├── timer.py │ ├── registry.py │ └── comm.py ├── registry.py ├── visualizers │ └── build.py └── engine │ ├── defaults.py │ └── launch.py ├── tests └── lr.jpg ├── requirements.txt ├── configs └── mnist │ └── default.yaml ├── .gitignore ├── tools ├── train_net.py └── test_net.py ├── setup.py └── README.md /deep_learning_template/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/loss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/metric/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/solver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/data/samplers/build.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep_learning_template/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /deep_learning_template/data/__init__.py: -------------------------------------------------------------------------------- 1 
| from .build import make_data_loader 2 | -------------------------------------------------------------------------------- /deep_learning_template/modeling/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import build_model 2 | -------------------------------------------------------------------------------- /tests/lr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ootts/deep_learning_template/HEAD/tests/lr.jpg -------------------------------------------------------------------------------- /deep_learning_template/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | -------------------------------------------------------------------------------- /deep_learning_template/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | def isckpt(fname: str): 2 | return fname.endswith('.pth') 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | dl_ext 3 | yacs 4 | torchvision 5 | numpy 6 | torch 7 | termcolor 8 | tqdm 9 | Pillow 10 | tensorboardX 11 | -------------------------------------------------------------------------------- /deep_learning_template/metric/accuracy.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | 3 | 4 | def accuracy(x: Tensor, y: Tensor): 5 | return (x.argmax(-1) == y).sum().float() / y.shape[0] 6 | -------------------------------------------------------------------------------- /deep_learning_template/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist import MNIST 2 | from .cifar10 import CIFAR10 3 | from .roi_z_ds import ROI_Z_DS 4 | 5 | __all__ = ["MNIST", "CIFAR10", "ROI_Z_DS"] 6 | -------------------------------------------------------------------------------- /deep_learning_template/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | from dl_ext.pytorch_ext import is_main_process 2 | 3 | 4 | def save_config(cfg, path): 5 | if is_main_process(): 6 | with open(path, 'w') as f: 7 | f.write(cfg.dump()) 8 | -------------------------------------------------------------------------------- /deep_learning_template/data/collators/default_batch_collator.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data._utils.collate import default_collate 2 | 3 | 4 | class DefaultBatchCollator(object): 5 | def __call__(self, batch): 6 | return default_collate(batch) 7 | -------------------------------------------------------------------------------- /deep_learning_template/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /deep_learning_template/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | 
#include "SigmoidFocalLoss.h" 2 | 3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 4 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 5 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 6 | } -------------------------------------------------------------------------------- /deep_learning_template/registry.py: -------------------------------------------------------------------------------- 1 | from deep_learning_template.utils.registry import Registry 2 | 3 | BATCH_COLLATORS = Registry() 4 | LOSS_FUNCTIONS = Registry() 5 | METRIC_FUNCTIONS = Registry() 6 | EVALUATORS = Registry() 7 | VISUALIZERS = Registry() 8 | TRAINERS = Registry() 9 | SAMPLERS = Registry() 10 | -------------------------------------------------------------------------------- /deep_learning_template/modeling/models/models.py: -------------------------------------------------------------------------------- 1 | from .lenet5 import LeNet5 2 | from .resnet import ResNet18, ResNet50 3 | 4 | _META_ARCHITECTURES = {'LeNet5': LeNet5, 'ResNet18': ResNet18, 'ResNet50': ResNet50} 5 | 6 | 7 | def build_model(cfg): 8 | meta_arch = _META_ARCHITECTURES[cfg.model.meta_architecture] 9 | return meta_arch(cfg) 10 | -------------------------------------------------------------------------------- /deep_learning_template/trainer/build.py: -------------------------------------------------------------------------------- 1 | from deep_learning_template.registry import TRAINERS 2 | from deep_learning_template.trainer.base import BaseTrainer 3 | 4 | 5 | @TRAINERS.register('base') 6 | def build_base_trainer(cfg): 7 | return BaseTrainer(cfg) 8 | 9 | 10 | def build_trainer(cfg) -> BaseTrainer: 11 | return TRAINERS[cfg.solver.trainer](cfg) 12 | -------------------------------------------------------------------------------- /deep_learning_template/visualizers/build.py: -------------------------------------------------------------------------------- 1 | from deep_learning_template.registry import VISUALIZERS 2 | 3 | 4 | @VISUALIZERS.register('default') 5 | class DefaultVisualizer: 6 | def __init__(self, cfg): 7 | self.cfg = cfg 8 | 9 | def __call__(self, *args, **kwargs): 10 | return 11 | 12 | 13 | def build_visualizer(cfg): 14 | return VISUALIZERS[cfg.test.visualizer](cfg) 15 | -------------------------------------------------------------------------------- /deep_learning_template/data/collators/build.py: -------------------------------------------------------------------------------- 1 | from deep_learning_template.registry import BATCH_COLLATORS 2 | from .default_batch_collator import DefaultBatchCollator 3 | 4 | 5 | @BATCH_COLLATORS.register('DefaultBatchCollator') 6 | def build_default_batch_colloator(cfg): 7 | return DefaultBatchCollator() 8 | 9 | 10 | def make_batch_collator(cfg): 11 | return BATCH_COLLATORS[cfg.dataloader.collator](cfg) 12 | -------------------------------------------------------------------------------- /deep_learning_template/utils/imports.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | import sys 4 | 5 | 6 | def import_file(module_name, file_path, make_importable=False): 7 | spec = importlib.util.spec_from_file_location(module_name, file_path) 8 | module = importlib.util.module_from_spec(spec) 9 | spec.loader.exec_module(module) 10 | if make_importable: 11 | sys.modules[module_name] = module 12 | return module 13 | 
-------------------------------------------------------------------------------- /configs/mnist/default.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | meta_architecture: LeNet5 3 | datasets: 4 | train: ("MNIST_TRAIN",) 5 | test: MNIST_TEST 6 | dataloader: 7 | num_workers: 4 8 | input: 9 | do_normalize: False 10 | min_size_train: (28,) 11 | max_size_train: 28 12 | min_size_test: 28 13 | max_size_test: 28 14 | solver: 15 | loss_function: cross_entropy_loss 16 | metric_functions: ("accuracy",) 17 | batch_size: 128 18 | num_epochs: 10 19 | trainer: base 20 | test: 21 | batch_size: 128 22 | evaluators: ('mnist',) -------------------------------------------------------------------------------- /deep_learning_template/loss/build.py: -------------------------------------------------------------------------------- 1 | from torch.nn import CrossEntropyLoss, SmoothL1Loss 2 | 3 | from deep_learning_template.registry import LOSS_FUNCTIONS 4 | 5 | 6 | @LOSS_FUNCTIONS.register('cross_entropy_loss') 7 | def build_cross_entropy_loss(cfg): 8 | return CrossEntropyLoss() 9 | 10 | 11 | @LOSS_FUNCTIONS.register('smooth_l1_loss') 12 | def build_cross_entropy_loss(cfg): 13 | return SmoothL1Loss() 14 | 15 | 16 | def build_loss_function(cfg): 17 | return LOSS_FUNCTIONS[cfg.solver.loss_function](cfg) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.so 6 | *.egg-info/ 7 | build/ 8 | dist/ 9 | # pytorch/python/numpy formats 10 | *.pth 11 | *.pkl 12 | *.npy 13 | 14 | # ipython/jupyter notebooks 15 | **/.ipynb_checkpoints/ 16 | 17 | # Editor temporaries 18 | *.swn 19 | *.swo 20 | *.swp 21 | *~ 22 | 23 | # Pycharm editor settings 24 | .idea 25 | 26 | # vscode editor settings 27 | .vscode 28 | 29 | # MacOS 30 | .DS_Store 31 | 32 | # project dirs 33 | /datasets 34 | /models 35 | /output 36 | /.idea/ 37 | -------------------------------------------------------------------------------- /deep_learning_template/metric/build.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from torch import Tensor 4 | 5 | from deep_learning_template.registry import METRIC_FUNCTIONS 6 | from .accuracy import accuracy 7 | 8 | 9 | @METRIC_FUNCTIONS.register('accuracy') 10 | def build_accuracy(cfg): 11 | return accuracy 12 | 13 | 14 | def build_metric_functions(cfg): 15 | metric_functions = {} 16 | for k in cfg.solver.metric_functions: 17 | v = METRIC_FUNCTIONS[k](cfg) 18 | metric_functions[k] = v 19 | return metric_functions 20 | -------------------------------------------------------------------------------- /deep_learning_template/utils/tb_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | 4 | 5 | def get_summary_writer(logdir=None, comment='', purge_step=None, max_queue=10, 6 | flush_secs=120, filename_suffix='', write_to_disk=True, log_dir=None, **kwargs): 7 | if os.path.exists(logdir): 8 | os.system(f'rm {logdir}/events*') 9 | tb = SummaryWriter(logdir, comment, purge_step, max_queue, flush_secs, filename_suffix, 10 | write_to_disk, log_dir, **kwargs) 11 | return tb 12 | -------------------------------------------------------------------------------- /deep_learning_template/csrc/cuda/vision.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | 5 | at::Tensor SigmoidFocalLoss_forward_cuda( 6 | const at::Tensor& logits, 7 | const at::Tensor& targets, 8 | const int num_classes, 9 | const float gamma, 10 | const float alpha); 11 | 12 | at::Tensor SigmoidFocalLoss_backward_cuda( 13 | const at::Tensor& logits, 14 | const at::Tensor& targets, 15 | const at::Tensor& d_losses, 16 | const int num_classes, 17 | const float gamma, 18 | const float alpha); 19 | 20 | -------------------------------------------------------------------------------- /deep_learning_template/data/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | from torchvision.datasets import cifar 2 | 3 | 4 | class CIFAR10(cifar.CIFAR10): 5 | def __init__(self, root, train, transforms, ds_len=-1): 6 | super().__init__(root, train=train, transform=None, target_transform=None, download=True) 7 | self.transforms2 = transforms 8 | self.ds_len = ds_len 9 | 10 | def __getitem__(self, index): 11 | x, y = super().__getitem__(index) 12 | x, y = self.transforms2(x, y) 13 | return x, y 14 | 15 | def __len__(self): 16 | if self.ds_len < 0: 17 | return super().__len__() 18 | else: 19 | return self.ds_len 20 | -------------------------------------------------------------------------------- /deep_learning_template/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | 6 | def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | if distributed_rank > 0: 10 | return logger 11 | ch = logging.StreamHandler(stream=sys.stdout) 12 | ch.setLevel(logging.DEBUG) 13 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 14 | ch.setFormatter(formatter) 15 | logger.addHandler(ch) 16 | 17 | if save_dir: 18 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 19 | fh.setLevel(logging.DEBUG) 20 | fh.setFormatter(formatter) 21 | logger.addHandler(fh) 22 | 23 | return logger 24 | -------------------------------------------------------------------------------- /deep_learning_template/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | class IterationBasedBatchSampler: 2 | """ 3 | Wraps a BatchSampler, resampling from it until 4 | a specified number of iterations have been sampled 5 | """ 6 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 7 | self.batch_sampler = batch_sampler 8 | self.num_iterations = num_iterations 9 | self.start_iter = start_iter 10 | 11 | def __iter__(self): 12 | iteration = self.start_iter 13 | while iteration <= self.num_iterations: 14 | for batch in self.batch_sampler: 15 | iteration += 1 16 | if iteration > self.num_iterations: 17 | break 18 | yield batch 19 | 20 | def __len__(self): 21 | return self.num_iterations 22 | -------------------------------------------------------------------------------- /deep_learning_template/modeling/models/lenet5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class LeNet5(nn.Module): 5 | def __init__(self, cfg): 6 | super().__init__() 7 | self.conv1 = nn.Sequential( 8 | nn.Conv2d(1, 6, 5, 1, 2), 9 | nn.ReLU(), 10 | nn.AvgPool2d(2, 2) 11 | ) 12 | self.conv2 = nn.Sequential( 13 | nn.Conv2d(6, 
16, 5, 1), 14 | nn.ReLU(), 15 | nn.AvgPool2d(2, 2) 16 | ) 17 | self.fc = nn.Sequential( 18 | nn.Linear(400, 120), 19 | nn.ReLU(), 20 | nn.Linear(120, 84), 21 | nn.ReLU(), 22 | nn.Linear(84, 10) 23 | ) 24 | 25 | def forward(self, x): 26 | bsz = x.shape[0] 27 | x = self.conv1(x) 28 | x = self.conv2(x) 29 | x = x.view(bsz, -1) 30 | x = self.fc(x) 31 | return x 32 | -------------------------------------------------------------------------------- /deep_learning_template/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | 4 | import torch 5 | 6 | 7 | class Timer(object): 8 | def __init__(self): 9 | self.reset() 10 | 11 | @property 12 | def average_time(self): 13 | return self.total_time / self.calls if self.calls > 0 else 0.0 14 | 15 | def tic(self, sync_cuda=False): 16 | if sync_cuda: 17 | torch.cuda.synchronize() 18 | self.start_time = time.time() 19 | 20 | def toc(self, sync_cuda=False): 21 | if sync_cuda: 22 | torch.cuda.synchronize() 23 | self.add(time.time() - self.start_time) 24 | 25 | def add(self, time_diff): 26 | self.diff = time_diff 27 | self.total_time += self.diff 28 | self.calls += 1 29 | 30 | def reset(self): 31 | self.total_time = 0.0 32 | self.calls = 0 33 | self.start_time = 0.0 34 | self.diff = 0.0 35 | 36 | def avg_time_str(self): 37 | time_str = str(datetime.timedelta(seconds=self.average_time)) 38 | return time_str 39 | -------------------------------------------------------------------------------- /deep_learning_template/data/datasets/roi_z_ds.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.dataset import Dataset 3 | import os 4 | from dl_ext.vision_ext.transforms import imagenet_normalize 5 | from PIL import Image 6 | from torchvision.transforms.transforms import * 7 | 8 | 9 | class ROI_Z_DS(Dataset): 10 | 11 | def __init__(self, root, split, transforms=None, ds_len=-1): 12 | self.root = root 13 | self.split = split 14 | self.len = len(os.listdir(os.path.join(self.root, self.split))) - 1 15 | if ds_len > 0: 16 | self.len = ds_len 17 | self.zs = torch.load(os.path.join(self.root, self.split, 'zs.pth'), 'cpu') 18 | self.transforms = Compose([ 19 | Resize((224,224)), 20 | ToTensor(), 21 | imagenet_normalize 22 | ]) 23 | print('dataset length', self.len) 24 | 25 | def __getitem__(self, i: int): 26 | img = os.path.join(self.root, self.split, str(i) + '.webp') 27 | img = Image.open(img) 28 | z = torch.tensor(self.zs[i]) 29 | img = self.transforms(img) 30 | return img, z 31 | 32 | def __len__(self) -> int: 33 | return self.len 34 | -------------------------------------------------------------------------------- /deep_learning_template/trainer/utils.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | from dl_ext.pytorch_ext import get_rank 4 | 5 | 6 | def to_cuda(x): 7 | if hasattr(x, 'cuda'): 8 | return x.cuda(device=get_rank()) 9 | elif isinstance(x, (list, tuple)): 10 | return [to_cuda(xi) for xi in x] 11 | elif isinstance(x, dict): 12 | return {k: to_cuda(v) for k, v in x.items()} 13 | 14 | 15 | def to_cpu(x): 16 | if hasattr(x, 'cpu'): 17 | return x.cpu() 18 | elif isinstance(x, (list, tuple)): 19 | return [to_cpu(xi) for xi in x] 20 | elif isinstance(x, dict): 21 | return {k: to_cpu(v) for k, v in x.items()} 22 | 23 | 24 | def batch_gpu(batch): 25 | x, y = batch 26 | return to_cuda(x), to_cuda(y) 27 | 28 | 29 | def format_time(t): 30 | t = 
int(t) 31 | h, m, s = t // 3600, (t // 60) % 60, t % 60 32 | if h != 0: 33 | return f'{h}:{m:02d}:{s:02d}' 34 | else: 35 | return f'{m:02d}:{s:02d}' 36 | 37 | 38 | class TrainerState(IntEnum): 39 | BASE = 1 40 | PARALLEL = 2 41 | DISTRIBUTEDPARALLEL = 3 42 | 43 | 44 | def split_list(vals, skip_start: int, skip_end: int): 45 | return vals[skip_start:-skip_end] if skip_end > 0 else vals[skip_start:] 46 | -------------------------------------------------------------------------------- /deep_learning_template/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.type().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /deep_learning_template/evaluators/build.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | 4 | from deep_learning_template.metric.accuracy import accuracy 5 | from deep_learning_template.registry import EVALUATORS 6 | from deep_learning_template.utils import comm 7 | 8 | 9 | class DefaultEvaluator: 10 | def __init__(self, cfg): 11 | pass 12 | 13 | def __call__(self, *args, **kwargs): 14 | raise NotImplementedError() 15 | 16 | 17 | class MnistEvaluator(DefaultEvaluator): 18 | def __call__(self, x, ds): 19 | if comm.get_rank() == 0: 20 | y = [] 21 | print('collecting targets...') 22 | for batch in tqdm(ds): 23 | y.append(batch[1]) 24 | y = torch.tensor(y).to(x.device) 25 | acc = accuracy(x, y) 26 | print(acc.item()) 27 | 28 | 29 | CIFAR10Evaluator = MnistEvaluator 30 | 31 | 32 | @EVALUATORS.register('mnist') 33 | def mnist_evaluator(cfg): 34 | return MnistEvaluator(cfg) 35 | 36 | 37 | @EVALUATORS.register('cifar10') 38 | def cifar10_evaluator(cfg): 39 | return CIFAR10Evaluator(cfg) 40 | 41 | 42 | def build_evaluators(cfg): 43 | evaluators = [] 44 | for e in cfg.test.evaluators: 45 | evaluators.append(EVALUATORS[e](cfg)) 46 | return evaluators 47 | -------------------------------------------------------------------------------- /deep_learning_template/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, is_train=True): 5 | if is_train: 6 | min_size = cfg.input.min_size_train 7 | max_size = cfg.input.max_size_train 8 | horizontal_flip_prob = cfg.input.horizontal_flip_prob 9 | brightness = cfg.input.brightness 10 | contrast = cfg.input.contrast 11 | saturation = cfg.input.saturation 12 | hue = cfg.input.hue 13 | else: 14 | min_size = cfg.input.min_size_test 15 | max_size = cfg.input.max_size_test 16 | horizontal_flip_prob = 0.0 17 | brightness = 0.0 18 | contrast = 0.0 19 | saturation = 0.0 20 | hue = 0.0 21 | 22 | normalize_transform = T.Normalize( 23 | mean=cfg.input.pixel_mean, std=cfg.input.pixel_std 24 | ) 25 | color_jitter = T.ColorJitter( 26 | brightness=brightness, 27 | contrast=contrast, 28 | saturation=saturation, 29 | hue=hue, 30 | ) 31 | ts = [ 32 | T.Resize(min_size, max_size), 33 | color_jitter, 34 | T.RandomHorizontalFlip(horizontal_flip_prob), 35 | T.ToTensor(), 36 | ] 37 | if cfg.input.do_normalize: 38 | ts.append(normalize_transform) 39 | transform = T.Compose(ts) 40 | return transform 41 | -------------------------------------------------------------------------------- /deep_learning_template/utils/registry.py: -------------------------------------------------------------------------------- 1 | def _register_generic(module_dict, module_name, module): 2 | assert module_name not in module_dict 3 | module_dict[module_name] = module 4 | 5 | 6 | class Registry(dict): 7 | ''' 8 | A helper class for managing registering modules, it extends a dictionary 9 | and provides a register functions. 10 | 11 | Eg. creeting a registry: 12 | some_registry = Registry({"default": default_module}) 13 | 14 | There're two ways of registering new modules: 15 | 1): normal way is just calling register function: 16 | def foo(): 17 | ... 18 | some_registry.register("foo_module", foo) 19 | 2): used as decorator when declaring the module: 20 | @some_registry.register("foo_module") 21 | @some_registry.register("foo_modeul_nickname") 22 | def foo(): 23 | ... 
24 | 25 | Access of module is just like using a dictionary, eg: 26 | f = some_registry["foo_modeul"] 27 | ''' 28 | def __init__(self, *args, **kwargs): 29 | super(Registry, self).__init__(*args, **kwargs) 30 | 31 | def register(self, module_name, module=None): 32 | # used as function call 33 | if module is not None: 34 | _register_generic(self, module_name, module) 35 | return 36 | 37 | # used as decorator 38 | def register_fn(fn): 39 | _register_generic(self, module_name, fn) 40 | return fn 41 | 42 | return register_fn 43 | -------------------------------------------------------------------------------- /deep_learning_template/data/samplers/image_size_batch_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ImageSizeBatchSampler: 5 | def __init__(self, sampler, batch_size, drop_last, min_size=600, max_size=800, size_int=8): 6 | self.sampler = sampler 7 | self.batch_size = batch_size 8 | self.drop_last = drop_last 9 | self.hmin = min_size 10 | self.hmax = max_size 11 | self.wmin = min_size 12 | self.wmax = max_size 13 | self.size_int = size_int 14 | self.hint = (self.hmax - self.hmin) // self.size_int + 1 15 | self.wint = (self.wmax - self.wmin) // self.size_int + 1 16 | 17 | def generate_height_width(self): 18 | hi, wi = np.random.randint(0, self.hint), np.random.randint(0, self.wint) 19 | h, w = self.hmin + hi * self.size_int, self.wmin + wi * self.size_int 20 | return h, w 21 | 22 | def __iter__(self): 23 | batch = [] 24 | h, w = self.generate_height_width() 25 | for idx in self.sampler: 26 | batch.append((idx, h, w)) 27 | if len(batch) == self.batch_size: 28 | h, w = self.generate_height_width() 29 | yield batch 30 | batch = [] 31 | if len(batch) > 0 and not self.drop_last: 32 | yield batch 33 | 34 | def __len__(self): 35 | if self.drop_last: 36 | return len(self.sampler) // self.batch_size 37 | else: 38 | return (len(self.sampler) + self.batch_size - 1) // self.batch_size 39 | -------------------------------------------------------------------------------- /tools/train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from deep_learning_template.config import cfg 4 | from deep_learning_template.engine.defaults import default_argument_parser, default_setup 5 | from deep_learning_template.engine.launch import launch 6 | from deep_learning_template.trainer.build import build_trainer 7 | from deep_learning_template.utils.comm import get_world_size 8 | 9 | 10 | def setup(args): 11 | """ 12 | Create configs and perform basic setups. 
13 | """ 14 | cfg.merge_from_file(args.config_file) 15 | cfg.merge_from_list(args.opts) 16 | if cfg.output_dir == '': 17 | assert args.config_file.startswith('configs') and args.config_file.endswith('.yaml') 18 | cfg.output_dir = args.config_file[:-5].replace('configs', 'models') 19 | cfg.freeze() 20 | os.makedirs(cfg.output_dir, exist_ok=True) 21 | default_setup(cfg, args) 22 | return cfg 23 | 24 | 25 | def main(): 26 | args = default_argument_parser().parse_args() 27 | print("Command Line Args:", args) 28 | launch( 29 | main_func, 30 | args.num_gpus, 31 | dist_url=args.dist_url, 32 | args=(args,), 33 | ) 34 | 35 | 36 | def main_func(args): 37 | world_size = get_world_size() 38 | distributed = world_size > 1 39 | cfg = setup(args) 40 | trainer = build_trainer(cfg) 41 | if args.resume: 42 | trainer.resume() 43 | if distributed: 44 | trainer.to_distributed() 45 | if args.mode == 'train': 46 | trainer.fit() 47 | elif args.mode == 'findlr': 48 | trainer.to_base() 49 | trainer.find_lr() 50 | else: 51 | raise NotImplementedError() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /deep_learning_template/solver/build.py: -------------------------------------------------------------------------------- 1 | from dl_ext.pytorch_ext import OneCycleScheduler 2 | from torch.optim import SGD, Adam 3 | from .lr_scheduler import WarmupMultiStepLR 4 | 5 | 6 | def make_optimizer(cfg, model): 7 | params = [] 8 | lr = cfg.solver.max_lr 9 | for key, value in model.named_parameters(): 10 | if not value.requires_grad: 11 | continue 12 | weight_decay = cfg.solver.weight_decay 13 | if "bias" in key: 14 | lr = cfg.solver.max_lr * cfg.solver.bias_lr_factor 15 | weight_decay = cfg.solver.weight_decay_bias 16 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 17 | if cfg.solver.optimizer == 'SGD': 18 | optimizer = SGD(params, lr, momentum=cfg.solver.momentum) 19 | elif cfg.solver.optimizer == 'Adam': 20 | optimizer = Adam(params, lr) 21 | else: 22 | raise NotImplementedError() 23 | return optimizer 24 | 25 | 26 | def make_lr_scheduler(cfg, optimizer, max_iter): 27 | if cfg.solver.scheduler == 'WarmupMultiStepLR': 28 | return WarmupMultiStepLR( 29 | optimizer, 30 | cfg.solver.steps, 31 | cfg.solver.gamma, 32 | warmup_factor=cfg.solver.warmup_factor, 33 | warmup_iters=cfg.solver.warmup_iters, 34 | warmup_method=cfg.solver.warmup_method, 35 | ) 36 | elif cfg.solver.scheduler == 'OneCycleScheduler': 37 | return OneCycleScheduler( 38 | optimizer, 39 | cfg.solver.max_lr, 40 | max_iter 41 | ) 42 | elif name == "WarmupCosineLR": 43 | return WarmupCosineLR( 44 | optimizer, 45 | cfg.SOLVER.MAX_ITER, 46 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 47 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 48 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 49 | ) 50 | else: 51 | raise NotImplementedError() 52 | -------------------------------------------------------------------------------- /deep_learning_template/engine/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import sys 4 | 5 | from deep_learning_template.utils.comm import get_rank, get_world_size 6 | from deep_learning_template.utils.logger import setup_logger 7 | from deep_learning_template.utils.miscellaneous import save_config 8 | 9 | 10 | def default_argument_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--config-file", '-c', default="", metavar="FILE", help="path to 
config file") 13 | parser.add_argument("--mode", default="train", choices=['train', 'eval', 'findlr']) 14 | parser.add_argument( 15 | "--resume", 16 | action="store_true", 17 | help="whether to attempt to resume from the checkpoint directory", 18 | ) 19 | parser.add_argument("--num-gpus", '--gpus', type=int, default=1, help="number of gpus *per machine*") 20 | 21 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 22 | parser.add_argument( 23 | "--dist-url", 24 | default="tcp://127.0.0.1:{}".format(port), 25 | help="initialization URL for pytorch distributed backend. See " 26 | "https://pytorch.org/docs/stable/distributed.html for details.", 27 | ) 28 | parser.add_argument( 29 | "opts", 30 | help="Modify config options using the command-line", 31 | default=None, 32 | nargs=argparse.REMAINDER, 33 | ) 34 | return parser 35 | 36 | 37 | def default_setup(cfg, args): 38 | logger = setup_logger("deep_learning_template", cfg.output_dir, get_rank()) 39 | world_size = get_world_size() 40 | logger.info("Using {} GPUs".format(world_size)) 41 | logger.info(args) 42 | logger.info("Loaded configuration file {}".format(args.config_file)) 43 | logger.info("\n" + open(args.config_file, "r").read()) 44 | logger.info("Running with config:\n{}".format(cfg)) 45 | output_config_path = os.path.join(cfg.output_dir, 'config.yml') 46 | logger.info("Saving config into: {}".format(output_config_path)) 47 | save_config(cfg, output_config_path) 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import glob 3 | import os 4 | 5 | import torch 6 | from setuptools import find_packages 7 | from setuptools import setup 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | requirements = ["torch", "torchvision"] 13 | 14 | 15 | def get_extensions(): 16 | this_dir = os.path.dirname(os.path.abspath(__file__)) 17 | extensions_dir = os.path.join(this_dir, "deep_learning_template", "csrc") 18 | 19 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 20 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 21 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 22 | 23 | sources = main_file + source_cpu 24 | extension = CppExtension 25 | 26 | extra_compile_args = {"cxx": []} 27 | define_macros = [] 28 | 29 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 30 | extension = CUDAExtension 31 | sources += source_cuda 32 | define_macros += [("WITH_CUDA", None)] 33 | extra_compile_args["nvcc"] = [ 34 | "-DCUDA_HAS_FP16=1", 35 | "-D__CUDA_NO_HALF_OPERATORS__", 36 | "-D__CUDA_NO_HALF_CONVERSIONS__", 37 | "-D__CUDA_NO_HALF2_OPERATORS__", 38 | ] 39 | 40 | sources = [os.path.join(extensions_dir, s) for s in sources] 41 | 42 | include_dirs = [extensions_dir] 43 | 44 | ext_modules = [ 45 | extension( 46 | "deep_learning_template._C", 47 | sources, 48 | include_dirs=include_dirs, 49 | define_macros=define_macros, 50 | extra_compile_args=extra_compile_args, 51 | ) 52 | ] 53 | 54 | return ext_modules 55 | 56 | 57 | setup( 58 | name="deep_learning_template", 59 | version="0.1", 60 | author="chenlinghao", 61 | packages=find_packages(exclude=("configs", "tests",)), 62 | ext_modules=get_extensions(), 63 | cmdclass={"build_ext": 
torch.utils.cpp_extension.BuildExtension}, 64 | ) 65 | -------------------------------------------------------------------------------- /deep_learning_template/data/build.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | from torch.utils.data.dataset import ConcatDataset 3 | 4 | from deep_learning_template.utils.imports import import_file 5 | from . import datasets as D 6 | from .collators.build import make_batch_collator 7 | from .transforms import build_transforms 8 | 9 | 10 | def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True, ds_len=-1): 11 | if not isinstance(dataset_list, (list, tuple)): 12 | dataset_list = [dataset_list] 13 | datasets = [] 14 | for dataset_name in dataset_list: 15 | data = dataset_catalog.get(dataset_name) 16 | factory = getattr(D, data["factory"]) 17 | args = data["args"] 18 | args['ds_len'] = ds_len 19 | args["transforms"] = transforms 20 | dataset = factory(**args) 21 | datasets.append(dataset) 22 | dataset = datasets[0] 23 | if is_train and len(datasets) > 1: 24 | dataset = ConcatDataset(datasets) 25 | return [dataset] 26 | 27 | 28 | def make_data_loader(cfg, is_train=True): 29 | if is_train: 30 | batch_size = cfg.solver.batch_size 31 | shuffle = cfg.input.shuffle 32 | else: 33 | batch_size = cfg.test.batch_size 34 | shuffle = False 35 | 36 | paths_catalog = import_file( 37 | "deep_learning_template.config.paths_catalog", cfg.paths_catalog, True 38 | ) 39 | DatasetCatalog = paths_catalog.DatasetCatalog 40 | dataset_list = cfg.datasets.train if is_train else cfg.datasets.test 41 | 42 | transforms = build_transforms(cfg, is_train) 43 | datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) 44 | 45 | data_loaders = [] 46 | for dataset in datasets: 47 | collator = make_batch_collator(cfg) 48 | num_workers = cfg.dataloader.num_workers 49 | data_loader = torch.utils.data.DataLoader( 50 | dataset, 51 | batch_size=batch_size, 52 | shuffle=shuffle, 53 | num_workers=num_workers, 54 | collate_fn=collator, 55 | ) 56 | data_loaders.append(data_loader) 57 | assert len(data_loaders) == 1 58 | return data_loaders[0] 59 | -------------------------------------------------------------------------------- /deep_learning_template/config/paths_catalog.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class DatasetCatalog(object): 5 | default_data_dir = os.path.expanduser('~/Datasets') 6 | DATA_DIR = os.environ.get('DATASET_HOME', default_data_dir) 7 | DATASETS = { 8 | "MNIST_TRAIN": { 9 | "root": "mnist", 10 | "train": True, 11 | "download": True 12 | }, 13 | "MNIST_TEST": { 14 | "root": "mnist", 15 | "train": False, 16 | "download": True 17 | }, 18 | "CIFAR10_TRAIN": { 19 | "root": "cifar10", 20 | "train": True, 21 | }, 22 | "CIFAR10_TEST": { 23 | "root": "cifar10", 24 | "train": False, 25 | }, 26 | "KITTI_ROI_Z_TRAIN": { 27 | "root": "/home/linghao/Datasets/kitti/object/training/roi_z", 28 | "split": 'train' 29 | }, 30 | "KITTI_ROI_Z_VAL": { 31 | "root": "/home/linghao/Datasets/kitti/object/training/roi_z", 32 | "split": 'val' 33 | }, 34 | } 35 | 36 | @staticmethod 37 | def get(name): 38 | if name in ['MNIST_TRAIN', "MNIST_TEST"]: 39 | attrs = DatasetCatalog.DATASETS[name] 40 | root = os.path.join(DatasetCatalog.DATA_DIR, attrs['root']) 41 | attrs['root'] = root 42 | return dict( 43 | factory='MNIST', 44 | args=attrs 45 | ) 46 | if name in ['CIFAR10_TRAIN', 'CIFAR10_TEST']: 47 | attrs = 
DatasetCatalog.DATASETS[name] 48 | root = os.path.join(DatasetCatalog.DATA_DIR, attrs['root']) 49 | attrs['root'] = root 50 | return dict( 51 | factory='CIFAR10', 52 | args=attrs 53 | ) 54 | if name in ['KITTI_ROI_Z_TRAIN', 'KITTI_ROI_Z_VAL']: 55 | attrs = DatasetCatalog.DATASETS[name] 56 | root = os.path.join(DatasetCatalog.DATA_DIR, attrs['root']) 57 | attrs['root'] = root 58 | return dict( 59 | factory='ROI_Z_DS', 60 | args=attrs 61 | ) 62 | raise RuntimeError("Dataset not available: {}".format(name)) 63 | -------------------------------------------------------------------------------- /deep_learning_template/data/samplers/ordered_distributed_sampler.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.distributed as dist 4 | from torch.utils.data import Sampler 5 | 6 | 7 | class OrderedDistributedSampler(Sampler): 8 | """Sampler that restricts data loading to a subset of the datasets. 9 | 10 | It is especially useful in conjunction with 11 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 12 | process can pass a DistributedSampler instance as a DataLoader sampler, 13 | and load a subset of the original datasets that is exclusive to it. 14 | 15 | .. note:: 16 | Dataset is assumed to be of constant size. 17 | 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None): 26 | super().__init__(dataset) 27 | if num_replicas is None: 28 | if not dist.is_available(): 29 | raise RuntimeError("Requires distributed package to be available") 30 | num_replicas = dist.get_world_size() 31 | if rank is None: 32 | if not dist.is_available(): 33 | raise RuntimeError("Requires distributed package to be available") 34 | rank = dist.get_rank() 35 | self.dataset = dataset 36 | self.num_replicas = num_replicas 37 | self.rank = rank 38 | self.epoch = 0 39 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 40 | self.total_size = self.num_samples * self.num_replicas 41 | 42 | def __iter__(self): 43 | # deterministically shuffle based on epoch 44 | indices = torch.arange(len(self.dataset)).tolist() 45 | # add extra samples to make it evenly divisible 46 | indices += indices[:(self.total_size - len(indices))] 47 | assert len(indices) == self.total_size 48 | 49 | # subsample 50 | indices = indices[self.rank * self.num_samples:(self.rank + 1) * self.num_samples] 51 | assert len(indices) == self.num_samples 52 | 53 | return iter(indices) 54 | 55 | def __len__(self): 56 | return self.num_samples 57 | 58 | def set_epoch(self, epoch): 59 | self.epoch = epoch 60 | -------------------------------------------------------------------------------- /deep_learning_template/config/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | from yacs.config import CfgNode as CN 3 | 4 | _C = CN() 5 | 6 | _C.model = CN() 7 | _C.model.device = "cuda" 8 | _C.model.meta_architecture = "GeneralizedRCNN" 9 | _C.model.resnet = CN() 10 | _C.model.resnet.num_classes = 1000 11 | _C.model.resnet.pretrained = True 12 | 13 | _C.input = CN() 14 | _C.input.min_size_train = (600,) 15 | _C.input.max_size_train = 2000 16 | _C.input.min_size_test = 600 17 | _C.input.max_size_test = 2000 18 | _C.input.do_normalize = True 19 | 
_C.input.pixel_mean = [0.485, 0.456, 0.406] 20 | _C.input.pixel_std = [0.229, 0.224, 0.225] 21 | _C.input.horizontal_flip_prob = 0.5 22 | _C.input.shuffle = True 23 | _C.input.brightness = 0.0 24 | _C.input.contrast = 0.0 25 | _C.input.saturation = 0.0 26 | _C.input.hue = 0.0 27 | 28 | _C.datasets = CN() 29 | _C.datasets.train = () 30 | _C.datasets.test = "" 31 | 32 | _C.dataloader = CN() 33 | _C.dataloader.num_workers = 0 34 | _C.dataloader.collator = 'DefaultBatchCollator' 35 | 36 | _C.solver = CN() 37 | _C.solver.num_epochs = 1 38 | _C.solver.max_lr = 0.01 39 | _C.solver.bias_lr_factor = 2 40 | _C.solver.momentum = 0.9 41 | _C.solver.weight_decay = 0.0005 42 | _C.solver.weight_decay_bias = 0 43 | _C.solver.gamma = 0.1 44 | _C.solver.steps = (30000,) 45 | _C.solver.warmup_factor = 1.0 / 3 46 | _C.solver.warmup_iters = 500 47 | _C.solver.warmup_method = "linear" 48 | _C.solver.optimizer = 'Adam' 49 | _C.solver.scheduler = 'OneCycleScheduler' 50 | _C.solver.do_grad_clip = False 51 | _C.solver.grad_clip_type = 'norm' # norm or value 52 | _C.solver.grad_clip = 1.0 53 | _C.solver.ds_len = -1 54 | _C.solver.batch_size = 2 55 | _C.solver.loss_function = '' 56 | _C.solver.skip_validation = False 57 | _C.solver.save_every = False 58 | _C.solver.save_freq = 1 59 | _C.solver.save_mode = 'epoch' # epoch or iteration 60 | # save model config: 61 | # save_every: False --->save model when smaller val loss is detected. 62 | # save_every: True, save_mode: epoch --->save model when epoch % save_freq==0 63 | # save_every: True, save_mode: iteration --->save model when epoch % save_freq==0 64 | _C.solver.metric_functions = () 65 | _C.solver.trainer = "base" 66 | _C.solver.load_model = "" 67 | _C.solver.load = "" 68 | 69 | _C.test = CN() 70 | _C.test.batch_size = 2 71 | _C.test.evaluators = [''] 72 | _C.test.visualizer = 'default' 73 | _C.test.force_recompute = True 74 | _C.test.skip_evaluation = False 75 | _C.test.skip_visualization = True 76 | _C.test.eval_all = False 77 | _C.test.aggregate = True 78 | # _C.test.eval_with_target = False 79 | 80 | _C.output_dir = '' 81 | 82 | _C.paths_catalog = os.path.join(os.path.dirname(__file__), "paths_catalog.py") 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Template in PyTorch 2 | 3 | ## Features 4 | 1. Easy distributed training. 5 | ```bash 6 | trainer=Trainer(...) 7 | trainer.to_distributed() 8 | trainer.fit() 9 | ``` 10 | 2. Easy distributed inference. 11 | ```bash 12 | trainer=Trainer(...) 13 | trainer.to_distributed() 14 | trainer.get_preds() 15 | ``` 16 | 3. Learning-rate finder helps you find best learning rate. 17 | ```bash 18 | trainer=Trainer(...) 19 | trainer.find_lr() 20 | ``` 21 | 22 | ![](tests/lr.jpg) 23 | 24 | ### Install 25 | 26 | 1. install PyTorch according to https://pytorch.org/ 27 | 2. pip install -r requirements.txt 28 | 3. sh build_and_install.sh 29 | 30 | ## Training and inference example for MNIST 31 | 1. Find learning-rate. 32 | ```bash 33 | python train_net.py --config-file configs/mnist/defaults.yaml --mode findlr 34 | ``` 35 | 36 | 2. Training. 37 | ```bash 38 | # single gpu 39 | python train_net.py --config-file configs/mnist/defaults.yaml --num-gpus 1 40 | ``` 41 | ```bash 42 | # multi-gpu distributed training. 43 | python train_net.py --config-file configs/mnist/defaults.yaml --num-gpus 4 44 | ``` 45 | 2. Inference and evaluation. 
46 | ```bash 47 | # single gpu 48 | python train_net.py --config-file configs/mnist/defaults.yaml --mode eval --num-gpus 1 49 | ``` 50 | ```bash 51 | # multi-gpu distributed inference. 52 | python train_net.py --config-file configs/mnist/defaults.yaml --mode eval --num-gpus 4 53 | ``` 54 | 55 | ## Extend by your own dataset. 56 | 57 | 1. Configs 58 | 59 | 2. Dataset 60 | 61 | 1. Override torch.utils.dataset.Dataset 62 | 63 | Create a new Python file in deep_learning_template/data/datasets. 64 | 65 | The dataset must accept ds_len and transforms as parameter at least. 66 | 67 | 2. Register 68 | 69 | Add in deep_learning_template/data/datasets/\__init__.py 70 | 71 | 3. Define in PathCatalog:deep_learning_template/config/paths_catalog.py 72 | 73 | Add in **DATASET**. 74 | 75 | Add in **get** method. 76 | 77 | 3. Trainer 78 | 79 | Add a new file in deep_learning_template/trainer. 80 | 81 | Register in deep_learning_template/trainer/build.py 82 | 83 | 4. Loss 84 | 85 | Define in deep_learning_template/loss/build.py 86 | 87 | 5. Metrics 88 | 89 | Add new file in deep_learning_template/metric. 90 | 91 | Register in deep_learning_template/metric/build.py 92 | 93 | 6. Model 94 | 95 | Add new file in deep_learning_template/modeling/models. 96 | 97 | Register in deep_learning_template/modeling/models/models.py 98 | 99 | All models must accept cfg as parameter only. 100 | 101 | 7. Evaluator 102 | 103 | Define in deep_learning_template/evaluators/build.py 104 | 105 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | 4 | import torch 5 | 6 | from deep_learning_template.config import cfg 7 | from deep_learning_template.engine.defaults import default_argument_parser, default_setup 8 | from deep_learning_template.engine.launch import launch 9 | from deep_learning_template.evaluators.build import build_evaluators 10 | from deep_learning_template.trainer.build import build_trainer 11 | from deep_learning_template.utils.comm import get_world_size, get_rank 12 | from deep_learning_template.utils.os_utils import isckpt 13 | from deep_learning_template.visualizers.build import build_visualizer 14 | 15 | 16 | def setup(args): 17 | """ 18 | Create configs and perform basic setups. 
19 | """ 20 | cfg.merge_from_file(args.config_file) 21 | cfg.merge_from_list(args.opts) 22 | if cfg.output_dir == '': 23 | assert args.config_file.startswith('configs') and args.config_file.endswith('.yaml') 24 | cfg.output_dir = args.config_file[:-5].replace('configs', 'models') 25 | cfg.freeze() 26 | os.makedirs(cfg.output_dir, exist_ok=True) 27 | default_setup(cfg, args) 28 | return cfg 29 | 30 | 31 | def main(): 32 | args = default_argument_parser().parse_args() 33 | print("Command Line Args:", args) 34 | launch( 35 | main_func, 36 | args.num_gpus, 37 | dist_url=args.dist_url, 38 | args=(args,), 39 | ) 40 | 41 | 42 | def eval_one_ckpt(trainer): 43 | preds = trainer.get_preds() 44 | if get_rank() == 0: 45 | if not cfg.test.skip_evaluation: 46 | evaluators = build_evaluators(cfg) 47 | for evaluator in evaluators: 48 | evaluator(preds, trainer.valid_dl.dataset) 49 | if not cfg.test.skip_visualization: 50 | visualizer = build_visualizer(cfg) 51 | visualizer(preds, trainer.valid_dl.dataset) 52 | 53 | 54 | def eval_all_ckpts(trainer): 55 | if not cfg.test.skip_evaluation: 56 | evaluators = build_evaluators(cfg) 57 | if not cfg.test.skip_visualization: 58 | visualizer = build_visualizer(cfg) 59 | for fname in sorted(os.listdir(cfg.output_dir)): 60 | if isckpt(fname): 61 | cfg.defrost() 62 | cfg.solver.load = fname[:-4] 63 | cfg.freeze() 64 | trainer.resume() 65 | preds = trainer.get_preds() 66 | if not cfg.test.skip_evaluation: 67 | for evaluator in evaluators: 68 | evaluator(preds, trainer.valid_dl.dataset) 69 | if not cfg.test.skip_visualization: 70 | visualizer(preds, trainer.valid_dl.dataset) 71 | 72 | 73 | def main_func(args): 74 | world_size = get_world_size() 75 | distributed = world_size > 1 76 | cfg = setup(args) 77 | trainer = build_trainer(cfg) 78 | trainer.resume() 79 | if distributed: 80 | trainer.to_distributed() 81 | if cfg.test.eval_all: 82 | eval_all_ckpts(trainer) 83 | else: 84 | eval_one_ckpt(trainer) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /deep_learning_template/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from bisect import bisect_right 2 | 3 | import torch 4 | 5 | 6 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 7 | def __init__( 8 | self, 9 | optimizer, 10 | milestones, 11 | gamma=0.1, 12 | warmup_factor=1.0 / 3, 13 | warmup_iters=500, 14 | warmup_method="linear", 15 | last_epoch=-1, 16 | ): 17 | if not list(milestones) == sorted(milestones): 18 | raise ValueError( 19 | "Milestones should be a list of" " increasing integers. 
Got {}", 20 | milestones, 21 | ) 22 | 23 | if warmup_method not in ("constant", "linear"): 24 | raise ValueError( 25 | "Only 'constant' or 'linear' warmup_method accepted" 26 | "got {}".format(warmup_method) 27 | ) 28 | self.milestones = milestones 29 | self.gamma = gamma 30 | self.warmup_factor = warmup_factor 31 | self.warmup_iters = warmup_iters 32 | self.warmup_method = warmup_method 33 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 34 | 35 | def get_lr(self): 36 | warmup_factor = 1 37 | if self.last_epoch < self.warmup_iters: 38 | if self.warmup_method == "constant": 39 | warmup_factor = self.warmup_factor 40 | elif self.warmup_method == "linear": 41 | alpha = float(self.last_epoch) / self.warmup_iters 42 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 43 | return [ 44 | base_lr 45 | * warmup_factor 46 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 47 | for base_lr in self.base_lrs 48 | ] 49 | 50 | 51 | class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler): 52 | def __init__( 53 | self, 54 | optimizer: torch.optim.Optimizer, 55 | max_iters: int, 56 | warmup_factor: float = 0.001, 57 | warmup_iters: int = 1000, 58 | warmup_method: str = "linear", 59 | last_epoch: int = -1, 60 | ): 61 | self.max_iters = max_iters 62 | self.warmup_factor = warmup_factor 63 | self.warmup_iters = warmup_iters 64 | self.warmup_method = warmup_method 65 | super().__init__(optimizer, last_epoch) 66 | 67 | def get_lr(self) -> List[float]: 68 | warmup_factor = _get_warmup_factor_at_iter( 69 | self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor 70 | ) 71 | # Different definitions of half-cosine with warmup are possible. For 72 | # simplicity we multiply the standard half-cosine schedule by the warmup 73 | # factor. An alternative is to start the period of the cosine at warmup_iters 74 | # instead of at 0. In the case that warmup_iters << max_iters the two are 75 | # very close to each other. 76 | return [ 77 | base_lr 78 | * warmup_factor 79 | * 0.5 80 | * (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters)) 81 | for base_lr in self.base_lrs 82 | ] 83 | 84 | def _compute_values(self) -> List[float]: 85 | # The new interface 86 | return self.get_lr() 87 | -------------------------------------------------------------------------------- /deep_learning_template/engine/launch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import logging 3 | import torch 4 | import torch.distributed as dist 5 | import torch.multiprocessing as mp 6 | 7 | from deep_learning_template.utils import comm 8 | 9 | __all__ = ["launch"] 10 | 11 | 12 | def _find_free_port(): 13 | import socket 14 | 15 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 16 | # Binding to port 0 will cause the OS to find an available port for us 17 | sock.bind(("", 0)) 18 | port = sock.getsockname()[1] 19 | sock.close() 20 | # NOTE: there is still a chance the port could be taken by other processes. 21 | return port 22 | 23 | 24 | def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()): 25 | """ 26 | Args: 27 | main_func: a function that will be called by `main_func(*args)` 28 | num_machines (int): the total number of machines 29 | machine_rank (int): the rank of this machine (one per machine) 30 | dist_url (str): url to connect to for distributed training, including protocol 31 | e.g. "tcp://127.0.0.1:8686". 
32 | Can be set to auto to automatically select a free port on localhost 33 | args (tuple): arguments passed to main_func 34 | """ 35 | world_size = num_machines * num_gpus_per_machine 36 | if world_size > 1: 37 | # https://github.com/pytorch/pytorch/pull/14391 38 | # TODO prctl in spawned processes 39 | 40 | if dist_url == "auto": 41 | assert num_machines == 1, "dist_url=auto cannot work with distributed training." 42 | port = _find_free_port() 43 | dist_url = f"tcp://127.0.0.1:{port}" 44 | 45 | mp.spawn( 46 | _distributed_worker, 47 | nprocs=num_gpus_per_machine, 48 | args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args), 49 | daemon=False, 50 | ) 51 | else: 52 | main_func(*args) 53 | 54 | 55 | def _distributed_worker( 56 | local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args 57 | ): 58 | assert torch.cuda.is_available(), "cuda is not available. Please check your installation." 59 | global_rank = machine_rank * num_gpus_per_machine + local_rank 60 | try: 61 | dist.init_process_group( 62 | backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank 63 | ) 64 | except Exception as e: 65 | logger = logging.getLogger(__name__) 66 | logger.error("Process group URL: {}".format(dist_url)) 67 | raise e 68 | # synchronize is needed here to prevent a possible timeout after calling init_process_group 69 | # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 70 | comm.synchronize() 71 | 72 | assert num_gpus_per_machine <= torch.cuda.device_count() 73 | torch.cuda.set_device(local_rank) 74 | 75 | # Setup the local process group (which contains ranks within the same machine) 76 | assert comm._LOCAL_PROCESS_GROUP is None 77 | num_machines = world_size // num_gpus_per_machine 78 | for i in range(num_machines): 79 | ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) 80 | pg = dist.new_group(ranks_on_i) 81 | if i == machine_rank: 82 | comm._LOCAL_PROCESS_GROUP = pg 83 | 84 | main_func(*args) 85 | -------------------------------------------------------------------------------- /deep_learning_template/data/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torchvision 4 | from torchvision.transforms import functional as F 5 | 6 | 7 | class Compose(object): 8 | def __init__(self, transforms): 9 | self.transforms = transforms 10 | 11 | def __call__(self, image, target=None): 12 | if target is None: 13 | for t in self.transforms: 14 | image = t(image) 15 | return image 16 | else: 17 | for t in self.transforms: 18 | image, target = t(image, target) 19 | return image, target 20 | 21 | def __repr__(self): 22 | format_string = self.__class__.__name__ + "(" 23 | for t in self.transforms: 24 | format_string += "\n" 25 | format_string += " {0}".format(t) 26 | format_string += "\n)" 27 | return format_string 28 | 29 | 30 | class Resize(object): 31 | def __init__(self, min_size, max_size): 32 | if not isinstance(min_size, (list, tuple)): 33 | min_size = (min_size,) 34 | self.min_size = min_size 35 | self.max_size = max_size 36 | 37 | # modified from torchvision to add support for max size 38 | def get_size(self, image_size): 39 | w, h = image_size 40 | size = random.choice(self.min_size) 41 | max_size = self.max_size 42 | if max_size is not None: 43 | min_original_size = float(min((w, h))) 44 | max_original_size = float(max((w, h))) 45 | if max_original_size / min_original_size * size > max_size: 46 | 
size = int(round(max_size * min_original_size / max_original_size)) 47 | 48 | if (w <= h and w == size) or (h <= w and h == size): 49 | return h, w 50 | 51 | if w < h: 52 | ow = size 53 | oh = int(size * h / w) 54 | else: 55 | oh = size 56 | ow = int(size * w / h) 57 | 58 | return oh, ow 59 | 60 | def __call__(self, image, target=None): 61 | size = self.get_size(image.size) 62 | image = F.resize(image, size) 63 | if target is None: 64 | return image 65 | if hasattr(target, 'resize'): 66 | target = target.resize(image.size) 67 | return image, target 68 | 69 | 70 | class RandomHorizontalFlip(object): 71 | def __init__(self, prob=0.5): 72 | self.prob = prob 73 | 74 | def __call__(self, image, target=None): 75 | if random.random() < self.prob: 76 | image = F.hflip(image) 77 | if target is not None and hasattr(target, 'transpose'): 78 | target = target.transpose(0) 79 | if target is None: 80 | return image 81 | return image, target 82 | 83 | 84 | class ColorJitter(object): 85 | def __init__(self, 86 | brightness=None, 87 | contrast=None, 88 | saturation=None, 89 | hue=None, 90 | ): 91 | self.color_jitter = torchvision.transforms.ColorJitter( 92 | brightness=brightness, 93 | contrast=contrast, 94 | saturation=saturation, 95 | hue=hue, ) 96 | 97 | def __call__(self, image, target=None): 98 | image = self.color_jitter(image) 99 | if target is None: 100 | return image 101 | return image, target 102 | 103 | 104 | class ToTensor(object): 105 | def __call__(self, image, target=None): 106 | image = F.to_tensor(image) 107 | if target is None: 108 | return image 109 | return image, target 110 | 111 | 112 | class Normalize(object): 113 | def __init__(self, mean, std): 114 | self.mean = mean 115 | self.std = std 116 | 117 | def __call__(self, image, target=None): 118 | image = F.normalize(image, mean=self.mean, std=self.std) 119 | if target is None: 120 | return image 121 | return image, target 122 | -------------------------------------------------------------------------------- /deep_learning_template/modeling/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torchvision.models.resnet import resnet50 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, in_planes, planes, stride=1): 11 | super(BasicBlock, self).__init__() 12 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 15 | self.bn2 = nn.BatchNorm2d(planes) 16 | 17 | self.shortcut = nn.Sequential() 18 | if stride != 1 or in_planes != self.expansion * planes: 19 | self.shortcut = nn.Sequential( 20 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 21 | nn.BatchNorm2d(self.expansion * planes) 22 | ) 23 | 24 | def forward(self, x): 25 | out = F.relu(self.bn1(self.conv1(x))) 26 | out = self.bn2(self.conv2(out)) 27 | out += self.shortcut(x) 28 | out = F.relu(out) 29 | return out 30 | 31 | 32 | class Bottleneck(nn.Module): 33 | expansion = 4 34 | 35 | def __init__(self, in_planes, planes, stride=1): 36 | super(Bottleneck, self).__init__() 37 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 38 | self.bn1 = nn.BatchNorm2d(planes) 39 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 40 | self.bn2 = 
nn.BatchNorm2d(planes) 41 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 42 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 43 | 44 | self.shortcut = nn.Sequential() 45 | if stride != 1 or in_planes != self.expansion * planes: 46 | self.shortcut = nn.Sequential( 47 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 48 | nn.BatchNorm2d(self.expansion * planes) 49 | ) 50 | 51 | def forward(self, x): 52 | out = F.relu(self.bn1(self.conv1(x))) 53 | out = F.relu(self.bn2(self.conv2(out))) 54 | out = self.bn3(self.conv3(out)) 55 | out += self.shortcut(x) 56 | out = F.relu(out) 57 | return out 58 | 59 | 60 | class ResNet(nn.Module): 61 | def __init__(self, block, num_blocks, num_classes=10): 62 | super(ResNet, self).__init__() 63 | self.in_planes = 64 64 | 65 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 66 | self.bn1 = nn.BatchNorm2d(64) 67 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 68 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 69 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 70 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 71 | self.linear = nn.Linear(512 * block.expansion, num_classes) 72 | 73 | def _make_layer(self, block, planes, num_blocks, stride): 74 | strides = [stride] + [1] * (num_blocks - 1) 75 | layers = [] 76 | for stride in strides: 77 | layers.append(block(self.in_planes, planes, stride)) 78 | self.in_planes = planes * block.expansion 79 | return nn.Sequential(*layers) 80 | 81 | def forward(self, x): 82 | out = F.relu(self.bn1(self.conv1(x))) 83 | out = self.layer1(out) 84 | out = self.layer2(out) 85 | out = self.layer3(out) 86 | out = self.layer4(out) 87 | out = F.avg_pool2d(out, 4) 88 | out = out.view(out.size(0), -1) 89 | out = self.linear(out) 90 | return out 91 | 92 | 93 | def ResNet18(cfg): 94 | return ResNet(BasicBlock, [2, 2, 2, 2]) 95 | 96 | 97 | def ResNet34(cfg): 98 | return ResNet(BasicBlock, [3, 4, 6, 3]) 99 | 100 | 101 | def ResNet50(cfg): 102 | resnet = resnet50(cfg.model.resnet.pretrained) 103 | if cfg.model.resnet.num_classes != resnet.fc.out_features: 104 | resnet.fc = nn.Linear(resnet.fc.in_features, cfg.model.resnet.num_classes) 105 | return resnet 106 | 107 | 108 | def ResNet101(cfg): 109 | return ResNet(Bottleneck, [3, 4, 23, 3]) 110 | 111 | 112 | def ResNet152(cfg): 113 | return ResNet(Bottleneck, [3, 8, 36, 3]) 114 | -------------------------------------------------------------------------------- /deep_learning_template/data/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import warnings 4 | 5 | import torch 6 | from PIL import Image 7 | from torchvision.datasets import VisionDataset 8 | from torchvision.datasets.mnist import read_image_file, read_label_file 9 | from torchvision.datasets.utils import download_and_extract_archive 10 | 11 | 12 | class MNIST(VisionDataset): 13 | urls = [ 14 | 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 15 | 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 16 | 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 17 | 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 18 | ] 19 | training_file = 'training.pt' 20 | test_file = 'test.pt' 21 | classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', 22 | '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine'] 
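    # Usage sketch (illustrative only; the root path below is hypothetical):
    #
    #     ds = MNIST(root='./data', train=True, download=True)
    #     img, target = ds[0]   # PIL Image in mode 'L', int class index
    #     print(len(ds), ds.classes[target])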
23 | 24 | @property 25 | def train_labels(self): 26 | warnings.warn("train_labels has been renamed targets") 27 | return self.targets 28 | 29 | @property 30 | def test_labels(self): 31 | warnings.warn("test_labels has been renamed targets") 32 | return self.targets 33 | 34 | @property 35 | def train_data(self): 36 | warnings.warn("train_data has been renamed data") 37 | return self.data 38 | 39 | @property 40 | def test_data(self): 41 | warnings.warn("test_data has been renamed data") 42 | return self.data 43 | 44 | def __init__(self, root, train=True, transforms=None, 45 | download=False, ds_len=-1): 46 | super(MNIST, self).__init__(root, transforms=transforms) 47 | self.train = train # training set or test set 48 | self.ds_len = ds_len 49 | if download: 50 | self.download() 51 | 52 | if not self._check_exists(): 53 | raise RuntimeError('Dataset not found.' + 54 | ' You can use download=True to download it') 55 | 56 | if self.train: 57 | data_file = self.training_file 58 | else: 59 | data_file = self.test_file 60 | self.data, self.targets = torch.load(os.path.join(self.processed_folder, data_file)) 61 | 62 | def __getitem__(self, index): 63 | """ 64 | Args: 65 | index (int): Index 66 | 67 | Returns: 68 | tuple: (image, target) where target is index of the target class. 69 | """ 70 | img, target = self.data[index], int(self.targets[index]) 71 | 72 | # doing this so that it is consistent with all other datasets 73 | # to return a PIL Image 74 | img = Image.fromarray(img.numpy(), mode='L') 75 | 76 | if self.transforms is not None: 77 | img, target = self.transforms(img, target) 78 | 79 | return img, target 80 | 81 | def __len__(self): 82 | if self.ds_len < 0: 83 | return len(self.data) 84 | else: 85 | return self.ds_len 86 | 87 | @property 88 | def raw_folder(self): 89 | return os.path.join(self.root, self.__class__.__name__, 'raw') 90 | 91 | @property 92 | def processed_folder(self): 93 | return os.path.join(self.root, self.__class__.__name__, 'processed') 94 | 95 | @property 96 | def class_to_idx(self): 97 | return {_class: i for i, _class in enumerate(self.classes)} 98 | 99 | def _check_exists(self): 100 | return (os.path.exists(os.path.join(self.processed_folder, 101 | self.training_file)) and 102 | os.path.exists(os.path.join(self.processed_folder, 103 | self.test_file))) 104 | 105 | def download(self): 106 | """Download the MNIST data if it doesn't exist in processed_folder already.""" 107 | 108 | if self._check_exists(): 109 | return 110 | 111 | os.makedirs(self.raw_folder, exist_ok=True) 112 | os.makedirs(self.processed_folder, exist_ok=True) 113 | 114 | # download files 115 | for url in self.urls: 116 | filename = url.rpartition('/')[2] 117 | download_and_extract_archive(url, download_root=self.raw_folder, filename=filename) 118 | 119 | # process and save as torch files 120 | print('Processing...') 121 | 122 | training_set = ( 123 | read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')), 124 | read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte')) 125 | ) 126 | test_set = ( 127 | read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')), 128 | read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte')) 129 | ) 130 | with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: 131 | torch.save(training_set, f) 132 | with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: 133 | torch.save(test_set, f) 134 | 135 | print('Done!') 136 | 137 | def extra_repr(self): 138 | return "Split: 
{}".format("Train" if self.train is True else "Test") 139 | -------------------------------------------------------------------------------- /deep_learning_template/csrc/cuda/SigmoidFocalLoss_cuda.cu: -------------------------------------------------------------------------------- 1 | // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu 2 | // Cheng-Yang Fu 3 | // cyfu@cs.unc.edu 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | // TODO make it in a common file 14 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 15 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | 19 | template 20 | __global__ void SigmoidFocalLossForward(const int nthreads, 21 | const T* logits, 22 | const int* targets, 23 | const int num_classes, 24 | const float gamma, 25 | const float alpha, 26 | const int num, 27 | T* losses) { 28 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 29 | 30 | int n = i / num_classes; 31 | int d = i % num_classes; // current class[0~79]; 32 | int t = targets[n]; // target class [1~80]; 33 | 34 | // Decide it is positive or negative case. 35 | T c1 = (t == (d+1)); 36 | T c2 = (t>=0 & t != (d+1)); 37 | 38 | T zn = (1.0 - alpha); 39 | T zp = (alpha); 40 | 41 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 42 | T p = 1. / (1. + expf(-logits[i])); 43 | 44 | // (1-p)**gamma * log(p) where 45 | T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); 46 | 47 | // p**gamma * log(1-p) 48 | T term2 = powf(p, gamma) * 49 | (-1. * logits[i] * (logits[i] >= 0) - 50 | logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); 51 | 52 | losses[i] = 0.0; 53 | losses[i] += -c1 * term1 * zp; 54 | losses[i] += -c2 * term2 * zn; 55 | 56 | } // CUDA_1D_KERNEL_LOOP 57 | } // SigmoidFocalLossForward 58 | 59 | 60 | template 61 | __global__ void SigmoidFocalLossBackward(const int nthreads, 62 | const T* logits, 63 | const int* targets, 64 | const T* d_losses, 65 | const int num_classes, 66 | const float gamma, 67 | const float alpha, 68 | const int num, 69 | T* d_logits) { 70 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 71 | 72 | int n = i / num_classes; 73 | int d = i % num_classes; // current class[0~79]; 74 | int t = targets[n]; // target class [1~80], 0 is background; 75 | 76 | // Decide it is positive or negative case. 77 | T c1 = (t == (d+1)); 78 | T c2 = (t>=0 & t != (d+1)); 79 | 80 | T zn = (1.0 - alpha); 81 | T zp = (alpha); 82 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 83 | T p = 1. / (1. + expf(-logits[i])); 84 | 85 | // (1-p)**g * (1 - p - g*p*log(p) 86 | T term1 = powf((1. - p), gamma) * 87 | (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); 88 | 89 | // (p**g) * (g*(1-p)*log(1-p) - p) 90 | T term2 = powf(p, gamma) * 91 | ((-1. * logits[i] * (logits[i] >= 0) - 92 | logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * 93 | (1. 
- p) * gamma - p); 94 | d_logits[i] = 0.0; 95 | d_logits[i] += -c1 * term1 * zp; 96 | d_logits[i] += -c2 * term2 * zn; 97 | d_logits[i] = d_logits[i] * d_losses[i]; 98 | 99 | } // CUDA_1D_KERNEL_LOOP 100 | } // SigmoidFocalLossBackward 101 | 102 | 103 | at::Tensor SigmoidFocalLoss_forward_cuda( 104 | const at::Tensor& logits, 105 | const at::Tensor& targets, 106 | const int num_classes, 107 | const float gamma, 108 | const float alpha) { 109 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 110 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 111 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 112 | 113 | const int num_samples = logits.size(0); 114 | 115 | auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); 116 | auto losses_size = num_samples * logits.size(1); 117 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 118 | 119 | dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L)); 120 | 121 | dim3 block(512); 122 | 123 | if (losses.numel() == 0) { 124 | THCudaCheck(cudaGetLastError()); 125 | return losses; 126 | } 127 | 128 | AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { 129 | SigmoidFocalLossForward<<>>( 130 | losses_size, 131 | logits.contiguous().data(), 132 | targets.contiguous().data(), 133 | num_classes, 134 | gamma, 135 | alpha, 136 | num_samples, 137 | losses.data()); 138 | }); 139 | THCudaCheck(cudaGetLastError()); 140 | return losses; 141 | } 142 | 143 | 144 | at::Tensor SigmoidFocalLoss_backward_cuda( 145 | const at::Tensor& logits, 146 | const at::Tensor& targets, 147 | const at::Tensor& d_losses, 148 | const int num_classes, 149 | const float gamma, 150 | const float alpha) { 151 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 152 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 153 | AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); 154 | 155 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 156 | 157 | const int num_samples = logits.size(0); 158 | AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); 159 | 160 | auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); 161 | auto d_logits_size = num_samples * logits.size(1); 162 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 163 | 164 | dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L)); 165 | dim3 block(512); 166 | 167 | if (d_logits.numel() == 0) { 168 | THCudaCheck(cudaGetLastError()); 169 | return d_logits; 170 | } 171 | 172 | AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { 173 | SigmoidFocalLossBackward<<>>( 174 | d_logits_size, 175 | logits.contiguous().data(), 176 | targets.contiguous().data(), 177 | d_losses.contiguous().data(), 178 | num_classes, 179 | gamma, 180 | alpha, 181 | num_samples, 182 | d_logits.data()); 183 | }); 184 | 185 | THCudaCheck(cudaGetLastError()); 186 | return d_logits; 187 | } 188 | 189 | -------------------------------------------------------------------------------- /deep_learning_template/utils/comm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | This file contains primitives for multi-gpu communication. 4 | This is useful when doing distributed training. 
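For readers who do not want to trace the CUDA above, here is a dense-tensor PyTorch sketch of the same forward pass (illustration only: this is not the extension that vision.cpp binds, and the function name is made up; targets follow the kernel's convention that 0 is background and class c maps to logits column c - 1):

    import torch

    def sigmoid_focal_loss_reference(logits, targets, gamma=2.0, alpha=0.25):
        # logits: (N, C) float tensor; targets: (N,) int tensor with values in [0, C]
        c = logits.shape[1]
        class_range = torch.arange(1, c + 1, device=logits.device).unsqueeze(0)
        t = targets.unsqueeze(1)
        p = torch.sigmoid(logits)
        pos = (t == class_range).float()               # c1 in the kernel
        neg = ((t >= 0) & (t != class_range)).float()  # c2 in the kernel
        term_pos = (1 - p) ** gamma * torch.log(p.clamp(min=1e-38))
        term_neg = p ** gamma * torch.log((1 - p).clamp(min=1e-38))
        return -alpha * pos * term_pos - (1 - alpha) * neg * term_neg

The clamp plays the role of FLT_MIN in the kernel; the CUDA code additionally rewrites log(1 - sigmoid(x)) in a numerically stable form.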
5 | """ 6 | 7 | import functools 8 | import logging 9 | import numpy as np 10 | import pickle 11 | import torch 12 | import torch.distributed as dist 13 | 14 | _LOCAL_PROCESS_GROUP = None 15 | """ 16 | A torch process group which only includes processes that on the same machine as the current process. 17 | This variable is set when processes are spawned by `launch()` in "engine/launch.py". 18 | """ 19 | 20 | 21 | def get_world_size() -> int: 22 | if not dist.is_available(): 23 | return 1 24 | if not dist.is_initialized(): 25 | return 1 26 | return dist.get_world_size() 27 | 28 | 29 | def get_rank() -> int: 30 | if not dist.is_available(): 31 | return 0 32 | if not dist.is_initialized(): 33 | return 0 34 | return dist.get_rank() 35 | 36 | 37 | def get_local_rank() -> int: 38 | """ 39 | Returns: 40 | The rank of the current process within the local (per-machine) process group. 41 | """ 42 | if not dist.is_available(): 43 | return 0 44 | if not dist.is_initialized(): 45 | return 0 46 | assert _LOCAL_PROCESS_GROUP is not None 47 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 48 | 49 | 50 | def get_local_size() -> int: 51 | """ 52 | Returns: 53 | The size of the per-machine process group, 54 | i.e. the number of processes per machine. 55 | """ 56 | if not dist.is_available(): 57 | return 1 58 | if not dist.is_initialized(): 59 | return 1 60 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 61 | 62 | 63 | def is_main_process() -> bool: 64 | return get_rank() == 0 65 | 66 | 67 | def synchronize(): 68 | """ 69 | Helper function to synchronize (barrier) among all processes when 70 | using distributed training 71 | """ 72 | if not dist.is_available(): 73 | return 74 | if not dist.is_initialized(): 75 | return 76 | world_size = dist.get_world_size() 77 | if world_size == 1: 78 | return 79 | dist.barrier() 80 | 81 | 82 | @functools.lru_cache() 83 | def _get_global_gloo_group(): 84 | """ 85 | Return a process group based on gloo backend, containing all the ranks 86 | The result is cached. 87 | """ 88 | if dist.get_backend() == "nccl": 89 | return dist.new_group(backend="gloo") 90 | else: 91 | return dist.group.WORLD 92 | 93 | 94 | def _serialize_to_tensor(data, group): 95 | backend = dist.get_backend(group) 96 | assert backend in ["gloo", "nccl"] 97 | device = torch.device("cpu" if backend == "gloo" else "cuda") 98 | 99 | buffer = pickle.dumps(data) 100 | if len(buffer) > 1024 ** 3: 101 | logger = logging.getLogger(__name__) 102 | logger.warning( 103 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 104 | get_rank(), len(buffer) / (1024 ** 3), device 105 | ) 106 | ) 107 | storage = torch.ByteStorage.from_buffer(buffer) 108 | tensor = torch.ByteTensor(storage).to(device=device) 109 | return tensor 110 | 111 | 112 | def _pad_to_largest_tensor(tensor, group): 113 | """ 114 | Returns: 115 | list[int]: size of the tensor, on each rank 116 | Tensor: padded tensor that has the max size 117 | """ 118 | world_size = dist.get_world_size(group=group) 119 | assert ( 120 | world_size >= 1 121 | ), "comm.gather/all_gather must be called from ranks within the given group!" 
122 | local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) 123 | size_list = [ 124 | torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) 125 | ] 126 | dist.all_gather(size_list, local_size, group=group) 127 | size_list = [int(size.item()) for size in size_list] 128 | 129 | max_size = max(size_list) 130 | 131 | # we pad the tensor because torch all_gather does not support 132 | # gathering tensors of different shapes 133 | if local_size != max_size: 134 | padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) 135 | tensor = torch.cat((tensor, padding), dim=0) 136 | return size_list, tensor 137 | 138 | 139 | def all_gather(data, group=None): 140 | """ 141 | Run all_gather on arbitrary picklable data (not necessarily tensors). 142 | 143 | Args: 144 | data: any picklable object 145 | group: a torch process group. By default, will use a group which 146 | contains all ranks on gloo backend. 147 | 148 | Returns: 149 | list[data]: list of data gathered from each rank 150 | """ 151 | if get_world_size() == 1: 152 | return [data] 153 | if group is None: 154 | group = _get_global_gloo_group() 155 | if dist.get_world_size(group) == 1: 156 | return [data] 157 | 158 | tensor = _serialize_to_tensor(data, group) 159 | 160 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 161 | max_size = max(size_list) 162 | 163 | # receiving Tensor from all ranks 164 | tensor_list = [ 165 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list 166 | ] 167 | dist.all_gather(tensor_list, tensor, group=group) 168 | 169 | data_list = [] 170 | for size, tensor in zip(size_list, tensor_list): 171 | buffer = tensor.cpu().numpy().tobytes()[:size] 172 | data_list.append(pickle.loads(buffer)) 173 | 174 | return data_list 175 | 176 | 177 | def gather(data, dst=0, group=None): 178 | """ 179 | Run gather on arbitrary picklable data (not necessarily tensors). 180 | 181 | Args: 182 | data: any picklable object 183 | dst (int): destination rank 184 | group: a torch process group. By default, will use a group which 185 | contains all ranks on gloo backend. 186 | 187 | Returns: 188 | list[data]: on dst, a list of data gathered from each rank. Otherwise, 189 | an empty list. 190 | """ 191 | if get_world_size() == 1: 192 | return [data] 193 | if group is None: 194 | group = _get_global_gloo_group() 195 | if dist.get_world_size(group=group) == 1: 196 | return [data] 197 | rank = dist.get_rank(group=group) 198 | 199 | tensor = _serialize_to_tensor(data, group) 200 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 201 | 202 | # receiving Tensor from all ranks 203 | if rank == dst: 204 | max_size = max(size_list) 205 | tensor_list = [ 206 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list 207 | ] 208 | dist.gather(tensor, tensor_list, dst=dst, group=group) 209 | 210 | data_list = [] 211 | for size, tensor in zip(size_list, tensor_list): 212 | buffer = tensor.cpu().numpy().tobytes()[:size] 213 | data_list.append(pickle.loads(buffer)) 214 | return data_list 215 | else: 216 | dist.gather(tensor, [], dst=dst, group=group) 217 | return [] 218 | 219 | 220 | def shared_random_seed(): 221 | """ 222 | Returns: 223 | int: a random number that is the same across all workers. 224 | If workers need a shared RNG, they can use this shared seed to 225 | create one. 226 | 227 | All workers must call this function, otherwise it will deadlock. 
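    For instance, every worker might seed its RNGs from the shared value
    (a sketch; assumes this module is imported as `comm`)::

        seed = comm.shared_random_seed()
        torch.manual_seed(seed)
        np.random.seed(seed)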
228 | """ 229 | ints = np.random.randint(2 ** 31) 230 | all_ints = all_gather(ints) 231 | return all_ints[0] 232 | 233 | 234 | def reduce_dict(input_dict, average=True): 235 | """ 236 | Reduce the values in the dictionary from all processes so that process with rank 237 | 0 has the reduced results. 238 | 239 | Args: 240 | input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. 241 | average (bool): whether to do average or sum 242 | 243 | Returns: 244 | a dict with the same keys as input_dict, after reduction. 245 | """ 246 | world_size = get_world_size() 247 | if world_size < 2: 248 | return input_dict 249 | with torch.no_grad(): 250 | names = [] 251 | values = [] 252 | # sort the keys so that they are consistent across processes 253 | for k in sorted(input_dict.keys()): 254 | names.append(k) 255 | values.append(input_dict[k]) 256 | values = torch.stack(values, dim=0) 257 | dist.reduce(values, dst=0) 258 | if dist.get_rank() == 0 and average: 259 | # only main process gets accumulated, so only divide by 260 | # world_size in this case 261 | values /= world_size 262 | reduced_dict = {k: v for k, v in zip(names, values)} 263 | return reduced_dict 264 | -------------------------------------------------------------------------------- /deep_learning_template/trainer/base.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import logging 3 | import math 4 | import os 5 | import sys 6 | import time 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from torch.nn.utils import clip_grad_norm_, clip_grad_value_ 11 | 12 | from dl_ext.average_meter import AverageMeter 13 | from dl_ext.pytorch_ext.dist import * 14 | from dl_ext.pytorch_ext.optim import OneCycleScheduler, LRFinder 15 | 16 | from matplotlib import axes, figure 17 | from tensorboardX import SummaryWriter 18 | from termcolor import colored 19 | from torch import nn 20 | from torch.nn import DataParallel 21 | from torch.nn.parallel import DistributedDataParallel 22 | from torch.utils.data import DataLoader, DistributedSampler 23 | from tqdm import tqdm 24 | 25 | from deep_learning_template.data.samplers.ordered_distributed_sampler import OrderedDistributedSampler 26 | from .utils import * 27 | from ..data import make_data_loader 28 | from ..loss.build import build_loss_function 29 | from ..metric.build import build_metric_functions 30 | from ..modeling.models import build_model 31 | from ..solver.build import make_optimizer, make_lr_scheduler 32 | from ..utils.tb_utils import get_summary_writer 33 | 34 | 35 | class BaseTrainer: 36 | 37 | def __init__(self, cfg): 38 | self.model: nn.Module = build_model(cfg).to(torch.device(cfg.model.device)) 39 | self.loss_function = build_loss_function(cfg) 40 | self.train_dl = make_data_loader(cfg, is_train=True) 41 | self.valid_dl = make_data_loader(cfg, is_train=False) 42 | self.output_dir = cfg.output_dir 43 | self.num_epochs = cfg.solver.num_epochs 44 | self.begin_epoch = 0 45 | self.max_lr = cfg.solver.max_lr 46 | self.skip_validation = cfg.solver.skip_validation 47 | self.save_every = cfg.solver.save_every 48 | if self.skip_validation and not self.save_every: 49 | raise RuntimeError('model must be save every [epoch/iteration] when validation is skipped.') 50 | self.save_mode = cfg.solver.save_mode 51 | self.save_freq = cfg.solver.save_freq 52 | self.optimizer = make_optimizer(cfg, self.model) 53 | self.scheduler = make_lr_scheduler(cfg, self.optimizer, 54 | cfg.solver.num_epochs * 
len(self.train_dl)) 55 | self.metric_functions = build_metric_functions(cfg) 56 | self.cfg = cfg 57 | self._tb_writer = None 58 | self.state = TrainerState.BASE 59 | self.global_steps = 0 60 | self.best_val_loss = 100000 61 | self.val_loss = 100000 62 | self.logger = self._setup_logger() 63 | 64 | def train(self, epoch): 65 | loss_meter = AverageMeter() 66 | metric_ams = {} 67 | for metric in self.metric_functions.keys(): 68 | metric_ams[metric] = AverageMeter() 69 | self.model.train() 70 | bar = tqdm(self.train_dl, leave=False) if is_main_process() else self.train_dl 71 | begin = time.time() 72 | for batch in bar: 73 | self.optimizer.zero_grad() 74 | x, y = batch_gpu(batch) 75 | output = self.model(x) 76 | loss = self.loss_function(output, y) 77 | loss = loss.mean() 78 | loss.backward() 79 | if self.cfg.solver.do_grad_clip: 80 | if self.cfg.solver.grad_clip_type == 'norm': 81 | clip_grad_norm_(self.model.parameters(), self.cfg.solver.grad_clip) 82 | else: 83 | clip_grad_value_(self.model.parameters(), self.cfg.solver.grad_clip) 84 | self.optimizer.step() 85 | if self.scheduler is not None and isinstance(self.scheduler, OneCycleScheduler): 86 | self.scheduler.step() 87 | # record and plot loss and metrics 88 | reduced_loss = reduce_loss(loss) 89 | metrics = {} 90 | for metric, f in self.metric_functions.items(): 91 | s = f(output, y).mean() 92 | reduced_s = reduce_loss(s) 93 | metrics[metric] = reduced_s 94 | if is_main_process(): 95 | loss_meter.update(reduced_loss.item()) 96 | lr = self.optimizer.param_groups[0]['lr'] 97 | self.tb_writer.add_scalar('train/loss', reduced_loss.item(), self.global_steps) 98 | self.tb_writer.add_scalar('train/lr', lr, self.global_steps) 99 | bar_vals = {'epoch': epoch, 'phase': 'train', 'loss': loss_meter.avg, 'lr': lr} 100 | for k, v in metrics.items(): 101 | metric_ams[k].update(v.item()) 102 | self.tb_writer.add_scalar(f'train/{k}', v.item(), self.global_steps) 103 | bar_vals[k] = metric_ams[k].avg 104 | bar.set_postfix(bar_vals) 105 | self.global_steps += 1 106 | if self.global_steps % self.save_freq == 0: 107 | self.try_to_save(epoch, 'iteration') 108 | torch.cuda.synchronize() 109 | epoch_time = format_time(time.time() - begin) 110 | if is_main_process(): 111 | metric_msgs = ['epoch %d, train, loss %.4f, time %s' % ( 112 | epoch, loss_meter.avg, epoch_time)] 113 | for metric, v in metric_ams.items(): 114 | metric_msgs.append('%s %.4f' % (metric, v.avg)) 115 | s = ', '.join(metric_msgs) 116 | self.logger.info(s) 117 | if self.scheduler is not None and not isinstance(self.scheduler, OneCycleScheduler): 118 | self.scheduler.step() 119 | 120 | @torch.no_grad() 121 | def val(self, epoch): 122 | loss_meter = AverageMeter() 123 | metric_ams = {} 124 | for metric in self.metric_functions.keys(): 125 | metric_ams[metric] = AverageMeter() 126 | self.model.eval() 127 | bar = tqdm(self.valid_dl, leave=False) if is_main_process() else self.valid_dl 128 | begin = time.time() 129 | for batch in bar: 130 | x, y = batch_gpu(batch) 131 | output = self.model(x) 132 | loss = self.loss_function(output, y) 133 | loss = loss.mean() 134 | reduced_loss = reduce_loss(loss) 135 | metrics = {} 136 | for metric, f in self.metric_functions.items(): 137 | s = f(output, y).mean() 138 | reduced_s = reduce_loss(s) 139 | metrics[metric] = reduced_s 140 | if is_main_process(): 141 | loss_meter.update(reduced_loss.item()) 142 | bar_vals = {'epoch': epoch, 'phase': 'val', 'loss': loss_meter.avg} 143 | for k, v in metrics.items(): 144 | metric_ams[k].update(v.item()) 145 | bar_vals[k] = 
metric_ams[k].avg 146 | bar.set_postfix(bar_vals) 147 | torch.cuda.synchronize() 148 | epoch_time = format_time(time.time() - begin) 149 | if is_main_process(): 150 | metric_msgs = ['epoch %d, val, loss %.4f, time %s' % ( 151 | epoch, loss_meter.avg, epoch_time)] 152 | for metric, v in metric_ams.items(): 153 | metric_msgs.append('%s %.4f' % (metric, v.avg)) 154 | s = ', '.join(metric_msgs) 155 | self.logger.info(s) 156 | self.tb_writer.add_scalar('val/loss', loss_meter.avg, epoch) 157 | for metric, s in metric_ams.items(): 158 | self.tb_writer.add_scalar(f'val/{metric}', s.avg, epoch) 159 | return loss_meter.avg 160 | 161 | def fit(self): 162 | os.makedirs(self.output_dir, exist_ok=True) 163 | num_epochs = self.num_epochs 164 | begin = time.time() 165 | for epoch in range(self.begin_epoch, num_epochs): 166 | self.train(epoch) 167 | synchronize() 168 | if not self.skip_validation: 169 | self.val_loss = self.val(epoch) 170 | synchronize() 171 | self.try_to_save(epoch, 'epoch') 172 | # if is_main_process(): 173 | # if self.save_every and ( 174 | # self.save_mode == 'epoch' and epoch % self.cfg.solver.save_freq == 0 or epoch == num_epochs - 1): 175 | # self.save(epoch) 176 | # elif val_loss < self.best_val_loss: 177 | # self.logger.info( 178 | # colored('Better model found at epoch %d with val_loss %.4f.' % (epoch, val_loss), 'red')) 179 | # self.best_val_loss = val_loss 180 | # self.save(epoch) 181 | synchronize() 182 | if is_main_process(): 183 | self.logger.info('Training finished. Total time %s' % (format_time(time.time() - begin))) 184 | 185 | @torch.no_grad() 186 | def get_preds(self): 187 | prediction_path = osp.join(self.cfg.output_dir, 'inference', self.cfg.datasets.test, 'predictions.pth') 188 | if not self.cfg.test.force_recompute and osp.exists(prediction_path): 189 | self.logger.info(colored(f'predictions found at {prediction_path}, skip recomputing.', 'red')) 190 | outputs = torch.load(prediction_path) 191 | else: 192 | if get_world_size() > 1: 193 | outputs = self.get_preds_dist() 194 | else: 195 | self.model.eval() 196 | ordered_valid_dl = DataLoader(self.valid_dl.dataset, self.valid_dl.batch_size, shuffle=False, 197 | sampler=None, num_workers=self.valid_dl.num_workers, 198 | collate_fn=self.valid_dl.collate_fn, pin_memory=self.valid_dl.pin_memory, 199 | timeout=self.valid_dl.timeout, 200 | worker_init_fn=self.valid_dl.worker_init_fn) 201 | bar = tqdm(ordered_valid_dl) 202 | outputs = [] 203 | for batch in bar: 204 | x, y = batch_gpu(batch) 205 | output = self.model(x) 206 | output = to_cpu(output) 207 | outputs.append(output) 208 | outputs = torch.cat(outputs) 209 | os.makedirs(osp.dirname(prediction_path), exist_ok=True) 210 | torch.save(outputs, prediction_path) 211 | return outputs 212 | 213 | @torch.no_grad() 214 | def get_preds_dist(self): 215 | self.model.eval() 216 | valid_sampler = OrderedDistributedSampler(self.valid_dl.dataset, get_world_size(), rank=get_rank()) 217 | ordered_dist_valid_dl = DataLoader(self.valid_dl.dataset, self.valid_dl.batch_size, shuffle=False, 218 | sampler=valid_sampler, num_workers=self.valid_dl.num_workers, 219 | collate_fn=self.valid_dl.collate_fn, pin_memory=self.valid_dl.pin_memory, 220 | timeout=self.valid_dl.timeout, 221 | worker_init_fn=self.valid_dl.worker_init_fn) 222 | bar = tqdm(ordered_dist_valid_dl) if is_main_process() else ordered_dist_valid_dl 223 | outputs = [] 224 | for batch in bar: 225 | x, y = batch_gpu(batch) 226 | output = self.model(x) 227 | output = to_cpu(output) 228 | outputs.append(output) 229 | outputs = 
torch.cat(outputs)
230 |         all_outputs = all_gather(outputs)
231 |         if not is_main_process():
232 |             return
233 |         all_outputs = torch.cat(all_outputs, dim=0).cpu()[:len(self.valid_dl.dataset)]
234 |         return all_outputs
235 | 
236 |     def to_base(self):
237 |         if self.state == TrainerState.BASE:
238 |             return
239 |         elif self.state == TrainerState.PARALLEL:
240 |             self.model = self.model.module
241 |             if isinstance(self.scheduler, OneCycleScheduler):
242 |                 world_size = get_world_size()
243 |                 self.scheduler.total_steps *= world_size
244 |                 self.scheduler.step_size_up *= world_size
245 |                 self.scheduler.step_size_down *= world_size
246 |         else:
247 |             self.model = self.model.module
248 |             self.train_dl = self.old_train_dl
249 |             self.valid_dl = self.old_valid_dl
250 |             if isinstance(self.scheduler, OneCycleScheduler):
251 |                 world_size = get_world_size()
252 |                 self.scheduler.total_steps *= world_size
253 |                 self.scheduler.step_size_up *= world_size
254 |                 self.scheduler.step_size_down *= world_size
255 | 
256 |     def to_parallel(self):
257 |         assert self.state == TrainerState.BASE
258 |         devices = os.environ['CUDA_VISIBLE_DEVICES']
259 |         print('visible devices', devices)
260 |         self.model = DataParallel(self.model)
261 |         if isinstance(self.scheduler, OneCycleScheduler):
262 |             world_size = get_world_size()
263 |             self.scheduler.total_steps //= world_size
264 |             self.scheduler.step_size_up //= world_size
265 |             self.scheduler.step_size_down //= world_size
266 | 
267 |     def to_distributed(self, convert_sync_batchnorm=True):
268 |         assert dist.is_available() and dist.is_initialized()
269 |         local_rank = dist.get_rank()
270 |         if convert_sync_batchnorm:
271 |             self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model)
272 |         self.model = DistributedDataParallel(self.model, [local_rank],
273 |                                              output_device=local_rank,
274 |                                              broadcast_buffers=False)
275 |         self.old_train_dl = self.train_dl
276 |         train_sampler = DistributedSampler(self.train_dl.dataset, shuffle=True)
277 |         new_train_dl = DataLoader(self.train_dl.dataset, self.train_dl.batch_size, shuffle=False,
278 |                                   sampler=train_sampler, num_workers=self.train_dl.num_workers,
279 |                                   collate_fn=self.train_dl.collate_fn, pin_memory=self.train_dl.pin_memory,
280 |                                   timeout=self.train_dl.timeout, worker_init_fn=self.train_dl.worker_init_fn)
281 |         self.train_dl = new_train_dl
282 |         self.old_valid_dl = self.valid_dl
283 |         valid_sampler = DistributedSampler(self.valid_dl.dataset, shuffle=False)
284 |         new_valid_dl = DataLoader(self.valid_dl.dataset, self.valid_dl.batch_size, shuffle=False,
285 |                                   sampler=valid_sampler, num_workers=self.valid_dl.num_workers,
286 |                                   collate_fn=self.valid_dl.collate_fn, pin_memory=self.valid_dl.pin_memory,
287 |                                   timeout=self.valid_dl.timeout, worker_init_fn=self.valid_dl.worker_init_fn)
288 |         self.valid_dl = new_valid_dl
289 |         if isinstance(self.scheduler, OneCycleScheduler):
290 |             world_size = get_world_size()
291 |             self.scheduler.total_steps //= world_size
292 |             self.scheduler.step_size_up //= world_size
293 |             self.scheduler.step_size_down //= world_size
294 | 
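    # Why the integer division above: OneCycleScheduler counts optimizer
    # steps per process. With hypothetical numbers, 10 epochs x 400 batches
    # = 4000 total steps shrink to 4000 // 4 = 1000 steps per process on 4
    # GPUs, since DistributedSampler hands each rank a quarter of the
    # batches; to_base() multiplies the counts back out for the same reason.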
295 |     def find_lr(self, start_lr: float = 1e-7, end_lr: float = 10,
296 |                 num_it: int = 100, stop_div: bool = True,
297 |                 skip_start: int = 10, skip_end: int = 5, suggestion: bool = True):
298 |         assert self.state == TrainerState.BASE
299 |         self.old_scheduler = self.scheduler
300 |         self.scheduler = LRFinder(self.optimizer, start_lr, end_lr, num_it, stop_div)
301 |         loss_meter = AverageMeter()
302 |         self.model.train()
303 | 
304 |         it = 0
305 |         lrs, smooth_losses = [], []
306 |         best_loss = 10000
307 |         for epoch in range(math.ceil(num_it / len(self.train_dl))):
308 |             bar = tqdm(self.train_dl, leave=False)
309 |             for batch in bar:
310 |                 if it > num_it: break
311 |                 self.optimizer.zero_grad()
312 |                 x, y = batch_gpu(batch)
313 |                 output = self.model(x)
314 |                 loss = self.loss_function(output, y)
315 |                 loss = loss.mean()
316 |                 if (loss > 40 * best_loss or torch.isnan(loss).sum() != 0) and stop_div:
317 |                     print('loss diverged, stop.')
318 |                     break
319 |                 loss.backward()
320 |                 self.optimizer.step()
321 |                 self.scheduler.step()
322 |                 # record and plot loss and metrics
323 |                 loss_meter.update(loss.item())
324 |                 best_loss = min(loss.item(), best_loss)
325 |                 lr = self.optimizer.param_groups[0]['lr']
326 |                 lrs.append(lr)
327 |                 smooth_losses.append(loss_meter.avg)
328 |                 bar_vals = {'it': it, 'phase': 'train', 'loss': loss_meter.avg, 'lr': lr}
329 |                 bar.set_postfix(bar_vals)
330 |                 it += 1
331 |         lrs = split_list(lrs, skip_start, skip_end)
332 |         losses = split_list(smooth_losses, skip_start, skip_end)
333 |         # losses = [x() for x in losses]
334 |         fig, ax = plt.subplots(1, 1)
335 |         ax.plot(lrs, losses)
336 |         ax.set_ylabel("Loss")
337 |         ax.set_xlabel("Learning Rate")
338 |         ax.set_xscale('log')
339 |         ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e'))
340 |         if suggestion:
341 |             try:
342 |                 mg = (np.gradient(np.array(losses))).argmin()
343 |             except Exception:
344 |                 print("Failed to compute the gradients, there might not be enough points.")
345 |                 return
346 |             print(f"Min numerical gradient: {lrs[mg]:.2E}")
347 |             ax.plot(lrs[mg], losses[mg], markersize=10, marker='o', color='red')
348 |             ml = np.argmin(losses)
349 |             print(f"Min loss divided by 10: {lrs[ml] / 10:.2E}")
350 |         fig: figure.Figure
351 |         ax: axes.Axes
352 |         fig.savefig(os.path.join(self.output_dir, 'lr.jpg'))
353 |         # reset scheduler
354 |         self.scheduler = self.old_scheduler
355 | 
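    # Usage sketch (hypothetical values; assumes a trainer built from a cfg
    # as in tools/train_net.py). find_lr() sweeps the lr over ~num_it
    # batches, restores the original scheduler, and writes the loss-vs-lr
    # curve to <output_dir>/lr.jpg (compare tests/lr.jpg):
    #
    #     trainer = BaseTrainer(cfg)
    #     trainer.find_lr(start_lr=1e-6, end_lr=1.0, num_it=200)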
356 |     def save(self, epoch):
357 |         if self.save_mode == 'epoch':
358 |             name = os.path.join(self.output_dir, 'model_epoch_%06d.pth' % epoch)
359 |         else:
360 |             name = os.path.join(self.output_dir, 'model_iteration_%06d.pth' % self.global_steps)
361 |         net_sd = self.model.module.state_dict() if hasattr(self.model, 'module') else self.model.state_dict()
362 |         d = {'model': net_sd,
363 |              'optimizer': self.optimizer.state_dict(),
364 |              'scheduler': self.scheduler.state_dict(),
365 |              'epoch': epoch,
366 |              'best_val_loss': self.best_val_loss,
367 |              'global_steps': self.global_steps}
368 |         torch.save(d, name)
369 | 
370 |     def load(self, name):
371 |         name = os.path.join(self.output_dir, name + '.pth')
372 |         d = torch.load(name, 'cpu')
373 |         net_sd = d['model']
374 |         if hasattr(self.model, 'module'):
375 |             self.model.module.load_state_dict(net_sd)
376 |         else:
377 |             self.model.load_state_dict(net_sd)
378 |         self.optimizer.load_state_dict(d['optimizer'])
379 |         self.scheduler.load_state_dict(d['scheduler'])
380 |         self.begin_epoch = d['epoch']
381 |         self.best_val_loss = d['best_val_loss']
382 |         if 'global_steps' in d:  # compat
383 |             self.global_steps = d['global_steps']
384 | 
385 |     def load_model(self, name):
386 |         d = torch.load(name, 'cpu')
387 |         if hasattr(self.model, 'module'):
388 |             self.model.module.load_state_dict(d)
389 |         else:
390 |             self.model.load_state_dict(d)
391 | 
392 |     def _setup_logger(self):
393 |         logger = logging.getLogger(self.__class__.__name__)
394 |         logger.setLevel(logging.DEBUG)
395 |         # don't log results for the non-master process
396 |         if get_rank() > 0:
397 |             return logger
398 |         ch = logging.StreamHandler(stream=sys.stdout)
399 |         ch.setLevel(logging.DEBUG)
400 |         formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
401 |         ch.setFormatter(formatter)
402 |         logger.addHandler(ch)
403 | 
404 |         fh = logging.FileHandler(os.path.join(self.output_dir, 'log.txt'))
405 |         fh.setLevel(logging.DEBUG)
406 |         fh.setFormatter(formatter)
407 |         logger.addHandler(fh)
408 |         return logger
409 | 
410 |     @property
411 |     def tb_writer(self):
412 |         if self._tb_writer is None and is_main_process():
413 |             self._tb_writer = get_summary_writer(self.output_dir, flush_secs=20)
414 |         return self._tb_writer
415 | 
416 |     def resume(self):
417 |         if self.cfg.solver.load == '' and self.cfg.solver.load_model == '':
418 |             self.logger.warning('trying to resume without loading anything!')
419 |         if self.cfg.solver.load_model != '':
420 |             self.logger.info(colored('loading model from %s' % self.cfg.solver.load_model, 'red'))
421 |             self.load_model(self.cfg.solver.load_model)
422 |         if self.cfg.solver.load != '':
423 |             self.logger.info(colored('loading checkpoint from %s' % self.cfg.solver.load, 'red'))
424 |             self.load(self.cfg.solver.load)
425 | 
426 |     def try_to_save(self, epoch, flag):
427 |         if not is_main_process():
428 |             return
429 |         if self.skip_validation:
430 |             # validation is not performed, must save every epoch/iteration
431 |             assert self.save_every
432 |             if flag == self.save_mode:
433 |                 self.save(epoch)
434 |         else:
435 |             # validation is performed, can save according to val_loss or every epoch
436 |             if self.save_every:
437 |                 # save every x epochs
438 |                 assert self.save_mode == 'epoch'
439 |                 if epoch % self.save_freq == 0 or epoch == self.num_epochs - 1:
440 |                     self.save(epoch)
441 |             if self.val_loss < self.best_val_loss:
442 |                 self.logger.info(
443 |                     colored('Better model found at epoch'
444 |                             ' %d with val_loss %.4f.' % (epoch, self.val_loss), 'red'))
445 |                 self.best_val_loss = self.val_loss
446 |                 self.save(epoch)
447 | 
--------------------------------------------------------------------------------
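Taken together, a typical entry point wires these pieces up roughly as below. This is a hedged sketch, not the actual tools/train_net.py; the argument names are made up, and cfg is assumed to be the yacs-style node exported by deep_learning_template.config:

    import argparse

    import torch.distributed as dist

    from deep_learning_template.config import cfg
    from deep_learning_template.engine.launch import launch
    from deep_learning_template.trainer.base import BaseTrainer


    def main(cfg):
        trainer = BaseTrainer(cfg)
        # inside mp.spawn workers the process group is already initialized
        if dist.is_available() and dist.is_initialized():
            trainer.to_distributed()
        trainer.resume()
        trainer.fit()


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('--config-file', default='configs/mnist/default.yaml')
        parser.add_argument('--num-gpus', type=int, default=1)
        args = parser.parse_args()
        cfg.merge_from_file(args.config_file)  # yacs CfgNode API
        launch(main, num_gpus_per_machine=args.num_gpus, dist_url='auto', args=(cfg,))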