├── src ├── __init__.py ├── data │ ├── __init_.py │ └── in_memory_dataset.py ├── models │ ├── __init__.py │ └── lightning_modules.py ├── log.py ├── callbacks.py ├── train.py └── inference.py ├── requirements-dev.txt ├── requirements.txt ├── dockerfiles ├── cuda118 │ └── Dockerfile └── cuda120 │ └── Dockerfile ├── templates └── rtx_6000_ada.sh ├── .github └── workflows │ ├── linter.yml │ └── docker_push.yaml ├── LICENSE ├── .pre-commit-config.yaml ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/__init_.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit==3.* 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lightning==2.2.5 2 | protobuf==3.20.* 3 | segmentation-models-pytorch==0.3.3 4 | six==1.16.0 5 | torch==2.3.1 6 | torchvision==0.18.1 7 | -------------------------------------------------------------------------------- /dockerfiles/cuda118/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 4 | python3 \ 5 | python3-pip \ 6 | curl && \ 7 | apt clean && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | COPY requirements.txt /tmp/requirements.txt 11 | RUN pip3 install --no-cache-dir 
def setup_custom_logger(name: str = "benchmark"):
    """
    Return the logger called `name`, configured for timestamped console output.

    The logger level is set to DEBUG and records are formatted as
    "<asctime> - <message>".

    A stream handler is attached only when the logger has no handler yet.
    `logging.getLogger` returns the same object for the same name, so without
    this guard every additional call would add another handler and each record
    would be printed once per call.

    Args:
        name: Name of the logger to configure.

    Returns:
        The configured `logging.Logger` instance.
    """
    logger = logging.getLogger(name)

    if not logger.handlers:
        sh = logging.StreamHandler()
        sh.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        logger.addHandler(sh)

    logger.setLevel(level=logging.DEBUG)

    return logger


def print_requirements():
    """Log every entry produced by pip's `freeze` operation, one entry per record."""
    for pkg in freeze.freeze():
        logger.info(pkg)


# Shared module-level logger; other modules obtain it via `log.logger`
# or `logging.getLogger("benchmark")`.
logger = setup_custom_logger()
class InMemoryDataset(Dataset):
    """
    Synthetic dataset that fabricates a new random image on every lookup.

    No samples are stored: `__len__` reports the configured `dataset_size` and
    `__getitem__` draws a fresh tensor each time it is called.
    """

    def __init__(
        self,
        width: int = 224,
        height: int = 224,
        n_channels: int = 3,
        dataset_size: int = int(1e7),
    ):
        super().__init__()
        self.width, self.height = width, height
        self.n_channels = n_channels
        self.dataset_size = dataset_size

    def __len__(self):
        # Nominal number of samples; nothing is actually materialised.
        return self.dataset_size

    def __getitem__(self, idx: int) -> torch.Tensor:
        """
        Produce one random float32 sample laid out as C x H x W.

        Values come from `torch.rand`, i.e. they lie in the [0, 1) range.
        `idx` is not used: every index yields a freshly drawn tensor.
        """
        sample_shape = (self.n_channels, self.height, self.width)
        return torch.rand(sample_shape, dtype=torch.float32)
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | default_language_version: 4 | python: python3 5 | 6 | repos: 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.13.2 9 | hooks: 10 | - id: isort 11 | name: Format imports 12 | args: ["--profile", "black"] 13 | 14 | - repo: https://github.com/psf/black 15 | rev: 24.1.1 16 | hooks: 17 | - id: black 18 | name: black 19 | entry: black 20 | types: [python] 21 | 22 | - repo: https://github.com/pre-commit/pre-commit-hooks 23 | rev: v4.5.0 24 | hooks: 25 | - id: check-yaml 26 | - id: end-of-file-fixer 27 | - id: check-case-conflict 28 | - id: check-docstring-first 29 | - id: check-executables-have-shebangs 30 | - id: check-added-large-files 31 | args: ["--maxkb=350", "--enforce-all"] 32 | - id: detect-private-key 33 | - id: requirements-txt-fixer 34 | - id: mixed-line-ending 35 | - id: check-merge-conflict 36 | 37 | - repo: https://github.com/asottile/pyupgrade 38 | rev: v3.15.0 39 | hooks: 40 | - id: pyupgrade 41 | args: [--py38-plus] 42 | name: Upgrade code 43 | 44 | - repo: https://github.com/PyCQA/flake8 45 | rev: 7.0.0 46 | hooks: 47 | - id: flake8 48 | types: [python] 49 | args: ["--max-line-length=120", "--ignore=E203,W503"] 50 | -------------------------------------------------------------------------------- /.github/workflows/docker_push.yaml: -------------------------------------------------------------------------------- 1 | name: Create and push cuda118 + cuda120 docker images to 
this repo's packages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | workflow_dispatch: {} 8 | 9 | env: 10 | REGISTRY: ghcr.io 11 | IMAGE_NAME: ${{ github.repository }} 12 | 13 | jobs: 14 | build-and-push-cuda-images: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | contents: read 18 | packages: write 19 | steps: 20 | - 21 | name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - 25 | name: Log in to the Container registry 26 | uses: docker/login-action@v3 27 | with: 28 | registry: ${{ env.REGISTRY }} 29 | username: ${{ github.actor }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | #CUDA118 steps 32 | - 33 | name: Extract cuda118 image metadata 34 | id: meta_118 35 | uses: docker/metadata-action@v3 36 | with: 37 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 38 | tags: cuda118 39 | 40 | - 41 | name: Build and push cuda118 image 42 | uses: docker/build-push-action@v5 43 | with: 44 | context: . 45 | file: dockerfiles/cuda118/Dockerfile 46 | push: true 47 | tags: ${{ steps.meta_118.outputs.tags }} 48 | labels: ${{ steps.meta_118.outputs.labels }} 49 | #CUDA120 steps 50 | - 51 | name: Extract cuda120 image metadata 52 | id: meta_120 53 | uses: docker/metadata-action@v3 54 | with: 55 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 56 | tags: | 57 | type=raw,value=cuda120 58 | type=raw,value=latest 59 | - 60 | name: Build and push cuda120 image 61 | uses: docker/build-push-action@v5 62 | with: 63 | context: . 
64 | file: dockerfiles/cuda120/Dockerfile 65 | push: true 66 | tags: ${{ steps.meta_120.outputs.tags }} 67 | labels: ${{ steps.meta_120.outputs.labels }} 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | lightning_logs/ 2 | 3 | *.csv 4 | benchmarks/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 
94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
class BenchmarkCallback(Callback):
    """
    Lightning callback that times the post-warmup part of `fit` and reports throughput.

    The timer starts when the batch with index `warmup_steps` begins and stops in
    `on_fit_end`. Throughput is logged as megapixels/s, images/s and batches/s,
    and a row with the run configuration and results is appended to
    ./benchmarks/benchmark.csv.

    Args:
        model_name: Name of the benchmarked architecture (written to the CSV).
        precision: Precision setting of the run (written to the CSV).
        workers: Number of data loader workers (written to the CSV).
        warmup_steps: Number of initial training batches excluded from timing.
    """

    # Column names of the CSV file; the order must match the row built in `on_fit_end`.
    _CSV_HEADER = [
        "Datetime",
        "GPU",
        "cuDNN version",
        "N GPUs",
        "Data Loader workers",
        "Model",
        "Precision",
        "Minibatch",
        "Input width [px]",
        "Input height [px]",
        "Warmup steps",
        "Benchmark steps",
        "MPx/s",
        "images/s",
        "batches/s",
    ]

    def __init__(
        self,
        model_name: str,
        precision: str,
        workers: int,
        warmup_steps: int = 50,
    ):
        super().__init__()
        self.warmup_steps = warmup_steps
        self.start_time = 0
        self.end_time = 0
        self.precision = precision
        self.model = model_name
        self.workers = workers

    @staticmethod
    def _wait_for_gpu():
        """Block until queued CUDA work has finished so timestamps reflect completed GPU work."""
        # CUDA kernels are launched asynchronously; without a synchronisation point
        # a wall-clock timestamp may be taken while work is still pending.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def on_fit_start(self, trainer, pl_module):
        """Announce the start of the benchmark and the configured warmup length."""
        logger.info(
            f"Benchmark started. Number of warmup iterations: {self.warmup_steps}"
        )

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int):
        """Start the benchmark timer once `warmup_steps` batches have been processed."""
        if batch_idx == self.warmup_steps:
            logger.info(
                f"Completed {self.warmup_steps} warmup steps. Benchmark timer started."
            )
            # Make sure pending warmup work is not billed to the benchmark window.
            self._wait_for_gpu()
            self.start_time = time.time()

    def on_fit_end(self, trainer, pl_module):
        """Stop the timer, log the measured throughput and append it to the CSV file."""
        # Wait for the last benchmark iterations to actually finish before stopping the clock.
        self._wait_for_gpu()
        self.end_time = time.time()
        logger.info("Fit function finished")

        dataset = trainer.train_dataloader.dataset
        batch_size = trainer.train_dataloader.batch_size
        in_w, in_h = dataset.width, dataset.height

        benchmark_steps = trainer.global_step - self.warmup_steps
        processed_megapixels = (
            trainer.world_size * in_w * in_h * batch_size * benchmark_steps / 1e6
        )

        elapsed_time = (
            self.end_time - self.start_time
        ) + 1e-7  # for numerical stability
        mpx_s = processed_megapixels / elapsed_time

        processed_imgs = batch_size * benchmark_steps * trainer.world_size
        images_s = processed_imgs / elapsed_time

        batches_s = benchmark_steps * trainer.world_size / elapsed_time

        logger.info(f"Benchmark finished in {elapsed_time:.1f} seconds")
        logger.info(
            f"Average training throughput: {mpx_s:.2f} MPx/s (megapixels per second) | "
            + f"{images_s:.2f} images/s | {batches_s:.2f} batches/s"
        )

        data = [
            datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            torch.cuda.get_device_name(0),
            torch.backends.cudnn.version(),
            trainer.world_size,
            self.workers,
            self.model,
            self.precision,
            batch_size,
            in_w,
            in_h,
            self.warmup_steps,
            benchmark_steps,
            mpx_s,
            images_s,
            batches_s,
        ]

        os.makedirs("./benchmarks", exist_ok=True)
        csv_path = os.path.join("./benchmarks", "benchmark.csv")
        self._append_csv_row(csv_path, data)

        logger.info(
            "Written benchmark data to a CSV file. "
            + "See 'Logging Results to a Persisent CSV File' section to "
            + "save the file on your disk: "
            + "https://github.com/tensorpix/benchmarking-cv-models#logging-results-to-a-persistent-csv-file"
        )

        # Best effort: make the file readable/writable for the owner and for others
        # (rw-r--rw-). A failure is only logged and never aborts the run.
        try:
            os.chmod(
                csv_path,
                stat.S_IRUSR
                | stat.S_IRGRP
                | stat.S_IWUSR
                | stat.S_IROTH
                | stat.S_IWOTH,
            )
        except Exception as e:
            logger.error(f"Failed to change csv permissions: {e}")

    def _append_csv_row(self, csv_path: str, row: list):
        """Append `row` to `csv_path`, writing the header first if the file is missing or empty."""
        # The size check must be strict: `>= 0` holds for every existing file, which
        # would leave an existing but empty file without a header row.
        has_content = os.path.isfile(csv_path) and os.stat(csv_path).st_size > 0
        # newline="" lets the csv module control line endings itself.
        with open(csv_path, "a", newline="") as file:
            writer = csv.writer(file)
            if not has_content:
                writer.writerow(self._CSV_HEADER)
            writer.writerow(row)
def main(args):
    """
    Run one training benchmark configured by the parsed command-line arguments.

    Builds the synthetic in-memory dataset and its data loader, creates a
    Lightning `Trainer` with a `BenchmarkCallback`, instantiates the requested
    architecture and fits it for `n_iters + warmup_steps` batches.

    Args:
        args: Parsed arguments. The attributes read here are `list_requirements`,
            `width`, `height`, `n_workers`, `batch_size`, `accelerator`,
            `precision`, `n_iters`, `warmup_steps` and `model`.

    Raises:
        ValueError: If `args.model` is not a key of `ARCHITECTURES`.
    """
    if args.list_requirements:
        print_requirements()

    args_dict = vars(args)
    logger.info(f"User Arguments {args_dict}")

    # The height must come from --height; it used to be filled from args.width,
    # which silently forced square inputs and ignored the --height option.
    dataset = InMemoryDataset(width=args.width, height=args.height)
    data_loader = DataLoader(
        dataset,
        num_workers=args.n_workers,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )

    trainer = Trainer(
        accelerator=args.accelerator,
        strategy="ddp",
        precision=args.precision,
        # Warmup batches are run in addition to the timed iterations.
        limit_train_batches=args.n_iters + args.warmup_steps,
        max_epochs=1,
        logger=False,
        enable_checkpointing=False,
        callbacks=[
            BenchmarkCallback(
                warmup_steps=args.warmup_steps,
                model_name=args.model,
                precision=args.precision,
                workers=args.n_workers,
            )
        ],
        devices=torch.cuda.device_count(),
    )

    if args.model in ARCHITECTURES:
        if args.model == "unet_resnet50":
            # The segmentation model needs its encoder specified explicitly;
            # encoder_weights=None requests no pretrained encoder weights.
            model = ARCHITECTURES[args.model](
                encoder_name="resnet50", encoder_weights=None
            )
        else:
            model = ARCHITECTURES[args.model]()

    else:
        raise ValueError("Architecture not supported.")

    model = LitClassification(model=model)
    trainer.fit(model=model, train_dataloaders=data_loader)
CPU shouldn't be a bottleneck with 4+.", 115 | ) 116 | 117 | parser.add_argument("--width", type=int, default=224, help="Input width") 118 | parser.add_argument("--height", type=int, default=224, help="Input height") 119 | 120 | parser.add_argument( 121 | "--warmup-steps", 122 | type=int, 123 | default=100, 124 | help=( 125 | "Number of training iterations to use for warmup. " 126 | + " The benchmark timer starts after the warmup iterations are finished." 127 | ), 128 | ) 129 | parser.add_argument( 130 | "--accelerator", choices=["gpu"], default="gpu", help="Accelerator to use." 131 | ) 132 | parser.add_argument( 133 | "--model", 134 | default="resnet50", 135 | choices=list(ARCHITECTURES.keys()), 136 | help="Architecture to benchmark.", 137 | ) 138 | parser.add_argument( 139 | "--list-requirements", 140 | action="store_true", 141 | help="Prints all python packages along with their versions.", 142 | ) 143 | 144 | args = parser.parse_args() 145 | 146 | if args.n_iters <= 0: 147 | raise ValueError("Number of iterations must be > 0") 148 | 149 | if args.warmup_steps <= 0: 150 | raise ValueError("Number of warmup steps must be > 0") 151 | 152 | logger.info("########## STARTING NEW BENCHMARK RUN ###########") 153 | 154 | if not torch.cuda.is_available(): 155 | raise ValueError("CUDA device not found on this system.") 156 | else: 157 | logger.info(f"CUDA Device Name: {torch.cuda.get_device_name(0)}") 158 | logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") 159 | logger.info( 160 | f"CUDA Device Total Memory: {(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} GB" 161 | ) 162 | 163 | main(args=args) 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Tensorpix logo 3 |

4 | 5 | --- 6 | 7 | # Benchmarking CV models 8 | 9 | Docker image for simple training benchmark of popular computer vision models. 10 | 11 | The benchmark code explicitly focuses on benchmarking only the **pure training loop code**. The dataset is 12 | generated on the fly and directly in RAM with minimal overhead. 13 | 14 | There is no extra work done in the training loop such as data preprocessing, model saving, validation, logging... 15 | 16 | We use [Lightning AI](https://lightning.ai/) library for benchmarks as it's a popular tool among deep learning practitioners. 17 | 18 | It also supports features such as mixed precision, DDP, and multi-GPU training. 19 | Such features can significantly affect benchmark performance so it's important to offer them in benchmarks. 20 | 21 | ## ❓ Why did we create this? 22 | 23 | [Our](https://tensorpix.ai) ML team had a dilemma while choosing the best GPU for our budget. GPU X was 2x the price of GPU Y, but we couldn't find reliable data that shows if GPU X was also 2x the speed of GPU Y. 24 | 25 | There were [some benchmarks](https://lambdalabs.com/gpu-benchmarks), but very few of them were specific for computer vision tasks and even fewer for the GPUs we wanted to test. So we created a docker image that does this with minimal setup. 26 | 27 | You can use this benchmark repo to: 28 | 29 | - See how various GPUs perform on various deep CV architectures 30 | - Benchmark various CV architectures 31 | - See how efficient are multi-GPU setups for a specific GPU 32 | - Test how much you gain in training speed when using Mixed-precision 33 | - Stress test the GPU(s) at near 100% utilization 34 | - Make pizzas (not tested) 35 | 36 | ## 📋 Supported architectures 37 | 38 | Please open an issue if you need support for a new architecture. 
39 | 40 | - ResNet50 41 | - ConvNext (base) 42 | - VGG16 43 | - Efficient Net v2 44 | - MobileNet V3 45 | - ResNeXt50 46 | - SWIN 47 | - VIT 48 | - UNet with ResNet50 backbone 49 | 50 | ## 📖 How to benchmark 51 | 52 | ### Prerequisites 53 | 54 | In order to run benchmark docker containers you must have the following installed on the host machine: 55 | 56 | - Docker (we used v24.0.6 for testing) 57 | - NVIDIA drivers. See [Versions](#versions) when choosing the docker image. 58 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - required in order to use CUDA inside docker containers 59 | 60 | ### Training vs Inference 61 | 62 | To benchmark model training, append the `src.train` when running the container. If you want to benchmark model inference, append the `src.inference` to the docker run command. See examples below for more details. 63 | 64 | ### Examples 65 | 66 | **Minimal** 67 | 68 | `docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32` 69 | 70 | **Advanced** 71 | 72 | `docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320` 73 | 74 | **Benchmark Inference** 75 | 76 | `docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.inference --batch-size 32 --n-iters 1000 --model resnext50 --precision 16 --width 256 --height 256` 77 | 78 | **List all train options:** 79 | 80 | `docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.train --help` 81 | 82 | **List all inference options:** 83 | 84 | `docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.inference --help` 85 | 86 | ### How to select particular GPUs 87 | 88 | If you want to use all available 
GPUs, then set the `--gpus all` docker parameter. 89 | 90 | If you want to use, for example, GPUs at indices 2 and 3, set `--gpus '"device=2,3"'`. 91 | 92 | ### Logging results to a persistent CSV file 93 | 94 | Benchmark code will create a CSV file with benchmark results on every run. The file will exist inside the docker container, but you have to mount it in order to see it on the host machine. 95 | 96 | To do so, use the following docker argument when running a container: `-v <host_benchmarks_dir>:/workdir/benchmarks`. See the [advanced example](#examples) for more details. The CSV file will reside in the mounted host directory. 97 | 98 | We also recommend that you create the `<host_benchmarks_dir>` directory on the host before running the container as the container will create the folder under the `root` user if it doesn't exist on the host. 99 | 100 | ### Versions 101 | 102 | We support two docker images: one for CUDA 12.0 and a second one for CUDA 11.8. The `12.0` version is on the latest docker tag, while `11.8` is on the `ghcr.io/tensorpix/benchmarking-cv-models:cuda118` tag. 103 | 104 | `11.8` version supports earlier NVIDIA drivers so if you run into driver-related errors, try this image instead. 105 | 106 | ## 📊 Metrics 107 | 108 | We use 3 metrics for the benchmark: 109 | 110 | - Images per second 111 | - Batches per second 112 | - Megapixels per second 113 | 114 | Images/s and batches/s are self-explanatory. Megapixels/s (MPx) are not usually used but we like this metric as it is independent of the input resolution. 
115 | 116 | It's calculated according to the following formula: `(input_width_px * input_height_px * batch_size * n_gpus * n_iterations) / (elapsed_time_s * 10^6)` 117 | -------------------------------------------------------------------------------- /src/inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torch.utils.benchmark as benchmark 5 | 6 | from src import log 7 | from src.log import print_requirements 8 | 9 | logger = log.logger 10 | 11 | ARCHITECTURES = { 12 | "resnet50": "resnet50", 13 | "convnext": "convnext_base", 14 | "vgg16": "vgg16", 15 | "efficient_net_v2": "efficientnet_v2_m", 16 | "mobilenet_v3": "mobilenet_v3_large", 17 | "resnext50": "resnext50_32x4d", 18 | "swin": "swin_b", 19 | "vit": "vit_b_16", 20 | "ssd_vgg16": "ssd300_vgg16", 21 | "fasterrcnn_resnet50_v2": "fasterrcnn_resnet50_fpn_v2", 22 | } 23 | 24 | 25 | def benchmark_inference( 26 | stmt: str, 27 | setup: str, 28 | input: torch.Tensor, 29 | n_runs: int = 100, 30 | num_threads: int = 1, 31 | ): 32 | """ 33 | Benchmark a model using torch.utils.benchmark. 34 | 35 | When evaluating model throughoutput in MP/s only the image height, width and batch size are taken into 36 | account. The number of channels are ignored as they are fixed to 3 channels in most cases (RGB images). 37 | Speed evaluation measures how fast can we process an arbitrary input image so channels 38 | don't affect the model computation speed. 39 | """ 40 | 41 | timer = benchmark.Timer( 42 | stmt=stmt, 43 | setup=setup, 44 | num_threads=num_threads, 45 | globals={"x": input}, 46 | ) 47 | 48 | logger.info( 49 | f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..." 
def main(args):
    """
    Run one inference benchmark configured by the parsed command-line arguments.

    Creates a random input batch on the selected GPU, assembles the setup and
    timed statements executed by `torch.utils.benchmark`, and hands them to
    `benchmark_inference`.

    Args:
        args: Parsed arguments. The attributes read here are `list_requirements`,
            `model`, `batch_size`, `height`, `width`, `precision`,
            `gpu_device_index`, `n_iters` and `n_workers`.

    Raises:
        ValueError: If `args.model` is not a key of `ARCHITECTURES`.
    """
    if args.list_requirements:
        print_requirements()

    model_key = args.model.lower()
    if model_key not in ARCHITECTURES:
        raise ValueError("Architecture not supported.")

    # Timed statement. Classification models return a tensor, which is converted
    # and copied to the host exactly as before. Detection models return a list of
    # dicts instead, for which `.float()` does not exist, so that step is skipped.
    stmt = (
        "with torch.inference_mode():\n"
        "    out = model(x)\n"
        "    if isinstance(out, torch.Tensor):\n"
        "        out = out.float().cpu()\n"
    )

    arch = ARCHITECTURES[model_key]
    # Build the model through torchvision's registry instead of
    # `from torchvision.models import <arch>`: the detection builders
    # (ssd300_vgg16, fasterrcnn_resnet50_fpn_v2) are defined in
    # `torchvision.models.detection` and cannot be imported from the top-level
    # package, while `get_model` resolves every registered builder by name.
    setup = f"from torchvision.models import get_model; model = get_model('{arch}'); model.eval()"

    input_shape = [args.batch_size, 3, args.height, args.width]
    precision = torch.float16 if args.precision == "16" else torch.float32

    x = torch.rand(*input_shape, dtype=precision)
    x = x.cuda(args.gpu_device_index, non_blocking=True)
    setup = f"{setup}; model.cuda({args.gpu_device_index})"

    # Keep the model weights in the same dtype as the input batch.
    if args.precision == "16":
        setup = f"{setup}; model.half()"

    benchmark_inference(
        stmt=stmt,
        setup=setup,
        input=x,
        n_runs=args.n_iters,
        num_threads=args.n_workers,
    )