├── tests ├── __init__.py ├── converter_tests │ ├── __init__.py │ └── test_getitem.py ├── feature_tests │ ├── __init__.py │ ├── test_version_utils.py │ ├── test_contiguous.py │ ├── test_tensor_ne.py │ ├── test_flatten_module.py │ ├── test_tensor_shape_div_batch.py │ ├── test_save_load.py │ ├── test_legacy_max_batch_size.py │ ├── test_interpolate_dynamic.py │ ├── test_flatten_dynamic.py │ ├── test_dynamic_shape.py │ ├── test_dataset_calibrator.py │ ├── test_flattener.py │ ├── test_dataset.py │ └── test_tensor_shape.py └── model_tests │ ├── __init__.py │ ├── timm │ ├── __init__.py │ └── test_maxvit.py │ └── torchvision │ ├── __init__.py │ ├── test_segmentation_models.py │ └── test_classification_models.py ├── docs ├── CHANGELOG.md ├── CONTRIBUTING.md ├── css │ └── version-select.css ├── images │ └── check.svg ├── index.md ├── getting_started.md ├── usage │ ├── basic_usage.md │ ├── custom_converter.md │ └── reduced_precision.md ├── js │ └── version-select.js ├── benchmarks │ ├── jetson_nano.md │ └── jetson_xavier.md └── see_also.md ├── torch2trt ├── contrib │ ├── __init__.py │ └── qat │ │ ├── __init__.py │ │ ├── converters │ │ ├── __init__.py │ │ ├── QuantRelu.py │ │ ├── QuantConv.py │ │ └── QuantConvBN.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── README.md │ │ ├── quant_activation.py │ │ └── _utils.py │ │ └── README.md ├── converters │ ├── __init__.py │ ├── unimplemented_converters.py │ └── plugin_converters.py ├── test.py ├── __init__.py ├── version_utils.py ├── flatten_module.py ├── plugins │ └── plugins.cpp ├── dataset_calibrator.py ├── misc_utils.py ├── utils.py ├── flattener.py ├── trt_module.py └── dataset.py ├── examples ├── contrib │ ├── quantization_aware_training │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── models.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── pytorch_nvidia_quantization.patch │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── cifar10.py │ │ ├── __init__.py │ │ ├── setup.py │ │ ├── parser.py │ │ ├── README.md │ │ ├── infer.py │ │ └── train.py │ └── pre_py3.7 │ │ └── fix-getitem.patch ├── easyocr │ ├── download_images.sh │ ├── README.md │ ├── generate_data.py │ ├── optimize_detector.py │ ├── run_end2end.py │ └── optimize_recognizer.py ├── image_classification │ ├── conversion.ipynb │ └── live_demo.ipynb └── image_segmentation │ └── conversion.ipynb ├── CLA.pdf ├── scripts ├── test_docs.sh ├── release_test_docs.sh ├── push_docs.sh ├── release_build_docs.sh ├── build_docs.sh ├── release_push_docs.sh ├── build_pre_py3.7.sh ├── build_contrib.sh ├── dump_converters.py └── profile_timm_models.sh ├── requirements ├── requirements_8.txt └── requirements_10.txt ├── docker ├── 21-06 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── 21-09 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── l4t-35.1.0 │ ├── Dockerfile │ ├── build.sh │ └── run.sh └── 21-08 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── plugins └── src │ ├── tests.cpp │ ├── example_plugin.h │ ├── reflection_pad_2d_plugin.h │ └── reflection_pad_2d_plugin_test.cpp ├── .gitignore ├── LICENSE.md ├── mkdocs.yml ├── CMakeLists.txt ├── CONTRIBUTORS.md ├── setup.py ├── benchmarks ├── JETSON_NANO.md └── JETSON_XAVIER.md ├── test.sh ├── CHANGELOG.md └── CONTRIBUTING.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md 
-------------------------------------------------------------------------------- /tests/converter_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/feature_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /tests/model_tests/timm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model_tests/torchvision/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /torch2trt/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | from .qat import * 2 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/torch2trt/HEAD/CLA.pdf -------------------------------------------------------------------------------- /scripts/test_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdocs serve --dev-addr=0.0.0.0:8000 -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/__init__.py: -------------------------------------------------------------------------------- 1 | from .layers import * 2 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/__init__.py: -------------------------------------------------------------------------------- 1 | from .converters import * 2 | from .layers import * 3 | -------------------------------------------------------------------------------- /requirements/requirements_8.txt: -------------------------------------------------------------------------------- 1 | tensorrt==8.6.1 2 | torch 3 | torchvision 4 | timm 5 | onnx_graphsurgeon -------------------------------------------------------------------------------- /requirements/requirements_10.txt: 
-------------------------------------------------------------------------------- 1 | tensorrt==10.0.1 2 | torch 3 | torchvision 4 | timm 5 | onnx_graphsurgeon -------------------------------------------------------------------------------- /docker/21-06/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-06 -f $(pwd)/docker/21-06/Dockerfile . -------------------------------------------------------------------------------- /docker/21-06/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it --rm -v $(pwd):/torch2trt torch2trt:21-06 -------------------------------------------------------------------------------- /docker/21-09/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-09 -f $(pwd)/docker/21-09/Dockerfile . -------------------------------------------------------------------------------- /docker/l4t-35.1.0/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/l4t-pytorch:r35.1.0-pth1.12-py3 2 | 3 | RUN pip install timm -------------------------------------------------------------------------------- /docker/21-08/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-08 -f $(pwd)/docker/21-08/Dockerfile . 4 | -------------------------------------------------------------------------------- /docker/21-09/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it -d --rm -v $(pwd):/torch2trt torch2trt:21-09 -------------------------------------------------------------------------------- /docker/21-08/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it -d --rm -v $(pwd):/torch2trt torch2trt:21-08 5 | -------------------------------------------------------------------------------- /scripts/release_test_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | mike set-default $TAG 6 | mike serve --dev-addr=0.0.0.0:8000 -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from .QuantConv import * 2 | from .QuantConvBN import * 3 | from .QuantRelu import * 4 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .quant_conv import * 2 | from .quant_activation import * 3 | from ._utils import * 4 | -------------------------------------------------------------------------------- /docs/css/version-select.css: -------------------------------------------------------------------------------- 1 | @media only screen and (max-width:76.1875em) { 2 | #version-selector { 3 | padding: .6rem .8rem; 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /torch2trt/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.unimplemented_converters import * 2 | from .plugin_converters import * 3 | from .native_converters import * -------------------------------------------------------------------------------- /scripts/push_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py > docs/converters.md 6 | 7 | mike deploy $TAG --push 8 | -------------------------------------------------------------------------------- /plugins/src/tests.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 2 | #include -------------------------------------------------------------------------------- /scripts/release_build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md 6 | 7 | mike deploy $TAG -------------------------------------------------------------------------------- /scripts/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GITHUB=$1 4 | TAG=$2 5 | 6 | python3 scripts/dump_converters.py --github=$GITHUB --tag=$TAG > docs/converters.md 7 | -------------------------------------------------------------------------------- /torch2trt/test.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | 3 | print("torch2trt.test is no longer supported. Please implement unit tests in the tests directory instead.") -------------------------------------------------------------------------------- /docker/l4t-35.1.0/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=l4t-35.1.0 4 | 5 | docker build -t torch2trt:$VERSION -f $(pwd)/docker/$VERSION/Dockerfile $(pwd)/docker/$VERSION -------------------------------------------------------------------------------- /scripts/release_push_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md 6 | 7 | mike deploy $TAG --push 8 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='qat', 5 | version="1", 6 | packages=find_packages() 7 | ) 8 | 9 | -------------------------------------------------------------------------------- /docs/images/check.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docker/21-06/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.06-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. 
-DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install -------------------------------------------------------------------------------- /docker/21-09/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.09-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. -DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install -------------------------------------------------------------------------------- /docker/l4t-35.1.0/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=l4t-35.1.0 4 | 5 | 6 | docker run \ 7 | --network host \ 8 | --ipc host \ 9 | --gpus all \ 10 | -it \ 11 | -d \ 12 | --rm \ 13 | --name=torch2trt \ 14 | -v $(pwd):/torch2trt \ 15 | torch2trt:$VERSION -------------------------------------------------------------------------------- /docker/21-08/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.08-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. -DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ninja_deps 2 | .ninja_log 3 | build.ninja 4 | tags 5 | *.o 6 | *.pb.o 7 | torch2trt.egg-info 8 | build/ 9 | dist/ 10 | __pycache__/ 11 | *.so 12 | *.pb.h 13 | *.pb.cc 14 | *_pb2.py 15 | *.pyc 16 | *.ipynb_checkpoints 17 | *.pth 18 | docs/converters.md 19 | site 20 | ToJetsonGrp 21 | .vscode 22 | data -------------------------------------------------------------------------------- /scripts/build_pre_py3.7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exu 2 | 3 | PATCH_DIR="examples/contrib/pre_py3.7/" 4 | PATCH_FILES=( 5 | "fix-getitem.patch" 6 | ) 7 | 8 | for patch_file in "${PATCH_FILES[@]}"; do 9 | patch_file="${PATCH_DIR}""${patch_file}" 10 | git apply "${patch_file}" 11 | done 12 | 13 | python3 setup.py install 14 | -------------------------------------------------------------------------------- /tests/feature_tests/test_version_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch2trt.version_utils 4 | 5 | def test_version_utils(): 6 | 7 | a = torch2trt.version_utils.Version("10.1") 8 | 9 | assert a >= "10.1" 10 | assert a >= "10.0" 11 | assert a > "7.0" 12 | assert a < "11.0" 13 | assert a == "10.1" 14 | assert a <= "10.1" 15 | assert a <= "10.2" -------------------------------------------------------------------------------- /torch2trt/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch2trt import * 2 | from .converters import * 3 | import tensorrt as trt 4 | 5 | def load_plugins(): 6 | import torch2trt.torch_plugins 7 | registry = trt.get_plugin_registry() 8 | torch2trt_creators = [c for c in registry.plugin_creator_list if c.plugin_namespace == 'torch2trt'] 9 | for c in torch2trt_creators: 10 | registry.register_creator(c, 'torch2trt') 11 | 12 | try: 13 | load_plugins() 14 | except: 15 | pass 16 | 
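The ``load_plugins()`` helper in ``torch2trt/__init__.py`` above registers any compiled torch2trt plugin creators under the ``torch2trt`` namespace, and the bare ``try/except`` means importing the package still succeeds when the plugins were never built. Below is a minimal sketch (not part of the repository) of how one might check from user code whether that registration took effect, using only the TensorRT APIs that appear in the file above:

```python
# Hypothetical check, not part of the torch2trt source tree: list the plugin
# creators registered under the 'torch2trt' namespace after importing the package.
import tensorrt as trt
import torch2trt  # importing torch2trt runs load_plugins() if the compiled plugins are available

registry = trt.get_plugin_registry()
creator_names = [c.name for c in registry.plugin_creator_list if c.plugin_namespace == 'torch2trt']
print(creator_names or "no torch2trt plugins registered (package likely installed without --plugins)")
```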
-------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # torch2trt 2 | 3 | 4 | 5 | torch2trt is a PyTorch to TensorRT converter which utilizes the 6 | TensorRT Python API. The converter is 7 | 8 | * Easy to use - Convert modules with a single function call ``torch2trt`` 9 | 10 | * Easy to extend - Write your own layer converter in Python and register it with ``@tensorrt_converter`` 11 | 12 | If you find an issue, please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt/issues)! -------------------------------------------------------------------------------- /tests/feature_tests/test_contiguous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch2trt import torch2trt 3 | 4 | 5 | def test_contiguous(): 6 | 7 | torch.manual_seed(0) 8 | 9 | net = torch.nn.Conv2d(3, 10, kernel_size=3) 10 | net.eval().cuda() 11 | 12 | test_tensor = torch.randn((1, 25, 25, 3)).cuda().permute((0, 3, 1, 2)) 13 | 14 | with torch.no_grad(): 15 | test_out = net(test_tensor) 16 | 17 | with torch.no_grad(): 18 | trt_net = torch2trt(net, [test_tensor]) 19 | test_trt_out = trt_net(test_tensor) 20 | 21 | delta = torch.max((test_out.contiguous() - test_trt_out.contiguous()).abs()) 22 | assert delta < 1e-3, f"Delta: {delta}" 23 | 24 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/README.md: -------------------------------------------------------------------------------- 1 | ## Quantization Aware Training 2 | 3 | This contrib folder provides layers and converters for Quantization Aware Training to convert layers into INT8. 4 | 5 | ### Supported Layers 6 | 7 | - Conv2d 8 | - Conv2d + fused BN 9 | - ReLU 10 | 11 | ### Future Support for Layers 12 | 13 | - Pooling layers 14 | - Linear layer 15 | 16 | ### Supported Quantization Techniques 17 | 18 | - per tensor quantization 19 | - symmetric quantization 20 | 21 | ### Future Support for Quantization Techniques 22 | 23 | - per channel quantization 24 | - asymmetric quantization 25 | 26 | ### Working example 27 | 28 | Please see `examples/quantization_aware_training` 29 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_ne.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt import torch2trt, trt 4 | 5 | def test_tensor_ne(): 6 | 7 | class NotEqual(torch.nn.Module): 8 | def __init__(self): 9 | super(NotEqual, self).__init__() 10 | 11 | def forward(self, x, y): 12 | return x != y 13 | 14 | module = NotEqual().cuda().eval() 15 | 16 | x = torch.randn(1, 3, 40, 20).cuda() 17 | y = torch.randn(1, 3, 1, 20).cuda() 18 | 19 | module_trt = torch2trt(module, [x, y], log_level=trt.Logger.VERBOSE) 20 | 21 | assert torch.all(module_trt(x, y) == module(x, y)) 22 | 23 | 24 | if __name__ == "__main__": 25 | test_tensor_ne() -------------------------------------------------------------------------------- /examples/easyocr/download_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p images 4 | 5 | wget https://user-images.githubusercontent.com/4212806/180578035-e39cae5d-db18-4941-98a8-d697a1ba2336.jpg -O images/image_0.jpg 6 | wget https://user-images.githubusercontent.com/4212806/180578037-98d81133-0e05-4bdf-ac2b-9918cacc8e64.jpg
-O images/image_1.jpg 7 | wget https://user-images.githubusercontent.com/4212806/180578039-ce315b8a-6678-4f25-aa8e-e35e4a5e63dc.jpg -O images/image_2.jpg 8 | wget https://user-images.githubusercontent.com/4212806/180578040-f1d34f29-ce3f-4fc8-9e58-1d009df84959.jpg -O images/image_3.jpg 9 | wget https://user-images.githubusercontent.com/4212806/180578041-25919c7b-f520-4782-8351-fc8de9ffd016.jpg -O images/image_4.jpg -------------------------------------------------------------------------------- /tests/feature_tests/test_flatten_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch2trt import torch2trt 4 | 5 | 6 | def test_flatten_nested_tuple_args(): 7 | 8 | class TestModule(nn.Module): 9 | 10 | def forward(self, x, yz): 11 | return torch.cat([x, yz[0], yz[1]], dim=-1) 12 | 13 | module = TestModule().cuda().eval() 14 | 15 | data = ( 16 | torch.randn(1, 3, 32, 32).cuda(), 17 | ( 18 | torch.randn(1, 3, 32, 32).cuda(), 19 | torch.randn(1, 3, 32, 32).cuda() 20 | ) 21 | ) 22 | 23 | module_trt = torch2trt(module, data) 24 | 25 | out = module(*data) 26 | out_trt = module_trt(*data) 27 | 28 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 29 | 30 | -------------------------------------------------------------------------------- /scripts/build_contrib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone https://github.com/NVIDIA/TensorRT.git /tmp/TensorRT/ 4 | 5 | parentdir="$(dirname "$(pwd)")" 6 | patch="examples/contrib/quantization_aware_training/utils/pytorch_nvidia_quantization.patch" 7 | patch_file="$parentdir/$patch" 8 | 9 | pushd /tmp/TensorRT 10 | cp $patch_file . 11 | git checkout e724d31ab84626ca334b4284703b5048eb698c98 ## keeping this for versioning control 12 | git sparse-checkout init --cone 13 | git sparse-checkout set /tools/pytorch-quantization/ 14 | git apply --reject --whitespace=fix pytorch_nvidia_quantization.patch 15 | cd tools/pytorch-quantization/ 16 | python setup.py install 17 | popd 18 | 19 | pushd $parentdir 20 | python3 setup.py install --plugins --contrib 21 | popd 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_shape_div_batch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt import torch2trt, trt 4 | 5 | def test_div_constant_batch(): 6 | 7 | class DivConstantBatch(torch.nn.Module): 8 | def __init__(self): 9 | super(DivConstantBatch, self).__init__() 10 | self.register_buffer('y', torch.ones((1, 3, 10, 10))) 11 | 12 | def forward(self, x): 13 | return x / self.y 14 | 15 | module = DivConstantBatch().cuda().eval() 16 | 17 | x = torch.randn(1, 3, 10, 10).cuda() 18 | 19 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE) 20 | 21 | assert torch.allclose(module_trt(x), module(x), atol=1e-3, rtol=1e-3) 22 | 23 | 24 | if __name__ == "__main__": 25 | test_div_constant_batch() 26 | -------------------------------------------------------------------------------- /tests/feature_tests/test_save_load.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch2trt 3 | import torchvision 4 | import torch 5 | 6 | 7 | def test_save_load(): 8 | model = torch.nn.Conv2d(3, 3, 1).cuda().eval().half() 9 | data = torch.randn((1, 3, 224, 224)).cuda().half() 10 | 11 | 
print('Running torch2trt...') 12 | model_trt = torch2trt.torch2trt(model, [data], fp16_mode=True, max_workspace_size=1<<25) 13 | 14 | print('Saving model...') 15 | torch.save(model_trt.state_dict(), '.test_model.pth') 16 | 17 | print('Loading model...') 18 | model_trt_2 = torch2trt.TRTModule() 19 | model_trt_2.load_state_dict(torch.load('.test_model.pth')) 20 | 21 | assert(model_trt_2.engine is not None) 22 | 23 | print(torch.max(torch.abs(model_trt_2(data) - model(data)))) 24 | print(torch.max(torch.abs(model_trt_2(data) - model_trt(data)))) -------------------------------------------------------------------------------- /tests/model_tests/timm/test_maxvit.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch2trt 3 | from timm.models.maxxvit import ( 4 | maxvit_tiny_rw_224, 5 | maxvit_rmlp_pico_rw_256, 6 | maxvit_rmlp_small_rw_224 7 | ) 8 | import torch 9 | 10 | 11 | def _cross_validate_module(model, shape=(224, 224)): 12 | model = model.cuda() 13 | data = torch.randn(1, 3, *shape).cuda() 14 | model_trt = torch2trt.torch2trt(model, [data]) 15 | out = model(data) 16 | out_trt = model_trt(data) 17 | assert torch.allclose(out, out_trt, rtol=1e-2, atol=1e-2) 18 | 19 | 20 | def test_maxvit_tiny_rw_224(): 21 | _cross_validate_module(maxvit_tiny_rw_224().cuda().eval(), (224, 224)) 22 | 23 | 24 | def test_maxvit_rmlp_small_rw_224(): 25 | _cross_validate_module(maxvit_rmlp_small_rw_224().cuda().eval(), (224, 224)) 26 | 27 | 28 | if __name__ == "__main__": 29 | test_maxvit_tiny_rw_224() -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantRelu.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | import tensorrt as trt 3 | 4 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_activation.IQuantReLU.forward',enabled=trt_version() >= '7.0') 5 | def convert_QuantReLU(ctx): 6 | module = ctx.method_args[0] 7 | input = ctx.method_args[1] 8 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 9 | output = ctx.method_return 10 | layer = ctx.network.add_activation( 11 | input=input_trt, type=trt.ActivationType.RELU) 12 | 13 | ## int 8 precision 14 | if 'qat_mode' in ctx.torch2trt_kwargs: 15 | amax = module._input_quantizer.learned_amax 16 | layer.precision = trt.int8 17 | layer.set_output_type(0,trt.int8) 18 | out = layer.get_output(0) 19 | out.dynamic_range=(-amax,amax) 20 | 21 | output._trt = layer.get_output(0) 22 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/README.md: -------------------------------------------------------------------------------- 1 | ## Layers 2 | 3 | - Every layer has two implementations (Training and Inference). This is required because the quantization-aware layers quantize the weights / activations in the forward pass. 4 | - If we try to convert the layers into a TRT engine (with quantization happening in the forward pass), then a lot of unwanted ops will be present in the final TRT engine, as Torch2TRT converts all the ops into their TRT equivalent layers. 5 | - Therefore, an inference version of the layer is created so that only the learned parameters (zero point / scale) are carried with the layer for converting the layer into a TRT engine. 6 | 7 | ## Quantization Type 8 | 9 | Currently, TRT7 only supports per-tensor symmetric quantization.
Other quantization techniques (such as per-channel and asymmetric quantization) will be supported once newer versions of TensorRT support them. 10 | 11 | ## Working example 12 | 13 | Please refer to `examples/quantization_aware_training/` for a working example. 14 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /torch2trt/version_utils.py: -------------------------------------------------------------------------------- 1 | import packaging.version 2 | import tensorrt as trt 3 | import torch 4 | 5 | 6 | def trt_version(): 7 | return Version(trt.__version__) 8 | 9 | 10 | def torch_version(): 11 | return Version(torch.__version__) 12 | 13 | 14 | class Version(packaging.version.Version): 15 | 16 | def __ge__(self, other): 17 | if isinstance(other, str): 18 | other = Version(other) 19 | return super().__ge__(other) 20 | 21 | def __le__(self, other): 22 | if isinstance(other, str): 23 | other = Version(other) 24 | return super().__le__(other) 25 | 26 | def __eq__(self, other): 27 | if isinstance(other, str): 28 | other = Version(other) 29 | return super().__eq__(other) 30 | 31 | def __gt__(self, other): 32 | if isinstance(other, str): 33 | other = Version(other) 34 | return super().__gt__(other) 35 | 36 | def __lt__(self, other): 37 | if isinstance(other, str): 38 | other = Version(other) 39 | return super().__lt__(other) 40 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/models/models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Contains basic model definitions 3 | ''' 4 | 5 | import torch 6 | import torch.nn as nn 7 | from utils.utilities import qrelu,qconv2d 8 | 9 | class vanilla_cnn(nn.Module): 10 | def __init__(self,qat_mode=False,infer=False): 11 | super().__init__() 12 | self.qat = qat_mode 13 | self.layer1=qconv2d(3,32,padding=1,qat=qat_mode,infer=infer) 14 | self.layer2=qconv2d(32,64,padding=1,qat=qat_mode,infer=infer) 15 | self.layer3=qconv2d(64,128,padding=1,qat=qat_mode,infer=infer) 16 | self.layer4=qconv2d(128,256,padding=1,qat=qat_mode,infer=infer) 17 | self.layer5 = nn.MaxPool2d(kernel_size=2,stride=8) 18 | self.fcs = nn.Sequential( 19 |
nn.Linear(4096,1024), 20 | nn.ReLU(), 21 | nn.Linear(1024,512), 22 | nn.ReLU(), 23 | nn.Linear(512,10)) 24 | 25 | def forward(self,x): 26 | x = self.layer1(x) 27 | x = self.layer2(x) 28 | x = self.layer3(x) 29 | x = self.layer4(x) 30 | x = self.layer5(x) 31 | x = x.view(x.size(0),-1) 32 | x = self.fcs(x) 33 | return x 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Follow these steps to get started using torch2trt. 4 | 5 | !!! note 6 | 7 | torch2trt depends on the TensorRT Python API. On Jetson, this is included with the latest JetPack. For desktop, please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html). You may also try installing torch2trt inside one of the NGC PyTorch docker containers for [Desktop](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or [Jetson](https://ngc.nvidia.com/catalog/containers/nvidia:l4t-pytorch). 8 | 9 | ### Install Without plugins 10 | 11 | To install without compiling plugins, call the following 12 | 13 | ```bash 14 | git clone https://github.com/NVIDIA-AI-IOT/torch2trt 15 | cd torch2trt 16 | python setup.py install 17 | ``` 18 | 19 | ### Install With plugins 20 | 21 | To install with plugins to support some operations in PyTorch that are not natively supported with TensorRT, call the following 22 | 23 | !!! note 24 | 25 | Please note, this currently only includes the interpolate plugin. This plugin requires PyTorch 1.3+ for serialization. 26 | 27 | ```bash 28 | git clone https://github.com/NVIDIA-AI-IOT/torch2trt 29 | cd torch2trt 30 | sudo python setup.py install --plugins 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /torch2trt/converters/unimplemented_converters.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | 3 | 4 | def is_private(method): 5 | method = method.split('.')[-1] # remove prefix 6 | return method[0] == '_' and method[1] != '_' 7 | 8 | def is_function_type(method): 9 | fntype = eval(method + '.__class__.__name__') 10 | return fntype == 'function' or fntype == 'builtin_function_or_method' or fntype == 'method_descriptor' 11 | 12 | def get_methods(namespace): 13 | methods = [] 14 | for method in dir(eval(namespace)): 15 | full_method = namespace + '.' + method 16 | if not is_private(full_method) and is_function_type(full_method): 17 | methods.append(full_method) 18 | return methods 19 | 20 | 21 | TORCH_METHODS = [] 22 | TORCH_METHODS += get_methods('torch') 23 | TORCH_METHODS += get_methods('torch.Tensor') 24 | TORCH_METHODS += get_methods('torch.nn.functional') 25 | 26 | 27 | for method in TORCH_METHODS: 28 | 29 | @tensorrt_converter(method, is_real=False) 30 | def warn_method(ctx): 31 | print('Warning: Encountered known unsupported method %s' % ctx.method_str) 32 | 33 | 34 | @tensorrt_converter('torch.Tensor.dim', is_real=False) 35 | @tensorrt_converter('torch.Tensor.size', is_real=False) 36 | def dont_warn(ctx): 37 | pass 38 | -------------------------------------------------------------------------------- /examples/easyocr/README.md: -------------------------------------------------------------------------------- 1 | # torch2trt EasyOCR Example 2 | 3 | This example uses torch2trt to optimize EasyOCR.
EasyOCR is split into 4 | two TensorRT engines, one for the detector, one for the recognizer. 5 | 6 | To run the example, follow these steps 7 | 8 | 1. Download example images 9 | 10 | ```bash 11 | ./download_images.sh 12 | ``` 13 | 14 | 2. Generate data for shape inference and calibration. By default this script will look in the ``images`` directory. 15 | 16 | ```bash 17 | python3 generate_data.py 18 | ``` 19 | 20 | 3. Optimize the Text Detector. This will use the data from step 2 for shape inference and calibration. It creates a file ``detector_trt.pth``. 21 | 22 | ```bash 23 | python3 optimize_detector.py 24 | ``` 25 | 26 | 4. Optimize the Text Recognizer. This also uses the data generated in step 2. It creates a file ``recognizer_trt.pth``. 27 | 28 | ```bash 29 | python3 optimize_recognizer.py 30 | ``` 31 | 5. Run the pipeline end to end and compare the performance to the original PyTorch model. 32 | 33 | ```bash 34 | python3 run_end2end.py 35 | ``` 36 | 37 | That's it! To use the model in your application, reference these scripts for more details. Specifically, reference 38 | ``run_end2end.py`` to see how to create and execute the full model pipeline. -------------------------------------------------------------------------------- /torch2trt/flatten_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .flattener import Flattener 4 | 5 | 6 | class Unflatten(nn.Module): 7 | 8 | def __init__(self, module, input_flattener=None, output_flattener=None): 9 | super().__init__() 10 | self.module = module 11 | self.input_flattener = input_flattener 12 | self.output_flattener = output_flattener 13 | 14 | def forward(self, *args): 15 | if self.input_flattener is not None: 16 | args = self.input_flattener.flatten(args) 17 | output = self.module(*args) 18 | if self.output_flattener is not None: 19 | output = self.output_flattener.unflatten(output) 20 | return output 21 | 22 | 23 | class Flatten(nn.Module): 24 | 25 | def __init__(self, module, input_flattener=None, output_flattener=None): 26 | super().__init__() 27 | self.module = module 28 | self.input_flattener = input_flattener 29 | self.output_flattener = output_flattener 30 | 31 | def forward(self, *args): 32 | if self.input_flattener is not None: 33 | args = self.input_flattener.unflatten(*args) 34 | output = self.module(*args) 35 | if self.output_flattener is not None: 36 | output = self.output_flattener.flatten(output) 37 | return output -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: torch2trt 2 | theme: 3 | name: "material" 4 | palette: 5 | primary: green 6 | secondary: light green 7 | 8 | repo_url: https://github.com/NVIDIA-AI-IOT/torch2trt 9 | 10 | plugins: 11 | - search 12 | 13 | use_directory_urls: False 14 | 15 | edit_uri: blob/master/docs 16 | markdown_extensions: 17 | - pymdownx.tabbed 18 | - pymdownx.keys 19 | - pymdownx.snippets 20 | - pymdownx.inlinehilite 21 | - pymdownx.highlight: 22 | use_pygments: true 23 | - admonition 24 | - pymdownx.details 25 | - pymdownx.superfences 26 | - attr_list 27 | 28 | # use_directory_urls - False to fix broken raw html image links 29 | # https://github.com/mkdocs/mkdocs/issues/991 30 | 31 | 32 | nav: 33 | 34 | - Home: index.md 35 | - Getting Started: getting_started.md 36 | - Usage: 37 | - Basic Usage: usage/basic_usage.md 38 | - Reduced Precision:
usage/reduced_precision.md 39 | - Custom Converter: usage/custom_converter.md 40 | - Converters: converters.md 41 | - Benchmarks: 42 | - Jetson Nano: benchmarks/jetson_nano.md 43 | - Jetson Xavier: benchmarks/jetson_xavier.md 44 | - Contributing: CONTRIBUTING.md 45 | - See Also: see_also.md 46 | 47 | extra_css: 48 | - css/version-select.css 49 | extra_javascript: 50 | - js/version-select.js 51 | 52 | google_analytics: 53 | - UA-135919510-3 54 | - auto 55 | 56 | -------------------------------------------------------------------------------- /tests/feature_tests/test_legacy_max_batch_size.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch2trt import torch2trt 4 | 5 | 6 | def test_legacy_max_batch_size(): 7 | 8 | model = nn.Conv2d(3, 6, kernel_size=1).cuda().eval() 9 | 10 | data = torch.randn(1, 3, 32, 32).cuda() 11 | 12 | model_trt = torch2trt(model, [data], max_batch_size=4) 13 | 14 | 15 | data = torch.randn(1, 3, 32, 32).cuda() 16 | out = model(data) 17 | out_trt = model_trt(data) 18 | 19 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 20 | 21 | 22 | data = torch.randn(4, 3, 32, 32).cuda() 23 | out = model(data) 24 | out_trt = model_trt(data) 25 | 26 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 27 | 28 | def test_legacy_max_batch_size_conv1d(): 29 | 30 | model = nn.Conv1d(10, 20, kernel_size=1).cuda().eval() 31 | 32 | data = torch.randn(1, 10, 32).cuda() 33 | 34 | model_trt = torch2trt(model, [data], max_batch_size=4, use_onnx=False) 35 | 36 | 37 | data = torch.randn(1, 10, 32).cuda() 38 | out = model(data) 39 | out_trt = model_trt(data) 40 | 41 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 42 | 43 | 44 | data = torch.randn(4, 10, 32).cuda() 45 | out = model(data) 46 | out_trt = model_trt(data) 47 | 48 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 49 | 50 | if __name__ == '__main__': 51 | test_legacy_max_batch_size_conv1d() -------------------------------------------------------------------------------- /tests/model_tests/torchvision/test_segmentation_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch2trt 4 | 5 | 6 | class ModelWrapper(torch.nn.Module): 7 | def __init__(self, model): 8 | super(ModelWrapper, self).__init__() 9 | self.model = model 10 | def forward(self, x): 11 | return self.model(x)['out'] 12 | 13 | 14 | def _cross_validate_module(model, shape=(224, 224)): 15 | model = model.cuda().eval() 16 | data = torch.randn(1, 3, *shape).cuda() 17 | model_trt = torch2trt.torch2trt(model, [data]) 18 | data = torch.randn(1, 3, *shape).cuda() 19 | out = model(data) 20 | out_trt = model_trt(data) 21 | assert torch.allclose(out, out_trt, rtol=1e-2, atol=1e-2) 22 | 23 | 24 | 25 | def test_deeplabv3_resnet50(): 26 | bb = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=False) 27 | model = ModelWrapper(bb) 28 | _cross_validate_module(model) 29 | 30 | 31 | def test_deeplabv3_resnet101(): 32 | bb = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=False) 33 | model = ModelWrapper(bb) 34 | _cross_validate_module(model) 35 | 36 | 37 | def test_fcn_resnet50(): 38 | bb = torchvision.models.segmentation.fcn_resnet50(pretrained=False) 39 | model = ModelWrapper(bb) 40 | _cross_validate_module(model) 41 | 42 | 43 | def test_fcn_resnet101(): 44 | bb = torchvision.models.segmentation.fcn_resnet101(pretrained=False) 45 | model = 
ModelWrapper(bb) 46 | _cross_validate_module(model) -------------------------------------------------------------------------------- /examples/contrib/pre_py3.7/fix-getitem.patch: -------------------------------------------------------------------------------- 1 | From d9b35495da58038fd5045cc0e2c1f0416f8e62f0 Mon Sep 17 00:00:00 2001 2 | From: Chao Zhang 3 | Date: Tue, 21 Jun 2022 15:38:23 +0000 4 | Subject: [PATCH] Fix getitem for Py<3.7 5 | 6 | --- 7 | torch2trt/torch2trt.py | 13 ++++++++++++- 8 | 1 file changed, 12 insertions(+), 1 deletion(-) 9 | 10 | diff --git a/torch2trt/torch2trt.py b/torch2trt/torch2trt.py 11 | index 3aa6946..9528f1a 100644 12 | --- a/torch2trt/torch2trt.py 13 | +++ b/torch2trt/torch2trt.py 14 | @@ -310,6 +310,14 @@ def attach_converter(ctx, method, converter, method_str): 15 | return wrapper 16 | 17 | 18 | +def _getitem_wrapper(method=torch.Tensor.__getitem__): 19 | + def wrapper(arg0, arg1): 20 | + if type(arg1) is torch.Tensor: 21 | + arg1 = (arg1, ) 22 | + return method(arg0, arg1) 23 | + return wrapper 24 | + 25 | + 26 | class ConversionHook(object): 27 | """Attaches TensorRT converter to PyTorch method call""" 28 | 29 | @@ -330,7 +338,10 @@ class ConversionHook(object): 30 | ) 31 | 32 | def __exit__(self, type, val, tb): 33 | - self._set_method(self.converter['method_impl']) 34 | + if '__getitem__' in self.converter['method_str']: 35 | + self._set_method(_getitem_wrapper()) 36 | + else: 37 | + self._set_method(self.converter['method_impl']) 38 | 39 | def default_input_names(num_inputs): 40 | return ["input_%d" % i for i in range(num_inputs)] 41 | -- 42 | 2.32.0 43 | 44 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0.0) 2 | project(torch2trt_plugins VERSION 0.1.0) 3 | 4 | # VARIABLES 5 | set(CUDA_ARCHITECTURES 53 62 72 87) 6 | 7 | # BUILD PLUGINS LIBRARY 8 | find_package(CUDA REQUIRED) 9 | 10 | enable_language(CUDA) 11 | 12 | include_directories("${CUDA_INCLUDE_DIRS}") 13 | 14 | add_library(torch2trt_plugins SHARED 15 | plugins/src/example_plugin.cu 16 | plugins/src/reflection_pad_2d_plugin.cu 17 | ) 18 | set_property(TARGET torch2trt_plugins PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) 19 | 20 | target_link_libraries( 21 | torch2trt_plugins 22 | nvinfer 23 | ${CUDA_LIBRARIES} 24 | ) 25 | 26 | install (TARGETS torch2trt_plugins 27 | LIBRARY DESTINATION lib) 28 | 29 | # BUILD TESTS 30 | find_package(Catch2 QUIET) 31 | 32 | if(Catch2_FOUND) 33 | include(CTest) 34 | include(CPack) 35 | include(Catch) 36 | enable_testing() 37 | 38 | add_executable(torch2trt_plugins_test 39 | plugins/src/tests.cpp 40 | plugins/src/example_plugin_test.cpp 41 | plugins/src/reflection_pad_2d_plugin_test.cpp 42 | ) 43 | 44 | set_property(TARGET torch2trt_plugins_test PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) 45 | 46 | target_link_libraries(torch2trt_plugins_test 47 | PRIVATE 48 | Catch2::Catch2WithMain 49 | torch2trt_plugins 50 | nvinfer 51 | ${CUDA_LIBRARIES} 52 | ) 53 | 54 | set(CPACK_PROJECT_NAME ${PROJECT_NAME}) 55 | set(CPACK_PROJECT_VERSION ${PROJECT_VERSION}) 56 | catch_discover_tests(torch2trt_plugins_test) 57 | endif() -------------------------------------------------------------------------------- /torch2trt/plugins/plugins.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "interpolate.cpp" 3 | #include "group_norm.cpp" 4 | 5 | 6 
| using namespace nvinfer1; 7 | 8 | namespace torch2trt { 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | py::class_(m, "InterpolatePlugin") 11 | .def(py::init, std::string, bool>(), py::arg("size"), py::arg("mode"), py::arg("align_corners")) 12 | .def(py::init(), py::arg("data")) 13 | .def("getSerializationSize", &InterpolatePlugin::getSerializationSize) 14 | .def("deserializeFromString", &InterpolatePlugin::deserializeFromString) 15 | .def("serializeToString", [](const InterpolatePlugin& plugin) { 16 | std::string data = plugin.serializeToString(); 17 | return py::bytes(data); 18 | }); 19 | py::class_(m, "GroupNormPlugin") 20 | .def(py::init(), py::arg("num_groups"), py::arg("weight"), py::arg("bias"), py::arg("eps")) 21 | .def(py::init(), py::arg("data")) 22 | .def("getSerializationSize", &GroupNormPlugin::getSerializationSize) 23 | .def("deserializeFromString", &GroupNormPlugin::deserializeFromString) 24 | .def("serializeToString", [](const GroupNormPlugin& plugin) { 25 | std::string data = plugin.serializeToString(); 26 | return py::bytes(data); 27 | }); 28 | 29 | } 30 | } // namespace torch2trt 31 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/quant_activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from . import _utils 3 | from pytorch_quantization import tensor_quant 4 | from pytorch_quantization.nn.modules import _utils as utils 5 | 6 | class QuantReLU(torch.nn.ReLU,utils.QuantInputMixin): 7 | """ 8 | Quantized ReLU. However, the output of the ReLU needs to be quantized for it to correctly map to a TRT layer 9 | """ 10 | default_quant_desc_input = tensor_quant.QUANT_DESC_8BIT_PER_TENSOR 11 | 12 | def __init__(self,inplace=False,**kwargs): 13 | super(QuantReLU,self).__init__(inplace) 14 | quant_desc_input = _utils.pop_quant_desc_in_kwargs(self.__class__, input_only=True, **kwargs) 15 | self.init_quantizer(quant_desc_input) 16 | 17 | def forward(self,input): 18 | output = super(QuantReLU,self).forward(input) 19 | ## Although o/p of relu is being quantized, terminology still says input quantizer, will change later 20 | output = self._input_quantizer(output) 21 | return output 22 | 23 | ## Inference class for quantized relu 24 | class IQuantReLU(torch.nn.ReLU,_utils.QuantMixinInput): 25 | ''' 26 | Mimicking inference side for relu followed by a quantized layer 27 | ''' 28 | def __init__(self,inplace=False): 29 | super().__init__(inplace) 30 | self.init_quantizer() 31 | 32 | def __repr__(self): 33 | s = super().__repr__() 34 | s = "(" + s + "dynamic_range amax {0:.4f})".format(self._input_quantizer.learned_amax) 35 | return s 36 | 37 | 38 | def forward(self,inputs): 39 | return super(IQuantReLU,self).forward(inputs) 40 | 41 | -------------------------------------------------------------------------------- /docs/usage/basic_usage.md: -------------------------------------------------------------------------------- 1 | # Basic Usage 2 | 3 | This page demonstrates basic torch2trt usage. 4 | 5 | ## Conversion 6 | 7 | You can easily convert a PyTorch module by calling ``torch2trt``, passing example data as input, for example to convert ``alexnet`` we call 8 | 9 | ```python 10 | import torch 11 | from torch2trt import torch2trt 12 | from torchvision.models.alexnet import alexnet 13 | 14 | # create some regular pytorch model...
15 | model = alexnet(pretrained=True).eval().cuda() 16 | 17 | # create example data 18 | x = torch.ones((1, 3, 224, 224)).cuda() 19 | 20 | # convert to TensorRT feeding sample data as input 21 | model_trt = torch2trt(model, [x]) 22 | ``` 23 | 24 | !!! note 25 | 26 | Currently with torch2trt, once the model is converted, you must use the same input shapes during 27 | execution. The exception is 28 | the batch size, which can vary up to the value specified by the ``max_batch_size`` parameter. 29 | 30 | ## Execution 31 | 32 | We can execute the returned ``TRTModule`` just like the original PyTorch model. Here we 33 | execute the model and print the maximum absolute error. 34 | 35 | ```python 36 | y = model(x) 37 | y_trt = model_trt(x) 38 | 39 | # check the output against PyTorch 40 | print(torch.max(torch.abs(y - y_trt))) 41 | ``` 42 | 43 | ## Saving and loading 44 | 45 | We can save the model as a ``state_dict``. 46 | 47 | ```python 48 | torch.save(model_trt.state_dict(), 'alexnet_trt.pth') 49 | ``` 50 | 51 | We can load the saved model into a ``TRTModule``. 52 | 53 | ```python 54 | from torch2trt import TRTModule 55 | 56 | model_trt = TRTModule() 57 | 58 | model_trt.load_state_dict(torch.load('alexnet_trt.pth')) 59 | ``` 60 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/utils/pytorch_nvidia_quantization.patch: -------------------------------------------------------------------------------- 1 | From fa12201005e221fc6de8b0d836fdd60c0a107aaa Mon Sep 17 00:00:00 2001 2 | From: Kshitij Srivastava 3 | Date: Wed, 4 Nov 2020 18:01:14 -0500 4 | Subject: [PATCH] saving learned amax as a part of state dict 5 | 6 | --- 7 | .../pytorch_quantization/nn/modules/tensor_quantizer.py | 5 +++++ 8 | 1 file changed, 5 insertions(+) 9 | 10 | diff --git a/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py b/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 11 | index fd3f32c..d26c585 100644 12 | --- a/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 13 | +++ b/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 14 | @@ -87,6 +87,10 @@ class TensorQuantizer(nn.Module): 15 | 16 | if quant_desc.amax is not None: 17 | self.register_buffer('_amax', torch.tensor(quant_desc.amax)) 18 | + 19 | + ##dynamic amax needs to be stored as a part of state dict to be used at inference time to map dynamic range to 20 | + # TRT layer 21 | + self.register_buffer('learned_amax',torch.tensor(1)) 22 | 23 | # Clip module consumes a lot of memory, so only create it if learn_amax is True 24 | if self._learn_amax: 25 | @@ -273,6 +277,7 @@ class TensorQuantizer(nn.Module): 26 | if self._scale_amax is not None: 27 | amax = amax.detach() * self._scale_amax 28 | 29 | + self.learned_amax = amax 30 | return amax 31 | 32 | def _fb_fake_quant(self, inputs, amax): 33 | -- 34 | 2.29.2 35 | 36 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | Below is a list of developers who have contributed to torch2trt. This is also used to track contributors 4 | who have agreed to torch2trt's Contributor License Agreement.
5 | 6 | - [John Welsh](https://github.com/jaybdub) (CLA) 7 | - John Welsh 8 | 9 | ## Becoming a Contributor 10 | 11 | If you've made a notable contribution to torch2trt and wish to be listed as a contributor, simply do the following. 12 | 13 | 1. Modify ``CONTRIBUTORS.md`` and add your name with a hyperlink to your GitHub account to the end of the contributors list. 14 | 15 | ```md 16 | - [](https://github.com/) 17 | ``` 18 | 19 | 2. Stage the changes in a standalone commit 20 | 21 | ```md 22 | git add CONTRIBUTORS.md 23 | ``` 24 | 25 | 3. Make a signed commit with the following message text 26 | 27 | ```md 28 | git commit -m "Added to CONTRIBUTORS.md" 29 | ``` 30 | 31 | ## Signing Contributor License Agreement (CLA) 32 | 33 | In some instances, you may be requested to sign torch2trt's Contributor License Agreement (CLA). To do so, 34 | 35 | 1. If you're not already listed as a contributor in CONTRIBUTORS.md, make a commit as described above to add yourself to CONTRIBUTORS.md 36 | 37 | 2. Add the text ``(CLA)`` after your name in ``CONTRIBUTORS.md`` 38 | 3. Stage the changes in a standalone commit 39 | 40 | ```md 41 | git add CONTRIBUTORS.md 42 | ``` 43 | 4. Make a signed commit with the following text 44 | 45 | ```md 46 | git commit -S -m "I have read and agree to the Contributor License Agreement as written in the file CLA.md of this project. Signed, " 47 | ``` 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorrt 3 | import torch 4 | from setuptools import setup, find_packages 5 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension 6 | from packaging import version 7 | 8 | 9 | def trt_inc_dir(): 10 | return "/usr/include/aarch64-linux-gnu" 11 | 12 | def trt_lib_dir(): 13 | return "/usr/lib/aarch64-linux-gnu" 14 | 15 | ext_modules = [] 16 | exclude_dir = ["torch2trt/contrib","torch2trt/contrib.*"] 17 | 18 | compile_args_cxx = [] 19 | if version.parse(torch.__version__) < version.parse('1.5'): 20 | compile_args_cxx.append('-DUSE_DEPRECATED_INTLIST') 21 | if version.parse(tensorrt.__version__) < version.parse('8'): 22 | compile_args_cxx.append('-DPRE_TRT8') 23 | 24 | plugins_ext_module = CUDAExtension( 25 | name='plugins', 26 | sources=[ 27 | 'torch2trt/plugins/plugins.cpp' 28 | ], 29 | include_dirs=[ 30 | trt_inc_dir() 31 | ], 32 | library_dirs=[ 33 | trt_lib_dir() 34 | ], 35 | libraries=[ 36 | 'nvinfer' 37 | ], 38 | extra_compile_args={ 39 | 'cxx': compile_args_cxx, 40 | 'nvcc': [] 41 | } 42 | ) 43 | 44 | if '--plugins' in sys.argv: 45 | ext_modules.append(plugins_ext_module) 46 | sys.argv.remove('--plugins') 47 | 48 | if '--contrib' in sys.argv: 49 | exclude_dir=[] 50 | sys.argv.remove('--contrib') 51 | 52 | setup( 53 | name='torch2trt', 54 | version='0.5.0', 55 | description='An easy to use PyTorch to TensorRT converter', 56 | packages=find_packages(exclude=exclude_dir), 57 | ext_package='torch2trt', 58 | ext_modules=ext_modules, 59 | cmdclass={'build_ext': BuildExtension} 60 | ) 61 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torchvision.transforms as transforms 4 | 5 | class Cifar10Loaders: 6 | """ 7 | Data loaders for cifar 10 dataset 8 | """ 9 | def 
__init__(self, data_dir='/tmp/cifar10', download=True, batch_size=128, pin_memory=True, num_workers=4): 10 | self.data_dir = data_dir 11 | self.download = download 12 | self.batch_size= batch_size 13 | self.pin_memory = pin_memory 14 | self.num_workers = num_workers 15 | self.train_transform = transforms.Compose([ 16 | transforms.RandomCrop(32, padding=4), 17 | transforms.RandomHorizontalFlip(), 18 | transforms.ToTensor(), 19 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 20 | ]) 21 | self.test_transform = transforms.Compose([ 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 24 | ]) 25 | 26 | def train_loader(self,shuffle=True): 27 | trainset = torchvision.datasets.CIFAR10(root=self.data_dir, train=True, download=True, transform=self.train_transform) 28 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers, pin_memory=self.pin_memory) 29 | return trainloader 30 | 31 | def test_loader(self,shuffle=False): 32 | testset = torchvision.datasets.CIFAR10(root=self.data_dir, train=False, download=True, transform=self.test_transform) 33 | testloader = torch.utils.data.DataLoader(testset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers, pin_memory=self.pin_memory) 34 | return testloader 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/easyocr/generate_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import cv2 3 | import torch 4 | import glob 5 | from easyocr import Reader 6 | from torch2trt.dataset import FolderDataset 7 | from torch2trt import torch2trt, TRTModule 8 | import math 9 | import os 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument('--images', type=str, default='images') 13 | parser.add_argument('--detector_data', type=str, default='detector_data') 14 | parser.add_argument('--recognizer_data', type=str, default='recognizer_data') 15 | parser.add_argument('--max_image_area', type=int, default=1280*720) 16 | parser.add_argument('--recognizer_batch_size', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | 20 | reader = Reader(['en']) 21 | 22 | 23 | detector_dataset = FolderDataset(args.detector_data) 24 | recognizer_dataset = FolderDataset(args.recognizer_data) 25 | 26 | 27 | def shrink_to_area(image, area): 28 | height = image.shape[0] 29 | width = image.shape[1] 30 | 31 | if height * width > area: 32 | ar = width / height 33 | new_height = math.sqrt(area / ar) 34 | new_width = ar * new_height 35 | new_height = math.floor(new_height) 36 | new_width = math.floor(new_width) 37 | print(f'Resizing {width}x{height} to {new_width}x{new_height}') 38 | image = cv2.resize(image, (new_width, new_height)) 39 | 40 | return image 41 | 42 | 43 | with detector_dataset.record(reader.detector.module): 44 | with recognizer_dataset.record(reader.recognizer.module): 45 | 46 | for path in glob.glob(os.path.join(args.images, '*.jpg')): 47 | print(path) 48 | 49 | image = cv2.imread(path) 50 | 51 | image = shrink_to_area(image, args.max_image_area) 52 | 53 | reader.readtext(image, batch_size=args.recognizer_batch_size) 54 | -------------------------------------------------------------------------------- /docs/js/version-select.js: -------------------------------------------------------------------------------- 1 | window.addEventListener("DOMContentLoaded", 
function() { 2 | // This is a bit hacky. Figure out the base URL from a known CSS file the 3 | // template refers to... 4 | var ex = new RegExp("/?css/version-select.css$"); 5 | var sheet = document.querySelector('link[href$="version-select.css"]'); 6 | 7 | var ABS_BASE_URL = sheet.href.replace(ex, ""); 8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); 9 | 10 | function makeSelect(options, selected) { 11 | var select = document.createElement("select"); 12 | select.classList.add("form-control"); 13 | 14 | options.forEach(function(i) { 15 | var option = new Option(i.text, i.value, undefined, 16 | i.value === selected); 17 | select.add(option); 18 | }); 19 | 20 | return select; 21 | } 22 | 23 | var xhr = new XMLHttpRequest(); 24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json"); 25 | xhr.onload = function() { 26 | var versions = JSON.parse(this.responseText); 27 | 28 | var realVersion = versions.find(function(i) { 29 | return i.version === CURRENT_VERSION || 30 | i.aliases.includes(CURRENT_VERSION); 31 | }).version; 32 | 33 | var select = makeSelect(versions.map(function(i) { 34 | return {text: i.title, value: i.version}; 35 | }), realVersion); 36 | select.addEventListener("change", function(event) { 37 | window.location.href = ABS_BASE_URL + "/../" + this.value; 38 | }); 39 | 40 | var container = document.createElement("div"); 41 | container.id = "version-selector"; 42 | container.className = "md-nav__item"; 43 | container.appendChild(select); 44 | 45 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); 46 | sidebar.parentNode.insertBefore(container, sidebar); 47 | }; 48 | xhr.send(); 49 | }); 50 | -------------------------------------------------------------------------------- /tests/feature_tests/test_interpolate_dynamic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from torch2trt import ( 5 | torch2trt, 6 | trt 7 | ) 8 | 9 | 10 | def test_interpolate_dynamic_size(): 11 | 12 | class TestModule(torch.nn.Module): 13 | def forward(self, x): 14 | size = x.size() 15 | return F.interpolate(x, size=(size[2]*2, size[3]*3)) 16 | 17 | module = TestModule().cuda().eval() 18 | 19 | x = torch.randn(1, 3, 32, 32).cuda() 20 | 21 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, min_shapes=[(1, 3, 32, 32)], max_shapes=[(4, 3, 64, 64)], opt_shapes=[(1, 3, 32, 32)]) 22 | 23 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 24 | 25 | x = torch.randn(1, 3, 32, 32).cuda() 26 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 27 | 28 | x = torch.randn(4, 3, 64, 64).cuda() 29 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 30 | 31 | 32 | def test_interpolate_dynamic_shape(): 33 | 34 | class TestModule(torch.nn.Module): 35 | def forward(self, x): 36 | size = x.shape 37 | return F.interpolate(x, size=(size[2]*2, size[3]*3)) 38 | 39 | module = TestModule().cuda().eval() 40 | 41 | x = torch.randn(1, 3, 32, 32).cuda() 42 | 43 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, min_shapes=[(1, 3, 32, 32)], max_shapes=[(4, 3, 64, 64)], opt_shapes=[(1, 3, 32, 32)]) 44 | 45 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 46 | 47 | x = torch.randn(1, 3, 32, 32).cuda() 48 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 49 | 50 | x = torch.randn(4, 3, 64, 64).cuda() 51 | assert(torch.allclose(module_trt(x), module(x), 
atol=1e-2, rtol=1e-2)) 52 | -------------------------------------------------------------------------------- /torch2trt/dataset_calibrator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import tensorrt as trt 3 | import os 4 | from .flattener import Flattener 5 | 6 | __all__ = [ 7 | 'DEFAULT_CALIBRATION_ALGORITHM', 8 | 'DatasetCalibrator' 9 | ] 10 | 11 | 12 | if trt.__version__ >= '5.1': 13 | DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2 14 | else: 15 | DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION 16 | 17 | 18 | class DatasetCalibrator(trt.IInt8Calibrator): 19 | 20 | def __init__(self, dataset, algorithm=DEFAULT_CALIBRATION_ALGORITHM, cache_file=None, flattener=None): 21 | super(DatasetCalibrator, self).__init__() 22 | self.dataset = dataset 23 | self.algorithm = algorithm 24 | self.count = 0 25 | self.cache_file = cache_file 26 | if flattener is None: 27 | flattener = Flattener.from_value(dataset[0]) 28 | self.flattener = flattener 29 | 30 | def get_batch(self, *args, **kwargs): 31 | if self.count < len(self.dataset): 32 | tensors = self.flattener.flatten(self.dataset[self.count]) 33 | bindings = [int(t.data_ptr()) for t in tensors] 34 | self.count += 1 35 | return bindings 36 | else: 37 | return [] 38 | 39 | def get_algorithm(self): 40 | return self.algorithm 41 | 42 | def get_batch_size(self): 43 | return 1 44 | 45 | def read_calibration_cache(self, *args, **kwargs): 46 | if (self.cache_file is not None) and os.path.exists(self.cache_file): 47 | with open(self.cache_file, 'rb') as f: 48 | return f.read() 49 | 50 | def write_calibration_cache(self, cache, *args, **kwargs): 51 | if self.cache_file is not None: 52 | with open(self.cache_file, 'wb') as f: 53 | f.write(cache) 54 | -------------------------------------------------------------------------------- /torch2trt/misc_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import tensorrt as trt 4 | 5 | from .version_utils import ( 6 | trt_version 7 | ) 8 | 9 | 10 | def torch_dtype_to_trt(dtype): 11 | if trt_version() >= '7.0' and dtype == torch.bool: 12 | return trt.bool 13 | elif dtype == torch.int8: 14 | return trt.int8 15 | elif dtype == torch.int32: 16 | return trt.int32 17 | elif dtype == torch.float16: 18 | return trt.float16 19 | elif dtype == torch.float32: 20 | return trt.float32 21 | else: 22 | raise TypeError("%s is not supported by tensorrt" % dtype) 23 | 24 | 25 | def torch_dtype_from_trt(dtype): 26 | if dtype == trt.int8: 27 | return torch.int8 28 | elif trt_version() >= '7.0' and dtype == trt.bool: 29 | return torch.bool 30 | elif dtype == trt.int32: 31 | return torch.int32 32 | elif dtype == trt.float16: 33 | return torch.float16 34 | elif dtype == trt.float32: 35 | return torch.float32 36 | else: 37 | raise TypeError("%s is not supported by torch" % dtype) 38 | 39 | 40 | def torch_device_to_trt(device): 41 | if device.type == torch.device("cuda").type: 42 | return trt.TensorLocation.DEVICE 43 | elif device.type == torch.device("cpu").type: 44 | return trt.TensorLocation.HOST 45 | else: 46 | return TypeError("%s is not supported by tensorrt" % device) 47 | 48 | 49 | def torch_device_from_trt(device): 50 | if device == trt.TensorLocation.DEVICE: 51 | return torch.device("cuda") 52 | elif device == trt.TensorLocation.HOST: 53 | return torch.device("cpu") 54 | else: 55 | return TypeError("%s is not supported by torch" % 
device) 56 | 57 | 58 | def trt_int_dtype(): 59 | if trt_version() >= "10.0": 60 | return np.int64 61 | else: 62 | return np.int32 63 | 64 | -------------------------------------------------------------------------------- /torch2trt/converters/plugin_converters.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch2trt.torch2trt import * 4 | import numpy as np 5 | import ctypes 6 | 7 | 8 | try: 9 | ctypes.CDLL('libtorch2trt_plugins.so') 10 | 11 | def create_reflection_pad_2d_plugin(paddingLeft, paddingRight, paddingTop, paddingBottom): 12 | 13 | registry = trt.get_plugin_registry() 14 | creator = registry.get_plugin_creator('ReflectionPad2dPlugin', '1', '') 15 | 16 | fc = trt.PluginFieldCollection([ 17 | trt.PluginField( 18 | 'paddingLeft', 19 | np.array([paddingLeft]).astype(np.int32), 20 | trt.PluginFieldType.INT32 21 | ), 22 | trt.PluginField( 23 | 'paddingRight', 24 | np.array([paddingRight]).astype(np.int32), 25 | trt.PluginFieldType.INT32 26 | ), 27 | trt.PluginField( 28 | 'paddingTop', 29 | np.array([paddingTop]).astype(np.int32), 30 | trt.PluginFieldType.INT32 31 | ), 32 | trt.PluginField( 33 | 'paddingBottom', 34 | np.array([paddingBottom]).astype(np.int32), 35 | trt.PluginFieldType.INT32 36 | ) 37 | ]) 38 | 39 | return creator.create_plugin('', fc) 40 | @tensorrt_converter(nn.ReflectionPad2d.forward) 41 | def convert_reflection_pad(ctx): 42 | module = get_arg(ctx, 'self', pos=0, default=None) 43 | input = get_arg(ctx, 'x', pos=1, default=None) 44 | output = ctx.method_return 45 | input_trt = input._trt 46 | plugin = create_reflection_pad_2d_plugin( 47 | module.padding[0], 48 | module.padding[1], 49 | module.padding[2], 50 | module.padding[3] 51 | ) 52 | layer = ctx.network.add_plugin_v2([input_trt], plugin) 53 | output._trt = layer.get_output(0) 54 | 55 | except: 56 | pass -------------------------------------------------------------------------------- /scripts/dump_converters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import subprocess 4 | import os 5 | from importlib.machinery import SourceFileLoader 6 | 7 | torch2trt = SourceFileLoader("torch2trt", "torch2trt/__init__.py").load_module() # to load relative to root 8 | 9 | HEADER = """ 10 | # Converters 11 | 12 | This table contains a list of supported PyTorch methods and their associated converters. 13 | 14 | If your model is not converting, a good start in debugging would be to see if it contains a method not listed 15 | in this table. You may also find these a useful reference when writing your own converters. 
16 | 17 | | Method | Converter | 18 | |--------|-----------|""" 19 | 20 | if __name__ == '__main__': 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--github', 24 | type=str, 25 | default='https://github.com/NVIDIA-AI-IOT/torch2trt') 26 | parser.add_argument('--tag', type=str, default='master') 27 | args = parser.parse_args() 28 | 29 | print(HEADER) 30 | 31 | for method, entry in torch2trt.CONVERTERS.items(): 32 | 33 | if not entry['is_real']: 34 | continue 35 | 36 | converter = entry['converter'] 37 | 38 | # get commit hash 39 | # p = subprocess.Popen(['git', 'rev-parse', 'HEAD'], 40 | # stdout=subprocess.PIPE, 41 | # stderr=subprocess.PIPE) 42 | # commit, err = p.communicate() 43 | # commit = commit.decode('utf-8').strip('\n') 44 | 45 | # get github URL 46 | url = '{github}/blob/{commit}/{relpath}#L{lineno}'.format( 47 | github=args.github, 48 | commit=args.tag, 49 | relpath=os.path.relpath(converter.__code__.co_filename, 50 | os.path.abspath('.')), 51 | lineno=converter.__code__.co_firstlineno) 52 | 53 | print('| ``{method}`` | [``{converter}``]({url}) |'.format( 54 | method=method, converter=converter.__name__, url=url)) 55 | -------------------------------------------------------------------------------- /tests/feature_tests/test_flatten_dynamic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch2trt import torch2trt, trt 3 | import torch 4 | 5 | 6 | class FlattenModule(torch.nn.Module): 7 | def __init__(self, start_dim, end_dim): 8 | super().__init__() 9 | self.start_dim = start_dim 10 | self.end_dim = end_dim 11 | 12 | def forward(self, x): 13 | return torch.flatten(x, self.start_dim, self.end_dim) 14 | 15 | 16 | def test_flatten_dynamic_0_n1(): 17 | 18 | # 0, -1 19 | module = FlattenModule(start_dim=0, end_dim=-1).cuda().eval() 20 | 21 | x = torch.randn(1, 4, 5).cuda() 22 | 23 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 24 | 25 | x = torch.randn(1, 4, 5).cuda() 26 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 27 | 28 | x = torch.randn(4, 4, 5).cuda() 29 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 30 | 31 | 32 | def test_flatten_dynamic_1_n1(): 33 | # 1, -1 34 | module = FlattenModule(start_dim=1, end_dim=-1).cuda().eval() 35 | 36 | x = torch.randn(1, 4, 5).cuda() 37 | 38 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 39 | 40 | x = torch.randn(1, 4, 5).cuda() 41 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 42 | 43 | x = torch.randn(4, 4, 5).cuda() 44 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 45 | 46 | 47 | def test_flatten_dynamic_0_1(): 48 | # 0, 1 49 | module = FlattenModule(start_dim=0, end_dim=1).cuda().eval() 50 | 51 | x = torch.randn(1, 4, 5).cuda() 52 | 53 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 54 | 55 | x = torch.randn(1, 4, 5).cuda() 56 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 57 | 58 | x = torch.randn(4, 4, 5).cuda() 59 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | test_flatten_dynamic_0_1() -------------------------------------------------------------------------------- /torch2trt/utils.py: -------------------------------------------------------------------------------- 1 | import graphviz 2 | import tensorrt as trt 3 | 4 | 5 | def 
trt_network_to_dot_graph(network): 6 | dot = graphviz.Digraph(comment="Network") 7 | 8 | # add nodes (layers) 9 | for i in range(network.num_layers): 10 | layer = network.get_layer(i) 11 | dot.node(layer.name) 12 | 13 | # add nodes (inputs) 14 | for i in range(network.num_inputs): 15 | dot.node(network.get_input(i).name) 16 | 17 | # add nodes (outputs) 18 | for i in range(network.num_outputs): 19 | dot.node(network.get_output(i).name) 20 | 21 | # add layer->layer edges 22 | for a in range(network.num_layers): 23 | layer_a = network.get_layer(a) 24 | 25 | for b in range(network.num_layers): 26 | layer_b = network.get_layer(b) 27 | 28 | for i in range(layer_a.num_outputs): 29 | output_i = layer_a.get_output(i) 30 | 31 | for j in range(layer_b.num_inputs): 32 | input_j = layer_b.get_input(j) 33 | 34 | if output_i == input_j: 35 | dot.edge(layer_a.name, layer_b.name, label=str(input_j.shape)) 36 | 37 | # add input->layer edges 38 | for i in range(network.num_inputs): 39 | input_i = network.get_input(i) 40 | 41 | for b in range(network.num_layers): 42 | layer_b = network.get_layer(b) 43 | 44 | for j in range(layer_b.num_inputs): 45 | input_j = layer_b.get_input(j) 46 | 47 | if input_i == input_j: 48 | dot.edge(input_i.name, layer_b.name, label=str(input_j.shape)) 49 | 50 | # add layer->output edges 51 | for i in range(network.num_outputs): 52 | input_i = network.get_output(i) 53 | 54 | for b in range(network.num_layers): 55 | layer_b = network.get_layer(b) 56 | 57 | for j in range(layer_b.num_outputs): 58 | input_j = layer_b.get_output(j) 59 | 60 | if input_i == input_j: 61 | dot.edge(layer_b.name, input_i.name, label=str(input_j.shape)) 62 | 63 | return dot 64 | -------------------------------------------------------------------------------- /scripts/profile_timm_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | profile_timm() { 4 | python3 scripts/profile_timm.py --model $1 --size $2 5 | python3 scripts/profile_timm.py --model $1 --use-onnx --size $2 6 | } 7 | 8 | profile_timm beit_base_patch16_224 224 9 | profile_timm botnet26t_256 256 10 | profile_timm gernet_s 224 11 | profile_timm cait_xxs24_224 224 12 | profile_timm coat_tiny 224 13 | profile_timm convit_tiny 224 14 | profile_timm convmixer_768_32 224 15 | profile_timm convnext_tiny 288 16 | profile_timm crossvit_15_240 240 17 | profile_timm cspresnet50 256 18 | profile_timm deit_tiny_patch16_224 224 19 | profile_timm densenet121 224 20 | profile_timm dla34 224 21 | profile_timm dpn68 224 22 | profile_timm edgenext_xx_small 288 23 | profile_timm efficientformer_l1 224 24 | profile_timm gcvit_xxtiny 224 25 | profile_timm ghostnet_050 224 26 | profile_timm gluon_resnet18_v1b 224 27 | profile_timm gluon_xception65 299 28 | profile_timm hardcorenas_a 224 29 | profile_timm hrnet_w18_small 224 30 | profile_timm inception_resnet_v2 299 31 | profile_timm inception_v3 299 32 | profile_timm inception_v4 299 33 | profile_timm levit_128s 224 34 | profile_timm maxvit_tiny_224 224 35 | profile_timm coatnet_0_224 224 36 | profile_timm mixer_s32_224 224 37 | profile_timm mobilenetv3_small_050 224 38 | profile_timm mobilevit_xs 256 39 | profile_timm mvitv2_tiny 224 40 | profile_timm nasnetalarge 331 41 | profile_timm nest_tiny 224 42 | profile_timm dm_nfnet_f0 256 43 | profile_timm pit_ti_224 224 44 | profile_timm pnasnet5large 331 45 | profile_timm poolformer_s12 224 46 | profile_timm pvt_v2_b0 224 47 | profile_timm regnetx_040 224 48 | profile_timm res2net50_26w_4s 224 49 | profile_timm 
resnest14d 224 50 | profile_timm resnet10t 224 51 | profile_timm resnetv2_50 224 52 | profile_timm rexnet_100 224 53 | profile_timm selecsls42 224 54 | profile_timm legacy_senet154 224 55 | profile_timm sequencer2d_s 224 56 | profile_timm skresnet18 224 57 | profile_timm swin_tiny_patch4_window7_224 224 58 | profile_timm swinv2_tiny_window8_256 256 59 | profile_timm swinv2_cr_tiny_224 224 60 | profile_timm tnt_s_patch16_224 224 61 | profile_timm tresnet_m 224 62 | profile_timm twins_pcpvt_small 224 63 | profile_timm vgg11 224 64 | profile_timm visformer_tiny 224 65 | profile_timm volo_d1_224 224 66 | profile_timm vovnet39a 224 67 | profile_timm xception 299 68 | profile_timm xception41 299 69 | profile_timm xcit_nano_12_p16_224 224 -------------------------------------------------------------------------------- /tests/feature_tests/test_dynamic_shape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | import tensorrt as trt 5 | from torch2trt import torch2trt 6 | from torch2trt.dataset import ListDataset 7 | 8 | 9 | def test_dynamic_shape_conv2d(): 10 | 11 | torch.manual_seed(0) 12 | 13 | module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 14 | 15 | dataset = ListDataset() 16 | dataset.insert((torch.randn(1, 3, 224, 224).cuda(),)) 17 | dataset.insert((torch.randn(1, 3, 64, 64).cuda(),)) 18 | dataset.insert((torch.randn(1, 3, 128, 128).cuda(),)) 19 | dataset.insert((torch.randn(4, 3, 32, 32).cuda(),)) 20 | 21 | module_trt = torch2trt( 22 | module, 23 | dataset, 24 | log_level=trt.Logger.INFO 25 | ) 26 | 27 | inputs = dataset[0] 28 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 29 | inputs = dataset[1] 30 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 31 | inputs = dataset[2] 32 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 33 | inputs = dataset[3] 34 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 35 | 36 | 37 | def test_dynamic_shape_conv2d_onnx(): 38 | 39 | torch.manual_seed(0) 40 | 41 | module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 42 | 43 | dataset = ListDataset() 44 | dataset.insert((torch.randn(1, 3, 224, 224).cuda(),)) 45 | dataset.insert((torch.randn(1, 3, 64, 64).cuda(),)) 46 | dataset.insert((torch.randn(1, 3, 128, 128).cuda(),)) 47 | dataset.insert((torch.randn(4, 3, 32, 32).cuda(),)) 48 | 49 | module_trt = torch2trt( 50 | module, 51 | dataset, 52 | use_onnx=True, 53 | log_level=trt.Logger.INFO 54 | ) 55 | 56 | inputs = dataset[0] 57 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 58 | inputs = dataset[1] 59 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 60 | inputs = dataset[2] 61 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 62 | inputs = dataset[3] 63 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | test_dynamic_shape_conv2d() -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | """ 5 | """ 6 | parser = argparse.ArgumentParser(description='PyTorch QAT') 7 | 
parser.add_argument('--tl','--transfer_learning',action='store_true',help='used to map weights correctly') 8 | parser.add_argument('--iter',default=300, type=int, help='no of iterations') 9 | parser.add_argument('--m','--model_name',default=None,help="Name of the model") 10 | parser.add_argument('--b', '--batch_size', default=128, type=int, help='mini-batch size (default: 32)') 11 | parser.add_argument('--optimizer', default='Adam', type=str,help='type of optimizer (default=Adam)') 12 | parser.add_argument( '--wd','--weight-decay', default=1e-5, type=float, help='weight decay (default: 1e-5)') 13 | parser.add_argument('--start_epoch','--s_ep', default=0, type=int, help='starting epoch') 14 | parser.add_argument('--num_epochs',default=30,type=int, help='no of epochs') 15 | parser.add_argument('--no_cuda', action='store_true',help='disables cuda training') 16 | parser.add_argument('--seed', type=int, default=12345,help='random seed for experiments. [default: 12345]') 17 | parser.add_argument('--lr', '--learning_rate', default=1e-3, type=float, help='initial learning rate') 18 | parser.add_argument('--lrdt', '--learning_rate_decay_interval', default=30, type=int, help='initial learning rate decay after n epochs') 19 | parser.add_argument('--od','--output_dir', default='/tmp/',help='output path') 20 | parser.add_argument('--en','--exp_name', default='pytorch_exp',help = 'experiment name to create output dir') 21 | parser.add_argument('--load_ckpt', default = None, help = "path to ckpt") 22 | parser.add_argument('--netqat',action='store_true',help = 'quantize model using custom layer') 23 | parser.add_argument('--partial_ckpt',action='store_true',help = 'load_partial checkpoint') 24 | parser.add_argument('--v','--verbose',action='store_true') 25 | parser.add_argument('--FP16',action='store_true',help='run TRT engine at FP16') 26 | parser.add_argument('--test_trt',action='store_true',help='gather metrics using trt') 27 | parser.add_argument('--INT8PTC',action='store_true',help='run TRT engine at INT8 with Post Training Cal') 28 | parser.add_argument('--INT8QAT',action='store_true',help='run TRT engine at INT8 with QAT') 29 | args = parser.parse_args() 30 | return args 31 | 32 | -------------------------------------------------------------------------------- /docs/usage/custom_converter.md: -------------------------------------------------------------------------------- 1 | # Custom Converter 2 | 3 | This page details how to extend or modify the behavior of torch2trt by implementing and registering 4 | custom converters. 5 | 6 | ## Background 7 | 8 | torch2trt works by attaching conversion functions (like ``convert_ReLU``) to the original 9 | PyTorch functional calls (like ``torch.nn.ReLU.forward``). The sample input data is passed 10 | through the network, just as before, except now whenever a registered function (``torch.nn.ReLU.forward``) 11 | is encountered, the corresponding converter (``convert_ReLU``) is also called afterwards. The converter 12 | is passed the arguments and return statement of the original PyTorch function, as well as the TensorRT 13 | network that is being constructed. The input tensors to the original PyTorch function are modified to 14 | have an attribute ``_trt``, which is the TensorRT counterpart to the PyTorch tensor. The conversion function 15 | uses this ``_trt`` to add layers to the TensorRT network, and then sets the ``_trt`` attribute for 16 | relevant output tensors. 
Once the model is fully executed, the final tensors returned are marked as outputs 17 | of the TensorRT network, and the optimized TensorRT engine is built. 18 | 19 | ## Add a custom converter 20 | 21 | Here we show how to add a converter for the ``ReLU`` module using the TensorRT 22 | Python API. 23 | 24 | ```python 25 | import tensorrt as trt 26 | from torch2trt import tensorrt_converter 27 | 28 | @tensorrt_converter('torch.nn.ReLU.forward') 29 | def convert_ReLU(ctx): 30 | input = ctx.method_args[1] 31 | output = ctx.method_return 32 | layer = ctx.network.add_activation(input=input._trt, type=trt.ActivationType.RELU) 33 | output._trt = layer.get_output(0) 34 | ``` 35 | 36 | The converter takes one argument, a ``ConversionContext``, which will contain 37 | the following 38 | 39 | * ``ctx.network`` - The TensorRT network that is being constructed. 40 | 41 | * ``ctx.method_args`` - Positional arguments that were passed to the specified PyTorch function. The ``_trt`` attribute is set for relevant input tensors. 42 | * ``ctx.method_kwargs`` - Keyword arguments that were passed to the specified PyTorch function. 43 | * ``ctx.method_return`` - The value returned by the specified PyTorch function. The converter must set the ``_trt`` attribute where relevant. 44 | 45 | Please see the [converters](../converters.md) page for a list of implemented converters and links to their source code. These may help 46 | in learning how to write converters. 47 | -------------------------------------------------------------------------------- /examples/easyocr/optimize_detector.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from torch2trt.dataset import FolderDataset, ListDataset 3 | from torch2trt import torch2trt, TRTModule 4 | from easyocr import Reader 5 | import tensorrt as trt 6 | import torch 7 | import time 8 | from tempfile import mkdtemp 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument('--detector_data', type=str, default='detector_data') 12 | parser.add_argument('--output', type=str, default='detector_trt.pth') 13 | parser.add_argument('--int8', action='store_true') 14 | parser.add_argument('--fp16', action='store_true') 15 | parser.add_argument('--dla', action='store_true') 16 | parser.add_argument('--dla_core', type=int, default=0) 17 | args = parser.parse_args() 18 | 19 | detector_dataset = FolderDataset(args.detector_data) 20 | 21 | if len(detector_dataset) == 0: 22 | raise ValueError('Detector dataset is empty, make sure to run generate_data.py first.') 23 | 24 | reader = Reader(['en']) 25 | detector_torch = reader.detector.module 26 | 27 | if args.int8: 28 | num_calib = 5 29 | calib_dataset = FolderDataset(mkdtemp()) 30 | for i in range(num_calib): 31 | calib_dataset.insert(tuple([t + 0.2 * torch.randn_like(t) for t in detector_dataset[i % len(detector_dataset)]])) 32 | 33 | print('Running torch2trt...') 34 | detector_trt = torch2trt( 35 | detector_torch, 36 | detector_dataset, 37 | int8_mode=args.int8, 38 | fp16_mode=args.fp16, 39 | default_device_type=trt.DeviceType.DLA if args.dla else trt.DeviceType.GPU, 40 | max_workspace_size=1 << 26, 41 | log_level=trt.Logger.VERBOSE, 42 | int8_calib_dataset=calib_dataset if args.int8 else None, 43 | int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION, 44 | use_onnx=True 45 | ) 46 | 47 | torch.save(detector_trt.state_dict(), args.output) 48 | 49 | def profile_module(module, dataset, count=None): 50 | 51 | if count is None: 52 | count = len(dataset) 53 
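# The first call below is a warmup: it runs any lazy CUDA / TensorRT initialization
# (context creation, memory allocation) outside of the timed region. The explicit
# stream synchronizations around the timed loop matter because CUDA launches are
# asynchronous; without them the loop would mostly measure kernel launch overhead
# rather than actual execution time.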
| 54 | output = module(*dataset[0]) # warmup 55 | 56 | torch.cuda.current_stream().synchronize() 57 | t0 = time.monotonic() 58 | for i in range(count): 59 | output = module(*dataset[i % len(dataset)]) 60 | torch.cuda.current_stream().synchronize() 61 | t1 = time.monotonic() 62 | 63 | return count / (t1 - t0) 64 | 65 | print('Profiling PyTorch...') 66 | fps_torch = profile_module(detector_torch, detector_dataset, 30) 67 | print(f'FPS Torch: {fps_torch}') 68 | 69 | print('Profiling TensorRT') 70 | fps_trt = profile_module(detector_trt, detector_dataset, 30) 71 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /benchmarks/JETSON_NANO.md: -------------------------------------------------------------------------------- 1 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 2 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 3 | | torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | 4 | | torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | 5 | | torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | 6 | | torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | 7 | | torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | 8 | | torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | 9 | | torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | 10 | | torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | 11 | | torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | 12 | | torchvision.models.densenet.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | 13 | | torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | 14 | | torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | 15 | | torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | 16 | | torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | 17 | | torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | 18 | | torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | 19 | | torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | 20 | | torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | 21 | 
-------------------------------------------------------------------------------- /docs/benchmarks/jetson_nano.md: -------------------------------------------------------------------------------- 1 | # Jetson Nano 2 | 3 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 4 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 5 | | torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | 6 | | torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | 7 | | torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | 8 | | torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | 9 | | torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | 10 | | torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | 11 | | torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | 12 | | torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | 13 | | torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | 14 | | torchvision.models.densenet.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | 15 | | torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | 16 | | torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | 17 | | torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | 18 | | torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | 19 | | torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | 20 | | torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | 21 | | torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | 22 | | torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | 23 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT_FILE=$1 4 | 5 | touch $OUTPUT_FILE 6 | 7 | echo "| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) |" >> $OUTPUT_FILE 8 | echo 
"|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------|" >> $OUTPUT_FILE 9 | 10 | python3 -m torch2trt.test -o $OUTPUT_FILE --name alexnet --include=torch2trt.tests.torchvision.classification 11 | python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_0 --include=torch2trt.tests.torchvision.classification 12 | python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_1 --include=torch2trt.tests.torchvision.classification 13 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet18 --include=torch2trt.tests.torchvision.classification 14 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet34 --include=torch2trt.tests.torchvision.classification 15 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet50 --include=torch2trt.tests.torchvision.classification 16 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet101 --include=torch2trt.tests.torchvision.classification 17 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet152 --include=torch2trt.tests.torchvision.classification 18 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet121 --include=torch2trt.tests.torchvision.classification 19 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet169 --include=torch2trt.tests.torchvision.classification 20 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet201 --include=torch2trt.tests.torchvision.classification 21 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet161 --include=torch2trt.tests.torchvision.classification 22 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11$ --include=torch2trt.tests.torchvision.classification 23 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13$ --include=torch2trt.tests.torchvision.classification 24 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16$ --include=torch2trt.tests.torchvision.classification 25 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19$ --include=torch2trt.tests.torchvision.classification 26 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11_bn --include=torch2trt.tests.torchvision.classification 27 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13_bn --include=torch2trt.tests.torchvision.classification 28 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16_bn --include=torch2trt.tests.torchvision.classification 29 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19_bn --include=torch2trt.tests.torchvision.classification 30 | python3 -m torch2trt.test -o $OUTPUT_FILE --name mobilenet_v2 --include=torch2trt.tests.torchvision.classification 31 | -------------------------------------------------------------------------------- /examples/easyocr/run_end2end.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import cv2 3 | import torch 4 | import glob 5 | from easyocr import Reader 6 | from torch2trt.dataset import FolderDataset 7 | from torch2trt import torch2trt, TRTModule 8 | import math 9 | import time 10 | import os 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--images', type=str, default='images') 14 | parser.add_argument('--detector_trt', type=str, default='detector_trt.pth') 15 | parser.add_argument('--recognizer_trt', type=str, default='recognizer_trt.pth') 16 | parser.add_argument('--max_image_area', type=int, default=1280*720) 17 | parser.add_argument('--count', type=int, default=None) 18 | parser.add_argument('--recognizer_batch_size', 
type=int, default=1) 19 | args = parser.parse_args() 20 | 21 | 22 | def shrink_to_area(image, area): 23 | height = image.shape[0] 24 | width = image.shape[1] 25 | 26 | if height * width > area: 27 | ar = width / height 28 | new_height = math.sqrt(area / ar) 29 | new_width = ar * new_height 30 | new_height = math.floor(new_height) 31 | new_width = math.floor(new_width) 32 | print(f'Resizing {width}x{height} to {new_width}x{new_height}') 33 | image = cv2.resize(image, (new_width, new_height)) 34 | 35 | return image 36 | 37 | image_paths = glob.glob(os.path.join(args.images, '*.jpg')) 38 | 39 | def profile_reader(reader): 40 | 41 | cumulative_execution_time = 0 42 | 43 | if args.count is None: 44 | count = len(image_paths) 45 | else: 46 | count = args.count 47 | 48 | for i in range(count): 49 | 50 | path = image_paths[i % len(image_paths)] 51 | image = cv2.imread(path) 52 | 53 | image = shrink_to_area(image, args.max_image_area) 54 | 55 | t0 = time.monotonic() 56 | reader.readtext(image, batch_size=args.recognizer_batch_size) 57 | t1 = time.monotonic() 58 | 59 | cumulative_execution_time += (t1 - t0) 60 | 61 | return count / cumulative_execution_time 62 | 63 | 64 | reader = Reader(['en']) 65 | 66 | detector_trt = TRTModule() 67 | detector_trt.load_state_dict(torch.load(args.detector_trt)) 68 | 69 | recognizer_trt = TRTModule() 70 | recognizer_trt.load_state_dict(torch.load(args.recognizer_trt)) 71 | 72 | test_image = shrink_to_area(cv2.imread(image_paths[0]), args.max_image_area) 73 | 74 | print('Dumping torch output...') 75 | print(reader.readtext(test_image, batch_size=args.recognizer_batch_size)) 76 | 77 | print('Profiling torch...') 78 | fps_torch = profile_reader(reader) 79 | 80 | reader.detector.module = detector_trt 81 | reader.recognizer.module = recognizer_trt 82 | 83 | 84 | print('Dumping TensorRT output...') 85 | print(reader.readtext(test_image, batch_size=args.recognizer_batch_size)) 86 | 87 | print('Profiling torch...') 88 | fps_trt = profile_reader(reader) 89 | 90 | 91 | print(f'FPS Torch: {fps_torch}') 92 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /tests/feature_tests/test_dataset_calibrator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorrt as trt 3 | import torch 4 | import torch.nn as nn 5 | from torch2trt.dataset import ( 6 | TensorBatchDataset, 7 | ListDataset 8 | ) 9 | from torch2trt import torch2trt 10 | 11 | 12 | def test_dataset_calibrator_batch_dataset(): 13 | 14 | torch.manual_seed(0) 15 | 16 | 17 | class TestModule(nn.Module): 18 | def __init__(self): 19 | super().__init__() 20 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 21 | 22 | def forward(self, x, y): 23 | a = self.conv(x) 24 | b = self.conv(y) 25 | return torch.cat([a, b], dim=0) 26 | 27 | inputs = [ 28 | torch.randn(1, 3, 32, 32).cuda(), 29 | torch.randn(1, 3, 32, 32).cuda() 30 | ] 31 | 32 | module = TestModule().cuda().eval() 33 | 34 | dataset = TensorBatchDataset() 35 | 36 | with dataset.record(module): 37 | for i in range(50): 38 | module(*inputs) 39 | 40 | module_trt = torch2trt( 41 | module, 42 | dataset[0], 43 | int8_mode=True, 44 | int8_calib_dataset=dataset, 45 | log_level=trt.Logger.INFO 46 | ) 47 | 48 | inputs = [ 49 | torch.randn(1, 3, 32, 32).cuda(), 50 | torch.randn(1, 3, 32, 32).cuda() 51 | ] 52 | output = module(*inputs) 53 | output_trt = module_trt(*inputs) 54 | 55 | assert(torch.allclose(output, output_trt, 
rtol=1e-3, atol=1e-3)) 56 | 57 | 58 | def test_dataset_calibrator_list_dataset(): 59 | 60 | torch.manual_seed(0) 61 | 62 | 63 | class TestModule(nn.Module): 64 | def __init__(self): 65 | super().__init__() 66 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 67 | 68 | def forward(self, x, y): 69 | a = self.conv(x) 70 | b = self.conv(y) 71 | return torch.cat([a, b], dim=0) 72 | 73 | inputs = [ 74 | torch.randn(1, 3, 32, 32).cuda(), 75 | torch.randn(1, 3, 32, 32).cuda() 76 | ] 77 | 78 | module = TestModule().cuda().eval() 79 | 80 | dataset = ListDataset() 81 | 82 | with dataset.record(module): 83 | for i in range(50): 84 | module(*inputs) 85 | 86 | module_trt = torch2trt( 87 | module, 88 | dataset[0], 89 | int8_mode=True, 90 | int8_calib_dataset=dataset, 91 | log_level=trt.Logger.INFO 92 | ) 93 | 94 | inputs = [ 95 | torch.randn(1, 3, 32, 32).cuda(), 96 | torch.randn(1, 3, 32, 32).cuda() 97 | ] 98 | output = module(*inputs) 99 | output_trt = module_trt(*inputs) 100 | 101 | assert(torch.allclose(output, output_trt, rtol=1e-3, atol=1e-3)) 102 | 103 | 104 | if __name__ == '__main__': 105 | test_dataset_calibrator_list_dataset() -------------------------------------------------------------------------------- /docs/see_also.md: -------------------------------------------------------------------------------- 1 | # See Also 2 | 3 | !!! note 4 | 5 | The state of these converters may change over time. We provide this information here with the hope that it will help shed light on the landscape of tools available for optimizing PyTorch models with TensorRT. 6 | If you find this information helpful or outdated / misleading, please let us know. 7 | 8 | In addition to torch2trt, there are other workflows for optimizing your PyTorch model with TensorRT. 9 | 10 | The other converters we are aware of are 11 | 12 | * [ONNX to TensorRT](https://github.com/onnx/onnx-tensorrt) 13 | 14 | !!! tip 15 | 16 | Since the ONNX parser ships with TensorRT, we have included a convenience method for using this 17 | workflow with torch2trt. If you want to quickly try the ONNX method using the torch2trt interface, just call ``torch2trt(..., use_onnx=True)``. 18 | This will perform conversion on the module by exporting the model using PyTorch's JIT tracer, 19 | and parsing with TensorRT's ONNX parser. 20 | 21 | * [TRTorch](https://github.com/NVIDIA/TRTorch) 22 | 23 | Which one you use depends largely on your use case. The differences often come down to 24 | 25 | ## Layer support 26 | 27 | Modern deep learning frameworks are large, and there often arise 28 | caveats converting between frameworks using a given workflow. These could include 29 | limitations in serialization or parsing formats. Or in some instances, it may be possible 30 | the layer could be supported, but it has just not been done yet. TRTorch is strong 31 | in the sense that it will default to the original PyTorch method for layers 32 | which are not converted to TensorRT. The best way to know 33 | which conversion method works for you is to try converting your model. 34 | 35 | ## Feature support 36 | 37 | TensorRT is evolving and the conversion workflows may have varying level 38 | of feature support. In some instances, you may wish to use a latest feature of TensorRT, like dynamic shapes, 39 | but it is not supported in torch2trt or the interface has not yet been exposed. In this 40 | instance, we recommend checking to see if it is supported by one of the other workflows. 
The ONNX 41 | converter is typically strong in this regard, since the parser is distributed with TensorRT. 42 | 43 | !!! note 44 | 45 | If there is a TensorRT feature you wish to see in torch2trt, please let us know. We cannot guarantee this will be done, but it helps us gauge interest. 46 | 47 | ## Extensibility / Ease of Use 48 | 49 | In case none of the converters satisfies your use case, you may find it necessary to adapt 50 | the converter to fit your needs. This is very intuitive with torch2trt, 51 | since it is done inline with Python, and there are many [examples](converters.md) to reference. If you know 52 | how the original PyTorch method works, and have the TensorRT Python API on hand, it is relatively straightforward to adapt torch2trt to your needs. 53 | The extensibility is often helpful when you want to implement a converter that is specific to the 54 | context the layer appears in. 55 | 56 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/README.md: -------------------------------------------------------------------------------- 1 | ## QAT working example 2 | 3 | This example uses the QAT library open-sourced by NVIDIA. [Github link](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization) 4 | 5 | ## Directory overview 6 | 7 | 1. This directory contains 8 | 1. `dataset` : contains code for the CIFAR-10 dataset 9 | 2. `layers` : contains the implementation used for inference. More details under `layers/README.md` 10 | 3. `models`: contains two models: `resnet18` and `vanilla_cnn` 11 | 4. `utils` : contains various utility functions for loading state dicts, a custom wrapper for training and inference, and calculating accuracy during training 12 | 5. `train.py` and `infer.py` : contain the code for training and inference (including TRT conversion) 13 | 14 | 2. The NVIDIA quantization library does not usually provide per-layer control over quantization. The custom wrapper under `utils/utilities.py` lets us quantize selected layers in our model. 15 | 16 | ## Environment 17 | 18 | **Filename** : pytorch_ngc_container_20.09 19 | 20 | ``` 21 | FROM nvcr.io/nvidia/pytorch:20.09-py3 22 | RUN apt-get update && apt-get install -y software-properties-common && apt-get update 23 | RUN add-apt-repository ppa:git-core/ppa && \ 24 | apt install -y git 25 | 26 | RUN pip install termcolor graphviz 27 | 28 | RUN git clone https://github.com/NVIDIA-AI-IOT/torch2trt.git /sw/torch2trt/ && \ 29 | cd /sw/torch2trt/scripts && \ 30 | bash build_contrib.sh 31 | 32 | ``` 33 | 34 | Docker build: `docker build -f pytorch_ngc_container_20.09 -t pytorch_ngc_container_20.09 .` 35 | 36 | `docker_image=pytorch_ngc_container_20.09` 37 | 38 | Docker run : `docker run -e NVIDIA_VISIBLE_DEVICES=0 --gpus 0 -it --shm-size=1g --ulimit memlock=-1 --rm -v $PWD:/workspace/work $docker_image` 39 | 40 | **Important Notes** : 41 | 42 | - Sparse checkout helps us check out only part of the GitHub repo. 43 | - The patch file can be found under `examples/quantization_aware_training/utils` 44 | 45 | ## Workflow 46 | 47 | The workflow consists of three parts. 48 | 1. Train without quantization: 49 | 50 | Here, pretrained ImageNet weights are used. 51 | 52 | `python train.py --m resnet34-tl / resnet18-tl --num_epochs 45 --test_trt --FP16 --INT8PTC` 53 | 54 | 2.
Train with quantization (weights are mapped using a custom function to make sure that each weight is loaded correctly) 55 | 56 | `python train.py --m resnet34/ resnet18 --netqat --partial_ckpt --tl --load_ckpt /tmp/pytorch_exp/{} --num_epochs 25 --lr 1e-4 --lrdt 10` 57 | 58 | 3. Infer with and without TRT 59 | 60 | `python infer.py --m resnet34/resnet18 --load_ckpt /tmp/pytorch_exp_1/ckpt_{} --netqat --INT8QAT` 61 | 62 | 63 | ## Accuracy Results 64 | 65 | | Model | FP32 | FP16 | INT8 (QAT) | INT(PTC) | 66 | |-------|------|------|------------|----------| 67 | | Resnet18 | 83.08 | 83.12 | 83.12 | 83.06 | 68 | | Resnet34 | 84.65 | 84.65 | 83.26 | 84.5 | 69 | 70 | 71 | **Please note that the idea behind these experiments is to see if TRT conversion is working properly rather than achieving industry standard accuracy results** 72 | 73 | ## Future Work 74 | 75 | - Add results for Resnet50, EfficientNet and Mobilenet 76 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/infer.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import torchvision 6 | import argparse 7 | import os,sys 8 | from datasets.cifar10 import Cifar10Loaders 9 | from utils.utilities import calculate_accuracy, timeGraph,printStats 10 | from models.resnet import resnet18,resnet34 11 | from parser import parse_args 12 | from torch2trt import torch2trt 13 | import tensorrt as trt 14 | torch.set_printoptions(precision=5) 15 | 16 | def main(): 17 | args = parse_args() 18 | 19 | args.cuda = not args.no_cuda and torch.cuda.is_available() 20 | torch.manual_seed(78543) 21 | 22 | if args.cuda: 23 | torch.backends.cudnn.benchmark = True 24 | torch.cuda.manual_seed(args.seed) 25 | 26 | loaders = Cifar10Loaders() 27 | train_loader = loaders.train_loader() 28 | test_loader = loaders.test_loader() 29 | 30 | if args.m == "resnet18": 31 | if args.netqat: 32 | model=resnet18(qat_mode=True,infer=True) 33 | else: 34 | model=resnet18() 35 | elif args.m == "resnet34": 36 | if args.netqat: 37 | model=resnet34(qat_mode=True,infer=True) 38 | else: 39 | model=resnet34() 40 | else: 41 | raise NotImplementedError("{} model not found".format(args.m)) 42 | 43 | 44 | model = model.cuda().eval() 45 | 46 | if args.load_ckpt: 47 | checkpoint = torch.load(args.load_ckpt) 48 | if not args.netqat: 49 | checkpoint = mapping_names_resnets(checkpoint) 50 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 51 | print("===>>> Checkpoint loaded successfully from {} ".format(args.load_ckpt)) 52 | 53 | test_accuracy = calculate_accuracy(model,test_loader) 54 | print(" Test accuracy for Pytorch model: {0} ".format(test_accuracy)) 55 | rand_in = torch.randn([128,3,32,32],dtype=torch.float32).cuda() 56 | 57 | #Converting the model to TRT 58 | if args.FP16: 59 | trt_model_fp16 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,max_batch_size=128) 60 | test_accuracy = calculate_accuracy(trt_model_fp16,test_loader) 61 | print(" TRT test accuracy at FP16: {0}".format(test_accuracy)) 62 | 63 | if args.INT8QAT: 64 | trt_model_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_mode=True,max_batch_size=128,qat_mode=True) 65 | test_accuracy = calculate_accuracy(trt_model_int8,test_loader) 66 | print(" TRT test accuracy at INT8 QAT: {0}".format(test_accuracy)) 67 | 68 | if args.INT8PTC: 69 | ##preparing calib dataset 70 | 
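# Post-training calibration (PTC) needs a small set of representative inputs: the loop
# below collects roughly six batches of CIFAR-10 test images (sam[0] holds the image
# tensors) and passes them to torch2trt via int8_calib_dataset so TensorRT can estimate
# the INT8 scales. Unlike the QAT path above, no learned quantization ranges are used here.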
calib_dataset = list() 71 | for i, sam in enumerate(test_loader): 72 | calib_dataset.extend(sam[0]) 73 | if i == 5: 74 | break 75 | 76 | trt_model_calib_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_calib_dataset=calib_dataset,int8_mode=True,max_batch_size=128) 77 | test_accuracy = calculate_accuracy(trt_model_calib_int8,test_loader) 78 | print(" TRT test accuracy at INT8 PTC: {0}".format(test_accuracy)) 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | ## [master](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/master) 4 | 5 | - Added inference and conversion support for TensorRT 10 6 | - Removed redundant converters, and merged converters for ND convolutions, pooling, etc. 7 | - Migrated test cases to use PyTest 8 | - Added unique axis names when using ONNX to support mismatched dynamic axes (needed for Whisper) 9 | 10 | ## [v0.5.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.5.0) - 05/3/2024 11 | 12 | - Added tensor shape tracking to support dynamic shapes for flatten, squeeze, unsqueeze, view, reshape, interpolate, and getitem methods 13 | - Added EasyOCR example 14 | - Added the ``DatasetRecorder`` context manager, making it easy to capture module inputs from a larger pipeline for calibration and shape inference 15 | - Added support for legacy max_batch_size using optimization profiles 16 | - Added support for nested tuple, dict and list module inputs and outputs via the ``Flattener`` class 17 | - Added the ability to accept a dataset as the ``inputs`` argument, and infer optimization profiles from the data 18 | - Added Dataset, TensorBatchDataset, ListDataset, and FolderDataset classes 19 | - Added support for dynamic shapes 20 | - Known limitation: Currently some converters (i.e. View) may have unexpected behavior if their arguments are defined with dynamic Tensor shapes.
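  A minimal sketch of the dataset-driven dynamic-shape workflow described above, mirroring ``tests/feature_tests/test_dynamic_shape.py`` (the convolution module is only a placeholder):

```python
import torch
import torch.nn as nn
from torch2trt import torch2trt
from torch2trt.dataset import ListDataset

module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval()

# Record example inputs that span the shape range the engine should support.
dataset = ListDataset()
dataset.insert((torch.randn(1, 3, 224, 224).cuda(),))
dataset.insert((torch.randn(4, 3, 32, 32).cuda(),))

# Passing the dataset as ``inputs`` lets torch2trt infer the min/opt/max
# optimization profiles from the recorded shapes, yielding a dynamic-shape engine.
module_trt = torch2trt(module, dataset)
```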
21 | 22 | ## [0.4.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.4.0) - 07/22/2022 23 | 24 | - Added converter for ``torch.nn.functional.group_norm`` using native TensorRT layers 25 | - Added converter for ``torch.nn.ReflectionPad2d`` using plugin layer 26 | - Added torch2trt_plugins library 27 | - Added support for Deep Learning Accelerator (DLA) 28 | - Added support for explicit batch 29 | - Added support for TensorRT 8 30 | 31 | ## [0.3.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.3.0) - 07/15/2021 32 | 33 | - Added converter for ``torch.nn.functional.adaptive_avg_pool3d`` 34 | - Added converter for ``torch.nn.functional.adaptive_max_pool3d`` 35 | - Added converter for ``torch.maxpool3d`` and ``torch.nn.functional.max_pool3d`` 36 | - Added Quantization Aware Training (QAT) workflow to contrib 37 | - Added converter for ``torch.roll`` 38 | - Added converter for ``torch.nn.functional.layer_norm`` 39 | - Added converter for ``torch.nn.functional.gelu`` 40 | - Added converter for ``torch.nn.functional.linear`` 41 | - Added converter for ``torch.nn.functional.silu`` 42 | 43 | ## [0.2.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.2.0) - 03/02/2021 44 | 45 | - Added converter for ``torch.Tensor.flatten`` 46 | - Added converter for ``torch.nn.functional.conv2d`` and ``torch.nn.functional.conv3d`` 47 | - Added converter for ``torch.Tensor.expand`` 48 | - Added support for custom converters for methods defined outside of ``torch`` module 49 | - Added names for TensorRT layers 50 | - Added GroupNorm plugin which internally uses PyTorch aten::group_norm 51 | - Replaced Tensor.ndim references with len(tensor.shape) to support older pytorch versions 52 | - Added reduced precision documentation page 53 | - Added converters for ``floordiv``, ``mod``, ``ne``, and ``torch.tensor`` operations 54 | - Extended ``relu`` converter to support ``Tensor.relu`` operation 55 | - Extended ``sigmoid`` converter to support ``Tensor.sigmoid`` operation 56 | -------------------------------------------------------------------------------- /examples/easyocr/optimize_recognizer.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from torch2trt.dataset import FolderDataset 3 | from torch2trt import torch2trt, TRTModule 4 | from easyocr import Reader 5 | import tensorrt as trt 6 | import torch 7 | import time 8 | from tempfile import mkdtemp 9 | 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument('--detector_data', type=str, default='detector_data') 13 | parser.add_argument('--recognizer_data', type=str, default='recognizer_data') 14 | parser.add_argument('--output', type=str, default='recognizer_trt.pth') 15 | parser.add_argument('--int8', action='store_true') 16 | parser.add_argument('--fp16', action='store_true') 17 | parser.add_argument('--max_workspace_size', type=int, default=1<<28) 18 | args = parser.parse_args() 19 | 20 | detector_dataset = FolderDataset(args.detector_data) 21 | recognizer_dataset = FolderDataset(args.recognizer_data) 22 | 23 | if len(detector_dataset) == 0: 24 | raise ValueError('Detector dataset is empty, make sure to run generate_data.py first.') 25 | 26 | if len(recognizer_dataset) == 0: 27 | raise ValueError('Recognizer dataset is empty, make sure to run generate_data.py first.') 28 | 29 | 30 | if args.int8: 31 | num_calib = 200 32 | calib_dataset = FolderDataset(mkdtemp()) 33 | for i in range(num_calib): 34 | calib_dataset.insert(tuple([t.float() + 0.2 * torch.randn_like(t.float()) for 
t in recognizer_dataset[i % len(recognizer_dataset)]])) 35 | 36 | reader = Reader(['en']) 37 | module_torch = reader.detector.module 38 | 39 | max_shapes = list(recognizer_dataset.max_shapes()) 40 | 41 | # override default max shape to use full image width 42 | max_shapes[0] = torch.Size(( 43 | recognizer_dataset.max_shapes()[0][0], 44 | recognizer_dataset.max_shapes()[0][1], 45 | recognizer_dataset.max_shapes()[0][2], 46 | detector_dataset.max_shapes()[0][3] 47 | )) 48 | max_shapes = tuple(max_shapes) 49 | 50 | class PoolFix(torch.nn.Module): 51 | def forward(self, x): 52 | return torch.mean(x, dim=-1, keepdim=True) 53 | 54 | if isinstance(reader.recognizer.module.AdaptiveAvgPool, torch.nn.AdaptiveAvgPool2d): 55 | reader.recognizer.module.AdaptiveAvgPool = PoolFix() 56 | 57 | recognizer_torch = reader.recognizer.module 58 | 59 | print('Running torch2trt...') 60 | recognizer_trt = torch2trt( 61 | reader.recognizer.module, 62 | recognizer_dataset, 63 | max_shapes=max_shapes, 64 | use_onnx=True, # LSTM currently only implemented in ONNX workflow 65 | fp16_mode=args.fp16, 66 | int8_mode=args.int8, 67 | max_workspace_size=args.max_workspace_size, 68 | log_level=trt.Logger.VERBOSE 69 | ) 70 | 71 | # recognizer_trt.ignore_inputs = [1] 72 | 73 | torch.save(recognizer_trt.state_dict(), args.output) 74 | 75 | def profile_module(module, dataset, count=None): 76 | 77 | if count is None: 78 | count = len(dataset) 79 | 80 | output = module(*dataset[0]) # warmup 81 | 82 | torch.cuda.current_stream().synchronize() 83 | t0 = time.monotonic() 84 | for i in range(count): 85 | output = module(*dataset[i % len(dataset)]) 86 | torch.cuda.current_stream().synchronize() 87 | t1 = time.monotonic() 88 | 89 | return count / (t1 - t0) 90 | 91 | print('Profiling PyTorch...') 92 | fps_torch = profile_module(recognizer_torch, recognizer_dataset, 50) 93 | print(f'FPS Torch: {fps_torch}') 94 | 95 | print('Profiling TensorRT') 96 | fps_trt = profile_module(recognizer_trt, recognizer_dataset, 30) 97 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /torch2trt/flattener.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | 4 | 5 | def _default_condition(x): 6 | return isinstance(x, torch.Tensor) and (x.dtype is torch.half or x.dtype is torch.float or x.dtype == torch.bool) 7 | 8 | 9 | def _make_schema_from_value(value, condition=_default_condition, size=0): 10 | if condition(value): 11 | return size, size + 1 12 | elif isinstance(value, list) or isinstance(value, tuple): 13 | schema = [] 14 | for child_value in value: 15 | child_schema, size = _make_schema_from_value(child_value, condition, size) 16 | schema.append(child_schema) 17 | if isinstance(value, tuple): 18 | schema = tuple(schema) 19 | return schema, size 20 | elif isinstance(value, dict): 21 | schema = {} 22 | for child_key in sorted(value.keys()): 23 | child_value = value[child_key] 24 | child_schema, size = _make_schema_from_value(child_value, condition, size) 25 | schema[child_key] = child_schema 26 | return schema, size 27 | else: 28 | return None, size 29 | 30 | 31 | class Flattener(object): 32 | 33 | def __init__(self, schema, size): 34 | self._schema = schema 35 | self._size = size 36 | 37 | @staticmethod 38 | def from_value(value, condition=_default_condition): 39 | return Flattener(*_make_schema_from_value(value, condition)) 40 | 41 | @staticmethod 42 | def from_dict(x): 43 | return Flattener(x['schema'], x['size']) 44 | 
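    # Usage sketch (illustrative, mirroring the behavior exercised in
    # tests/feature_tests/test_flattener.py): a Flattener maps a nested
    # structure of tensors (tuples/lists/dicts) to a flat list and back.
    #
    #   import torch
    #   value = {'image': torch.ones(1, 3, 4, 4), 'meta': (torch.ones(2), torch.ones(3))}
    #   flattener = Flattener.from_value(value)
    #   flat = flattener.flatten(value)        # [image, meta[0], meta[1]] -- dict keys visited in sorted order
    #   restored = flattener.unflatten(flat)   # same nested structure, containing the same tensor objects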
45 | def dict(self): 46 | return {'schema': self.schema, 'size': self.size} 47 | 48 | @property 49 | def schema(self): 50 | return self._schema 51 | 52 | @property 53 | def size(self): 54 | return self._size 55 | 56 | def __len__(self): 57 | return self._size 58 | 59 | def _flatten(self, value, result): 60 | if isinstance(self._schema, int): 61 | result[self._schema] = value 62 | elif isinstance(self._schema, list) or isinstance(self._schema, tuple): 63 | for child_value, child_schema in zip(value, self._schema): 64 | Flattener(child_schema, self.size)._flatten(child_value, result) 65 | elif isinstance(self._schema, dict): 66 | for key in sorted(self._schema.keys()): 67 | child_value = value[key] 68 | child_schema = self._schema[key] 69 | Flattener(child_schema, self.size)._flatten(child_value, result) 70 | 71 | def flatten(self, value): 72 | result = [None for i in range(self.size)] 73 | self._flatten(value, result) 74 | return result 75 | 76 | def unflatten(self, flattened): 77 | if isinstance(self._schema, int): 78 | return flattened[self._schema] 79 | elif isinstance(self._schema, list) or isinstance(self._schema, tuple): 80 | result = [] 81 | for child_schema in self._schema: 82 | result.append(Flattener(child_schema, self.size).unflatten(flattened)) 83 | if isinstance(self._schema, tuple): 84 | result = tuple(result) 85 | return result 86 | elif isinstance(self._schema, dict): 87 | result = {} 88 | for child_key in sorted(self._schema.keys()): 89 | child_schema = self._schema[child_key] 90 | result[child_key] = Flattener(child_schema, self.size).unflatten(flattened) 91 | return result 92 | else: 93 | return None -------------------------------------------------------------------------------- /plugins/src/example_plugin.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCH2TRT_PLUGIN_EXAMPLE 2 | #define TORCH2TRT_PLUGIN_EXAMPLE 3 | 4 | 5 | #include "NvInfer.h" 6 | #include "NvInferPlugin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #define EXAMPLE_PLUGIN_NAME "ExamplePlugin" 12 | #define EXAMPLE_PLUGIN_VERSION "1" 13 | 14 | 15 | using namespace nvinfer1; 16 | 17 | 18 | namespace torch2trt_plugins { 19 | 20 | 21 | template 22 | void exampleFuncton(T *x, T *y, float scale, int size, cudaStream_t stream=0); 23 | void exampleFunctonHalf(__half *x, __half *y, float scale, int size, cudaStream_t stream=0); 24 | 25 | 26 | class ExamplePlugin : public IPluginV2Ext { 27 | public: 28 | int32_t inputSize; 29 | DataType dataType; 30 | float scale; 31 | std::string pluginNamespace; 32 | 33 | ExamplePlugin(float scale=2.0); 34 | ExamplePlugin(float scale, int32_t inputSize, DataType dataType); 35 | ExamplePlugin(void const* serialData, size_t serialLength); 36 | ~ExamplePlugin(); 37 | 38 | /* IPluginV2 methods */ 39 | 40 | AsciiChar const* getPluginType() const noexcept override; 41 | 42 | AsciiChar const* getPluginVersion() const noexcept override; 43 | 44 | int32_t getNbOutputs() const noexcept override; 45 | 46 | Dims getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept override; 47 | 48 | bool supportsFormat(DataType type, PluginFormat format) const noexcept; 49 | 50 | int32_t initialize() noexcept override; 51 | 52 | void terminate() noexcept override; 53 | 54 | size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override; 55 | 56 | int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, 57 | cudaStream_t stream) noexcept 58 | override; 59 | 60 | size_t 
getSerializationSize() const noexcept override; 61 | 62 | void serialize(void* buffer) const noexcept override; 63 | 64 | void destroy() noexcept override; 65 | 66 | 67 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 68 | 69 | AsciiChar const* getPluginNamespace() const noexcept override; 70 | 71 | // IPluginV2Ext methods 72 | IPluginV2Ext* clone() const noexcept override; 73 | DataType getOutputDataType(int32_t index, DataType const* inputTypes, int32_t nbInputs) const noexcept override; 74 | bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; 75 | bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override; 76 | void configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs, 77 | DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast, 78 | bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept override; 79 | }; 80 | 81 | class ExamplePluginCreator : public IPluginCreator { 82 | private: 83 | PluginFieldCollection fieldCollection; 84 | std::vector fields; 85 | std::string pluginNamespace; 86 | 87 | public: 88 | ExamplePluginCreator(); 89 | 90 | /* IPluginCreator methods */ 91 | AsciiChar const* getPluginName() const noexcept override; 92 | 93 | AsciiChar const* getPluginVersion() const noexcept override; 94 | 95 | PluginFieldCollection const* getFieldNames() noexcept override; 96 | 97 | IPluginV2* createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept override; 98 | 99 | IPluginV2* deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept override; 100 | 101 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 102 | 103 | 104 | AsciiChar const* getPluginNamespace() const noexcept override; 105 | }; 106 | 107 | } 108 | 109 | #endif -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantConv.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | from torch2trt.module_test import add_module_test 3 | import tensorrt as trt 4 | 5 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_conv.IQuantConv2d.forward', enabled=trt_version() >= '7.0') 6 | def convert_QuantConv(ctx): 7 | module = ctx.method_args[0] 8 | input = ctx.method_args[1] 9 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 10 | output = ctx.method_return 11 | 12 | input_dim = input.dim() - 2 13 | 14 | kernel_size = module.kernel_size 15 | if not isinstance(kernel_size, tuple): 16 | kernel_size = (kernel_size, ) * input_dim 17 | 18 | stride = module.stride 19 | if not isinstance(stride, tuple): 20 | stride = (stride, ) * input_dim 21 | 22 | padding = module.padding 23 | if not isinstance(padding, tuple): 24 | padding = (padding, ) * input_dim 25 | 26 | dilation = module.dilation 27 | if not isinstance(dilation, tuple): 28 | dilation = (dilation, ) * input_dim 29 | 30 | kernel = module.weight.detach().cpu().numpy() 31 | 32 | bias = None #trt.Weights(torch_dtype_to_trt(module.weight.dtype)) 33 | if module.bias is not None: 34 | bias = module.bias.detach().cpu().numpy() 35 | 36 | layer = ctx.network.add_convolution_nd( 37 | input=input_trt, 38 | num_output_maps=module.out_channels, 39 | kernel_shape=kernel_size, 40 | kernel=kernel, 41 | bias=bias) 42 | layer.stride_nd = 
stride 43 | layer.padding_nd = padding 44 | layer.dilation_nd = dilation 45 | 46 | if module.groups is not None: 47 | layer.num_groups = module.groups 48 | 49 | if 'qat_mode' in ctx.torch2trt_kwargs: 50 | #Setting dynamic range for conv 51 | w_quant_amax = module._weight_quantizer.learned_amax 52 | layer.precision = trt.int8 53 | layer.set_output_type(0,trt.int8) 54 | conv_out = layer.get_output(0) 55 | conv_out.dynamic_range=(-w_quant_amax,w_quant_amax) 56 | 57 | 58 | output._trt = layer.get_output(0) 59 | 60 | 61 | 62 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 63 | def test_Conv2d_basic_trt7(): 64 | return IQuantConv2d(10, 5, kernel_size=1, stride=1, padding=0) 65 | 66 | ''' 67 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 68 | def test_Conv2d_stride2_trt7(): 69 | return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) 70 | 71 | 72 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 73 | def test_Conv2d_kernel3_trt7(): 74 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) 75 | 76 | 77 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 78 | def test_Conv2d_dilation2_trt7(): 79 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 80 | 81 | 82 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 83 | def test_Conv3d_basic_trt7(): 84 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=1, padding=0) 85 | 86 | 87 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 88 | def test_Conv3d_stride2_trt7(): 89 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=2, padding=0) 90 | 91 | 92 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 93 | def test_Conv3d_kernel3_trt7(): 94 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=2, padding=1) 95 | 96 | 97 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 98 | def test_Conv3d_dilation2_trt7(): 99 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 100 | 101 | ''' 102 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantConvBN.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | from torch2trt.module_test import add_module_test 3 | import tensorrt as trt 4 | 5 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_conv.IQuantConvBN2d.forward', enabled=trt_version() >= '7.0') 6 | def convert_QuantConv(ctx): 7 | module = ctx.method_args[0] 8 | input = ctx.method_args[1] 9 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 10 | output = ctx.method_return 11 | 12 | input_dim = input.dim() - 2 13 | 14 | kernel_size = module.kernel_size 15 | if not isinstance(kernel_size, tuple): 16 | kernel_size = (kernel_size, ) * input_dim 17 | 18 | stride = module.stride 19 | if not isinstance(stride, tuple): 20 | stride = (stride, ) * input_dim 21 | 22 | padding = module.padding 23 | if not isinstance(padding, tuple): 24 | padding = (padding, ) * input_dim 25 | 26 | dilation = module.dilation 27 | if not isinstance(dilation, tuple): 28 | dilation = 
(dilation, ) * input_dim 29 | 30 | kernel = module.folded_weight.detach().cpu().numpy() 31 | 32 | bias = None #trt.Weights(torch_dtype_to_trt(module.weight.dtype)) 33 | if hasattr(module,'folded_bias'): 34 | bias = module.folded_bias.detach().cpu().numpy() 35 | 36 | layer = ctx.network.add_convolution_nd( 37 | input=input_trt, 38 | num_output_maps=module.out_channels, 39 | kernel_shape=kernel_size, 40 | kernel=kernel, 41 | bias=bias) 42 | layer.stride_nd = stride 43 | layer.padding_nd = padding 44 | layer.dilation_nd = dilation 45 | 46 | if module.groups is not None: 47 | layer.num_groups = module.groups 48 | 49 | if 'qat_mode' in ctx.torch2trt_kwargs: 50 | #Setting dynamic range for conv 51 | w_quant_amax = module._weight_quantizer.learned_amax 52 | layer.precision = trt.int8 53 | layer.set_output_type(0,trt.int8) 54 | conv_out = layer.get_output(0) 55 | conv_out.dynamic_range=(-w_quant_amax,w_quant_amax) 56 | 57 | 58 | output._trt = layer.get_output(0) 59 | 60 | 61 | 62 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 63 | def test_Conv2d_basic_trt7(): 64 | return IQuantConv2d(10, 5, kernel_size=1, stride=1, padding=0) 65 | 66 | ''' 67 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 68 | def test_Conv2d_stride2_trt7(): 69 | return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) 70 | 71 | 72 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 73 | def test_Conv2d_kernel3_trt7(): 74 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) 75 | 76 | 77 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 78 | def test_Conv2d_dilation2_trt7(): 79 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 80 | 81 | 82 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 83 | def test_Conv3d_basic_trt7(): 84 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=1, padding=0) 85 | 86 | 87 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 88 | def test_Conv3d_stride2_trt7(): 89 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=2, padding=0) 90 | 91 | 92 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 93 | def test_Conv3d_kernel3_trt7(): 94 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=2, padding=1) 95 | 96 | 97 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 98 | def test_Conv3d_dilation2_trt7(): 99 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 100 | 101 | ''' 102 | -------------------------------------------------------------------------------- /plugins/src/reflection_pad_2d_plugin.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCH2TRT_PLUGIN_EXAMPLE 2 | #define TORCH2TRT_PLUGIN_EXAMPLE 3 | 4 | 5 | #include "NvInfer.h" 6 | #include "NvInferPlugin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #define REFLECTION_PAD_2D_PLUGIN_NAME "ReflectionPad2dPlugin" 12 | #define REFLECTION_PAD_2D_PLUGIN_VERSION "1" 13 | 14 | 15 | using namespace nvinfer1; 16 | 17 | 18 | namespace torch2trt_plugins { 19 | 20 | 21 | template 22 | void reflectionPad2dFunction( 23 | T *x, T *y, 24 | int N, int C, 
int H, int W, 25 | int paddingLeft, int paddingRight, int paddingTop, int paddingBottom, 26 | cudaStream_t stream=0); 27 | 28 | 29 | class ReflectionPad2dPlugin : public IPluginV2Ext { 30 | public: 31 | int32_t outputSize; 32 | DataType dataType; 33 | int32_t paddingLeft; 34 | int32_t paddingRight; 35 | int32_t paddingTop; 36 | int32_t paddingBottom; 37 | std::string pluginNamespace; 38 | Dims3 outputDims; 39 | 40 | ReflectionPad2dPlugin(int32_t paddingLeft, int32_t paddingRight, int32_t paddingTop, int32_t paddingBottom); 41 | ~ReflectionPad2dPlugin(); 42 | 43 | // IPluginV2 methods 44 | 45 | AsciiChar const* getPluginType() const noexcept override; 46 | 47 | AsciiChar const* getPluginVersion() const noexcept override; 48 | 49 | int32_t getNbOutputs() const noexcept override; 50 | 51 | Dims getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept override; 52 | 53 | bool supportsFormat(DataType type, PluginFormat format) const noexcept; 54 | 55 | int32_t initialize() noexcept override; 56 | 57 | void terminate() noexcept override; 58 | 59 | size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override; 60 | 61 | int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, 62 | cudaStream_t stream) noexcept 63 | override; 64 | 65 | size_t getSerializationSize() const noexcept override; 66 | 67 | void serialize(void* buffer) const noexcept override; 68 | 69 | void destroy() noexcept override; 70 | 71 | IPluginV2Ext* clone() const noexcept override; 72 | 73 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 74 | 75 | AsciiChar const* getPluginNamespace() const noexcept override; 76 | 77 | // IPluginV2Ext methods 78 | DataType getOutputDataType(int32_t index, DataType const* inputTypes, int32_t nbInputs) const noexcept override; 79 | bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; 80 | bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override; 81 | void configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs, 82 | DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast, 83 | bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept override; 84 | }; 85 | 86 | class ReflectionPad2dPluginCreator : public IPluginCreator { 87 | private: 88 | PluginFieldCollection fieldCollection; 89 | std::vector fields; 90 | std::string pluginNamespace; 91 | 92 | public: 93 | ReflectionPad2dPluginCreator(); 94 | 95 | /* IPluginCreator methods */ 96 | AsciiChar const* getPluginName() const noexcept override; 97 | 98 | AsciiChar const* getPluginVersion() const noexcept override; 99 | 100 | PluginFieldCollection const* getFieldNames() noexcept override; 101 | 102 | IPluginV2* createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept override; 103 | 104 | IPluginV2* deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept override; 105 | 106 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 107 | 108 | 109 | AsciiChar const* getPluginNamespace() const noexcept override; 110 | }; 111 | 112 | } 113 | 114 | #endif -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Forms of 
contribution 4 | 5 | ### Submit an Issue 6 | 7 | torch2trt is use case driven. We originally created it to solve 8 | use cases related to NVIDIA Jetson, but the layer support has grown 9 | significantly since its release and we've found that it has 10 | helped many other developers as well. 11 | 12 | The growth of torch2trt has been largely driven by issues submitted on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). 13 | We learn a lot from the reported issues. Submitting an issue is one of the best ways to begin contributing to torch2trt. 14 | 15 | The reported issues are typically one of the following: 16 | 17 | * A bug or unexpected result 18 | * A model with unsupported layers 19 | 20 | If you report an issue, we typically find the following information helpful: 21 | 22 | * PyTorch version 23 | * TensorRT version 24 | * Platform (e.g. Jetson Nano) 25 | * The PyTorch Module you're attempting to convert 26 | * The steps taken to convert the PyTorch module 27 | 28 | If you're not sure how to provide any of these pieces of information, don't worry. Just open the issue 29 | and we're happy to discuss and help work out the details. 30 | 31 | ### Ask a Question 32 | 33 | Another great way to contribute is to ask a question on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). 34 | There are often other developers who share your question, and they may find the discussion helpful. This also 35 | helps us gauge feature interest and identify gaps in documentation. 36 | 37 | ### Submit a Pull Request 38 | 39 | torch2trt is use case driven and has limited maintenance; for this reason, we value community contributions greatly. 40 | Another great way to contribute is by submitting a pull request. The pull requests most likely to be accepted are: 41 | 42 | * A new converter 43 | * A test case 44 | * A bug fix 45 | 46 | If you add a new converter, it is best to include a few test 47 | cases that cross validate the converter against the original PyTorch. We provide a utility function to do this, 48 | as described in the [Custom Converter](usage/custom_converter.md) usage guide. 49 | 50 | Ideally, pull requests solve one thing at a time. This makes it easy 51 | to evaluate the impact that the changes have on the project step-by-step. The more confident we are that 52 | the changes will not adversely impact the experience of other developers, the more likely we are to accept them. 53 | 54 | ## Running module test cases 55 | 56 | Before any change is accepted, we run the test cases on at least one platform. This performs a large number 57 | of cross-validation checks against PyTorch. To do this, run 58 | 59 | ```bash 60 | python3 -m torch2trt.test --name=converters --tolerance=1e-2 61 | ``` 62 | 63 | This will not hard-fail, but it will highlight any build errors or max-error check failures. It is helpful if you include 64 | the status of this command in any pull request, as well as system information like 65 | 66 | * PyTorch version 67 | * TensorRT version 68 | * Platform (e.g. Jetson Nano) 69 | 70 | ## Testing documentation 71 | 72 | If you have a change that modifies the documentation, it is relatively straightforward to test. We 73 | use ``mkdocs-material`` for documentation, which parses markdown files in the ``docs`` folder. 74 | 75 | To view the docs, simply call 76 | 77 | ``` 78 | ./scripts/test_docs.sh 79 | ``` 80 | 81 | And then navigate to ``https://<ip_address>:8000``. 82 | 83 | Please note, this will not include dynamically generated documentation pages like the converters page. 
84 | These contain cross reference links to the GitHub source code. If you want to test these 85 | you can call 86 | 87 | ```bash 88 | ./scripts/build_docs.sh 89 | ``` 90 | 91 | Pointing to the public reflection 92 | of your local repository. For example, if we're working off the upstream master branch, we 93 | would call 94 | 95 | ```bash 96 | ./scripts/build_docs.sh https://github.com/NVIDIA-AI-IOT/torch2trt master 97 | ``` 98 | 99 | If your changes are pushed to your fork, you would do 100 | 101 | ```bash 102 | ./scripts/build_docs.sh https://github.com//torch2trt my_branch 103 | ``` 104 | 105 | -------------------------------------------------------------------------------- /tests/feature_tests/test_flattener.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt.flattener import Flattener 4 | 5 | 6 | def test_flattener_from_value(): 7 | 8 | x = (torch.ones(3), torch.ones(3)) 9 | 10 | flattener = Flattener.from_value(x) 11 | 12 | assert(isinstance(flattener.schema, tuple)) 13 | assert(flattener.schema[0] == 0) 14 | assert(flattener.schema[1] == 1) 15 | 16 | 17 | def test_flattener_tuple(): 18 | 19 | x = (torch.ones(3), torch.ones(3)) 20 | 21 | flattener = Flattener.from_value(x) 22 | 23 | y = flattener.flatten(x) 24 | 25 | assert(len(y) == len(x)) 26 | assert(y[0] is x[0]) 27 | assert(y[1] is x[1]) 28 | 29 | z = flattener.unflatten(y) 30 | 31 | assert(isinstance(z, tuple)) 32 | assert(z[0] is x[0]) 33 | assert(z[1] is x[1]) 34 | 35 | 36 | def test_flattener_list(): 37 | 38 | x = [torch.ones(3), torch.ones(3)] 39 | 40 | flattener = Flattener.from_value(x) 41 | 42 | y = flattener.flatten(x) 43 | 44 | assert(len(y) == len(x)) 45 | assert(y[0] is x[0]) 46 | assert(y[1] is x[1]) 47 | 48 | z = flattener.unflatten(y) 49 | 50 | assert(isinstance(z, list)) 51 | assert(z[0] is x[0]) 52 | assert(z[1] is x[1]) 53 | 54 | 55 | def test_flattener_dict(): 56 | 57 | x = {'a': torch.ones(3), 'b': torch.ones(3)} 58 | 59 | flattener = Flattener.from_value(x) 60 | 61 | y = flattener.flatten(x) 62 | 63 | assert(len(y) == len(x)) 64 | assert((y[0] is x['a'] and y[1] is x['b']) or (y[1] is x['a'] and y[0] is x['b'])) 65 | 66 | z = flattener.unflatten(y) 67 | 68 | assert(isinstance(z, dict)) 69 | assert(z['a'] is x['a']) 70 | assert(z['b'] is x['b']) 71 | 72 | 73 | def test_flattener_nested_tuple(): 74 | 75 | x = (torch.ones(1), (torch.ones(2), torch.ones(3))) 76 | 77 | flattener = Flattener.from_value(x) 78 | 79 | y = flattener.flatten(x) 80 | 81 | assert(len(y) == 3) 82 | 83 | z = flattener.unflatten(y) 84 | 85 | assert(isinstance(z, tuple)) 86 | assert(isinstance(z[1], tuple)) 87 | assert(z[0] is x[0]) 88 | assert(z[1][0] is x[1][0]) 89 | assert(z[1][1] is x[1][1]) 90 | 91 | 92 | def test_flattener_nested_list(): 93 | 94 | x = [torch.ones(1), [torch.ones(2), torch.ones(3)]] 95 | 96 | flattener = Flattener.from_value(x) 97 | 98 | y = flattener.flatten(x) 99 | 100 | assert(len(y) == 3) 101 | 102 | z = flattener.unflatten(y) 103 | 104 | assert(isinstance(z, list)) 105 | assert(isinstance(z[1], list)) 106 | assert(z[0] is x[0]) 107 | assert(z[1][0] is x[1][0]) 108 | assert(z[1][1] is x[1][1]) 109 | assert(z[0] is x[0]) 110 | assert(z[1][0] is x[1][0]) 111 | assert(z[1][1] is x[1][1]) 112 | 113 | 114 | def test_flattener_nested_dict(): 115 | 116 | x = {'a': torch.ones(1), 'b': {'a': torch.ones(2), 'b': torch.ones(3)}} 117 | 118 | flattener = Flattener.from_value(x) 119 | 120 | y = flattener.flatten(x) 121 | 122 | assert(len(y) == 
3) 123 | 124 | z = flattener.unflatten(y) 125 | 126 | assert(isinstance(z, dict)) 127 | assert(isinstance(z['b'], dict)) 128 | assert(z['a'] is x['a']) 129 | assert(z['b']['a'] is x['b']['a']) 130 | assert(z['b']['b'] is x['b']['b']) 131 | 132 | 133 | def test_flattener_heterogeneous(): 134 | 135 | x = { 136 | 'a': (torch.ones(1), {'a': torch.ones(2)}), 137 | 'b': [torch.ones(3), torch.ones(4), (torch.ones(5), {'a': torch.ones(6)})] 138 | } 139 | 140 | flattener = Flattener.from_value(x) 141 | 142 | y = flattener.flatten(x) 143 | 144 | assert(len(y) == 6) 145 | 146 | z = flattener.unflatten(y) 147 | 148 | assert(isinstance(z, dict)) 149 | assert(isinstance(z['a'], tuple)) 150 | assert(z['a'][0] is x['a'][0]) 151 | assert(isinstance(z['a'][1], dict)) 152 | assert(z['a'][1]['a'] is x['a'][1]['a']) 153 | assert(isinstance(z['b'], list)) 154 | assert(z['b'][0] is x['b'][0]) 155 | assert(z['b'][1] is x['b'][1]) 156 | assert(isinstance(z['b'][2], tuple)) 157 | assert(z['b'][2][0] is x['b'][2][0]) 158 | assert(isinstance(z['b'][2][1], dict)) 159 | assert(z['b'][2][1]['a'] is x['b'][2][1]['a']) -------------------------------------------------------------------------------- /tests/feature_tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | from torch2trt.dataset import ( 5 | TensorBatchDataset, 6 | ListDataset, 7 | FolderDataset 8 | ) 9 | from tempfile import mkdtemp 10 | 11 | 12 | def test_dataset_shapes(): 13 | 14 | dataset = ListDataset() 15 | dataset.insert((torch.randn(1, 3, 32, 32), torch.randn(1, 4))) 16 | dataset.insert((torch.randn(1, 3, 64, 64), torch.randn(1, 8))) 17 | dataset.insert((torch.randn(1, 3, 48, 48), torch.randn(1, 6))) 18 | 19 | shapes = dataset.shapes() 20 | 21 | assert(shapes[0][0] == (1, 3, 32, 32)) 22 | assert(shapes[0][1] == (1, 3, 64, 64)) 23 | assert(shapes[1][0] == (1, 4)) 24 | assert(shapes[1][1] == (1, 8)) 25 | 26 | assert(dataset.min_shapes()[0] == (1, 3, 32, 32)) 27 | assert(dataset.min_shapes()[1] == (1, 4)) 28 | assert(dataset.max_shapes()[0] == (1, 3, 64, 64)) 29 | assert(dataset.max_shapes()[1] == (1, 8)) 30 | assert(dataset.median_numel_shapes()[0] == (1, 3, 48, 48)) 31 | assert(dataset.median_numel_shapes()[1] == (1, 6)) 32 | 33 | 34 | def test_dataset_infer_dynamic_axes(): 35 | 36 | dataset = ListDataset() 37 | dataset.insert((torch.randn(1, 3, 32, 32), torch.randn(1, 4))) 38 | dataset.insert((torch.randn(1, 3, 64, 64), torch.randn(1, 8))) 39 | dataset.insert((torch.randn(1, 3, 48, 48), torch.randn(1, 6))) 40 | 41 | dynamic_axes = dataset.infer_dynamic_axes() 42 | 43 | assert(dynamic_axes[0] == [2, 3]) 44 | assert(dynamic_axes[1] == [1]) 45 | 46 | 47 | def test_tensor_batch_dataset_record(): 48 | 49 | dataset = TensorBatchDataset() 50 | 51 | class TestModule(nn.Module): 52 | def __init__(self): 53 | super().__init__() 54 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 55 | 56 | def forward(self, x, y): 57 | a = self.conv(x) 58 | b = self.conv(y) 59 | return torch.cat([a, b], dim=0) 60 | 61 | inputs = [ 62 | torch.randn(1, 3, 32, 32).cuda(), 63 | torch.randn(1, 3, 32, 32).cuda() 64 | ] 65 | 66 | module = TestModule().cuda().eval() 67 | 68 | with dataset.record(module): 69 | for i in range(5): 70 | module(*inputs) 71 | 72 | assert(len(dataset) == 5) 73 | assert(len(dataset[0]) == 2) 74 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 75 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 76 | 77 | 78 | def 
test_list_dataset_record(): 79 | 80 | dataset = ListDataset() 81 | 82 | class TestModule(nn.Module): 83 | def __init__(self): 84 | super().__init__() 85 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 86 | 87 | def forward(self, x, y): 88 | a = self.conv(x) 89 | b = self.conv(y) 90 | return torch.cat([a, b], dim=0) 91 | 92 | inputs = [ 93 | torch.randn(1, 3, 32, 32).cuda(), 94 | torch.randn(1, 3, 32, 32).cuda() 95 | ] 96 | 97 | module = TestModule().cuda().eval() 98 | 99 | with dataset.record(module): 100 | for i in range(5): 101 | module(*inputs) 102 | 103 | assert(len(dataset) == 5) 104 | assert(len(dataset[0]) == 2) 105 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 106 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 107 | 108 | 109 | def test_folder_dataset_record(): 110 | 111 | dataset = FolderDataset(mkdtemp()) 112 | 113 | class TestModule(nn.Module): 114 | def __init__(self): 115 | super().__init__() 116 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 117 | 118 | def forward(self, x, y): 119 | a = self.conv(x) 120 | b = self.conv(y) 121 | return torch.cat([a, b], dim=0) 122 | 123 | device = torch.device('cuda:0') 124 | 125 | inputs = [ 126 | torch.randn(1, 3, 32, 32, device=device), 127 | torch.randn(1, 3, 32, 32, device=device) 128 | ] 129 | 130 | module = TestModule().to(device).eval() 131 | 132 | with dataset.record(module): 133 | for i in range(5): 134 | module(*inputs) 135 | 136 | assert(len(dataset) == 5) 137 | assert(len(dataset[0]) == 2) 138 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 139 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 140 | assert(dataset[0][0].device == device) -------------------------------------------------------------------------------- /benchmarks/JETSON_XAVIER.md: -------------------------------------------------------------------------------- 1 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 2 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 3 | | torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | 4 | | torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | 5 | | torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | 6 | | torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | 7 | | torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | 8 | | torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | 9 | | torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | 10 | | torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | 11 | | torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 
| 76.6 | 47.5 | 13 | 12 | | torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | 13 | | torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | 14 | | torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | 15 | | torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 201 | 12.1 | 5.24 | 16 | | torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | 17 | | torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | 18 | | torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | 19 | | torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | 20 | | torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | 21 | | torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | 22 | | torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | 23 | | torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | 24 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | 25 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | 26 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | 27 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | 28 | | torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | 29 | | torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | 30 | | torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | 31 | | torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | -------------------------------------------------------------------------------- /docs/benchmarks/jetson_xavier.md: -------------------------------------------------------------------------------- 1 | # Jetson Xavier 2 | 3 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 4 | 
|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 5 | | torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | 6 | | torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | 7 | | torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | 8 | | torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | 9 | | torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | 10 | | torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | 11 | | torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | 12 | | torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | 13 | | torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 | 76.6 | 47.5 | 13 | 14 | | torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | 15 | | torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | 16 | | torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | 17 | | torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 201 | 12.1 | 5.24 | 18 | | torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | 19 | | torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | 20 | | torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | 21 | | torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | 22 | | torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | 23 | | torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | 24 | | torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | 25 | | torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | 26 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | 27 | | 
torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | 28 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | 29 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | 30 | | torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | 31 | | torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | 32 | | torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | 33 | | torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | 34 | -------------------------------------------------------------------------------- /tests/model_tests/torchvision/test_classification_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch2trt 4 | 5 | 6 | def _cross_validate_module(model, shape=(224, 224)): 7 | model = model.cuda().eval() 8 | data = torch.randn(1, 3, *shape).cuda() 9 | model_trt = torch2trt.torch2trt(model, [data]) 10 | data = torch.randn(1, 3, *shape).cuda() 11 | out = model(data) 12 | out_trt = model_trt(data) 13 | assert torch.allclose(out, out_trt, rtol=1e-1, atol=1e-1) 14 | 15 | 16 | 17 | def test_alexnet(): 18 | model = torchvision.models.alexnet(pretrained=False) 19 | _cross_validate_module(model) 20 | 21 | 22 | def test_squeezenet1_0(): 23 | model = torchvision.models.squeezenet1_0(pretrained=False) 24 | _cross_validate_module(model) 25 | 26 | 27 | def test_squeezenet1_1(): 28 | model = torchvision.models.squeezenet1_1(pretrained=False) 29 | _cross_validate_module(model) 30 | 31 | 32 | def test_resnet18(): 33 | model = torchvision.models.resnet18(pretrained=False) 34 | _cross_validate_module(model) 35 | 36 | 37 | def test_resnet34(): 38 | model = torchvision.models.resnet34(pretrained=False) 39 | _cross_validate_module(model) 40 | 41 | 42 | def test_resnet50(): 43 | model = torchvision.models.resnet50(pretrained=False) 44 | _cross_validate_module(model) 45 | 46 | 47 | def test_resnet101(): 48 | model = torchvision.models.resnet101(pretrained=False) 49 | _cross_validate_module(model) 50 | 51 | 52 | def test_resnet152(): 53 | model = torchvision.models.resnet152(pretrained=False) 54 | _cross_validate_module(model) 55 | 56 | 57 | def test_densenet121(): 58 | model = torchvision.models.densenet121(pretrained=False) 59 | _cross_validate_module(model) 60 | 61 | 62 | def test_densenet169(): 63 | model = torchvision.models.densenet169(pretrained=False) 64 | _cross_validate_module(model) 65 | 66 | 67 | def test_densenet201(): 68 | model = torchvision.models.densenet201(pretrained=False) 69 | _cross_validate_module(model) 70 | 71 | 72 | def test_densenet161(): 73 | model = torchvision.models.densenet161(pretrained=False) 74 | _cross_validate_module(model) 75 | 76 | 77 | def test_vgg11(): 78 | model = torchvision.models.vgg11(pretrained=False) 79 | _cross_validate_module(model) 80 | 81 | 82 | def test_vgg13(): 83 | model = 
torchvision.models.vgg13(pretrained=False) 84 | _cross_validate_module(model) 85 | 86 | 87 | def test_vgg16(): 88 | model = torchvision.models.vgg16(pretrained=False) 89 | _cross_validate_module(model) 90 | 91 | 92 | def test_vgg19(): 93 | model = torchvision.models.vgg19(pretrained=False) 94 | _cross_validate_module(model) 95 | 96 | 97 | def test_vgg11_bn(): 98 | model = torchvision.models.vgg11_bn(pretrained=False) 99 | _cross_validate_module(model) 100 | 101 | 102 | def test_vgg13_bn(): 103 | model = torchvision.models.vgg13_bn(pretrained=False) 104 | _cross_validate_module(model) 105 | 106 | 107 | def test_vgg16_bn(): 108 | model = torchvision.models.vgg16_bn(pretrained=False) 109 | _cross_validate_module(model) 110 | 111 | 112 | def test_vgg19_bn(): 113 | model = torchvision.models.vgg19_bn(pretrained=False) 114 | _cross_validate_module(model) 115 | 116 | 117 | def mobilenet_v2(): 118 | model = torchvision.models.mobilenet_v2(pretrained=False) 119 | _cross_validate_module(model) 120 | 121 | 122 | def test_shufflenet_v2_x0_5(): 123 | model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) 124 | _cross_validate_module(model) 125 | 126 | 127 | def test_shufflenet_v2_x1_0(): 128 | model = torchvision.models.shufflenet_v2_x1_0(pretrained=False) 129 | _cross_validate_module(model) 130 | 131 | 132 | def test_shufflenet_v2_x1_5(): 133 | model = torchvision.models.shufflenet_v2_x1_5(pretrained=False) 134 | _cross_validate_module(model) 135 | 136 | 137 | def test_shufflenet_v2_x2_0(): 138 | model = torchvision.models.shufflenet_v2_x2_0(pretrained=False) 139 | _cross_validate_module(model) 140 | 141 | 142 | def test_mnasnet0_5(): 143 | model = torchvision.models.mnasnet0_5(pretrained=False) 144 | _cross_validate_module(model) 145 | 146 | 147 | def test_mnasnet0_75(): 148 | model = torchvision.models.mnasnet0_75(pretrained=False) 149 | _cross_validate_module(model) 150 | 151 | 152 | def test_mnasnet1_0(): 153 | model = torchvision.models.mnasnet1_0(pretrained=False) 154 | _cross_validate_module(model) 155 | 156 | 157 | def test_mnasnet1_3(): 158 | model = torchvision.models.mnasnet1_3(pretrained=False) 159 | _cross_validate_module(model) -------------------------------------------------------------------------------- /examples/image_classification/conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "First, we create the pre-trained ImageNet model. We'll use ``resnet18`` from the torchvision package. Make sure to set the device to ``cuda``, since the inputs and parameter devices are inferred from model. Also make sure to set ``eval()`` to fix batch norm statistics." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torchvision\n", 17 | "\n", 18 | "model = torchvision.models.resnet18(pretrained=True).cuda().half().eval()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Next, we create some sample input that will be used to infer the shape and data types of our TensorRT engine" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import torch\n", 35 | "\n", 36 | "data = torch.randn((1, 3, 224, 224)).cuda().half()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Finally, create the optimized TensorRT engine." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from torch2trt import torch2trt\n", 53 | "\n", 54 | "model_trt = torch2trt(model, [data], fp16_mode=True)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We can execute the network like this" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "output_trt = model_trt(data)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "And check against the original output" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 8, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "tensor([ 0.7231, 3.0195, 3.1016, 3.1152, 4.7539, 3.8301, 3.9180, 0.3086,\n", 90 | " -0.8726, -0.2261], device='cuda:0', dtype=torch.float16,\n", 91 | " grad_fn=)\n", 92 | "tensor([ 0.7202, 3.0234, 3.1074, 3.1133, 4.7539, 3.8340, 3.9141, 0.3081,\n", 93 | " -0.8716, -0.2227], device='cuda:0', dtype=torch.float16)\n", 94 | "max error: 0.011719\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "output = model(data)\n", 100 | "\n", 101 | "print(output.flatten()[0:10])\n", 102 | "print(output_trt.flatten()[0:10])\n", 103 | "print('max error: %f' % float(torch.max(torch.abs(output - output_trt))))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "We can save the model like this" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "torch.save(model_trt.state_dict(), 'resnet18_trt.pth')" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "And load the model like this." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from torch2trt import TRTModule\n", 136 | "\n", 137 | "model_trt = TRTModule()\n", 138 | "\n", 139 | "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "That's it for this notebook! Try out the live demo to see real-time classification on a video feed." 
147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.6.7" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 2 171 | } 172 | -------------------------------------------------------------------------------- /plugins/src/reflection_pad_2d_plugin_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reflection_pad_2d_plugin.h" 3 | #include 4 | #include "NvInfer.h" 5 | #include 6 | 7 | 8 | using namespace torch2trt_plugins; 9 | 10 | 11 | TEMPLATE_TEST_CASE("Test reflection pad kernel", "[ReflectionPad2d][template]" , float) { 12 | TestType x_cpu[9] = { 13 | 0, 1, 2, 14 | 3, 4, 5, 15 | 6, 7, 8 16 | }; 17 | TestType y_cpu[25]; 18 | TestType *x_gpu; 19 | TestType *y_gpu; 20 | TestType y_cpu_gt[25] = { 21 | 4, 3, 4, 5, 4, 22 | 1, 0, 1, 2, 1, 23 | 4, 3, 4, 5, 4, 24 | 7, 6, 7, 8, 7, 25 | 4, 3, 4, 5, 4 26 | }; 27 | 28 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 29 | cudaMalloc((void**)&x_gpu, 9 * sizeof(TestType)); 30 | cudaMalloc((void**)&y_gpu, 25 * sizeof(TestType)); 31 | cudaMemcpy(x_gpu, x_cpu, 9 * sizeof(TestType), cudaMemcpyHostToDevice); 32 | 33 | reflectionPad2dFunction(x_gpu, y_gpu, 34 | 1, 1, 5, 5, 35 | 1, 1, 1, 1); 36 | 37 | cudaMemcpy(y_cpu, y_gpu, 25 * sizeof(TestType), cudaMemcpyDeviceToHost); 38 | for (int i = 0; i < 25; i++) { 39 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 40 | } 41 | cudaFree(x_gpu); 42 | cudaFree(y_gpu); 43 | } 44 | 45 | TEMPLATE_TEST_CASE("Test reflection pad plugin enqueue", "[ReflectionPad2d][template]" , float) { 46 | TestType x_cpu[9] = { 47 | 0, 1, 2, 48 | 3, 4, 5, 49 | 6, 7, 8 50 | }; 51 | TestType y_cpu[25]; 52 | TestType *x_gpu; 53 | TestType *y_gpu; 54 | TestType y_cpu_gt[25] = { 55 | 4, 3, 4, 5, 4, 56 | 1, 0, 1, 2, 1, 57 | 4, 3, 4, 5, 4, 58 | 7, 6, 7, 8, 7, 59 | 4, 3, 4, 5, 4 60 | }; 61 | 62 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 63 | cudaMalloc((void**)&x_gpu, 9 * sizeof(TestType)); 64 | cudaMalloc((void**)&y_gpu, 25 * sizeof(TestType)); 65 | cudaMemcpy(x_gpu, x_cpu, 9 * sizeof(TestType), cudaMemcpyHostToDevice); 66 | 67 | auto plugin = ReflectionPad2dPlugin(1, 1, 1, 1); 68 | Dims3 inputDims(1, 3, 3); 69 | Dims3 outputDims(1, 5, 5); 70 | DataType inputTypes = DataType::kFLOAT; 71 | DataType outputTypes = DataType::kFLOAT; 72 | bool inputIsBroadcast = false; 73 | bool outputIsBroadcast = false; 74 | plugin.configurePlugin( 75 | &inputDims, 1, 76 | &outputDims, 1, 77 | &inputTypes, 78 | &outputTypes, 79 | &inputIsBroadcast, 80 | &outputIsBroadcast, 81 | PluginFormat::kLINEAR, 82 | 1 83 | ); 84 | 85 | void *inputs[] = {(void*)x_gpu}; 86 | void *outputs[] = {(void*)y_gpu}; 87 | plugin.enqueue(1, inputs, outputs, nullptr, 0); 88 | 89 | cudaMemcpy(y_cpu, y_gpu, 25 * sizeof(TestType), cudaMemcpyDeviceToHost); 90 | for (int i = 0; i < 25; i++) { 91 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 92 | } 93 | cudaFree(x_gpu); 94 | cudaFree(y_gpu); 95 | } 96 | 97 | TEMPLATE_TEST_CASE("Test reflection pad plugin enqueue 2 channels", "[ReflectionPad2d][template]" , float) { 98 | TestType x_cpu[9*2] = { 99 | 0, 1, 2, 100 | 3, 4, 5, 101 | 6, 7, 8, 102 | 0, 1, 2, 103 | 3, 4, 5, 104 | 6, 7, 8 105 
| }; 106 | TestType y_cpu[25*2]; 107 | TestType *x_gpu; 108 | TestType *y_gpu; 109 | TestType y_cpu_gt[25*2] = { 110 | 4, 3, 4, 5, 4, 111 | 1, 0, 1, 2, 1, 112 | 4, 3, 4, 5, 4, 113 | 7, 6, 7, 8, 7, 114 | 4, 3, 4, 5, 4, 115 | 4, 3, 4, 5, 4, 116 | 1, 0, 1, 2, 1, 117 | 4, 3, 4, 5, 4, 118 | 7, 6, 7, 8, 7, 119 | 4, 3, 4, 5, 4 120 | }; 121 | 122 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 123 | cudaMalloc((void**)&x_gpu, 2*9 * sizeof(TestType)); 124 | cudaMalloc((void**)&y_gpu, 2*25 * sizeof(TestType)); 125 | cudaMemcpy(x_gpu, x_cpu, 2*9 * sizeof(TestType), cudaMemcpyHostToDevice); 126 | 127 | auto plugin = ReflectionPad2dPlugin(1, 1, 1, 1); 128 | Dims3 inputDims(2, 3, 3); 129 | Dims3 outputDims(2, 5, 5); 130 | DataType inputTypes = DataType::kFLOAT; 131 | DataType outputTypes = DataType::kFLOAT; 132 | bool inputIsBroadcast = false; 133 | bool outputIsBroadcast = false; 134 | plugin.configurePlugin( 135 | &inputDims, 1, 136 | &outputDims, 1, 137 | &inputTypes, 138 | &outputTypes, 139 | &inputIsBroadcast, 140 | &outputIsBroadcast, 141 | PluginFormat::kLINEAR, 142 | 1 143 | ); 144 | 145 | void *inputs[] = {(void*)x_gpu}; 146 | void *outputs[] = {(void*)y_gpu}; 147 | plugin.enqueue(1, inputs, outputs, nullptr, 0); 148 | 149 | cudaMemcpy(y_cpu, y_gpu, 2*25 * sizeof(TestType), cudaMemcpyDeviceToHost); 150 | for (int i = 0; i < 2*25; i++) { 151 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 152 | } 153 | cudaFree(x_gpu); 154 | cudaFree(y_gpu); 155 | } 156 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_shape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from torch2trt import ( 5 | torch2trt, 6 | trt, 7 | SizeWrapper, 8 | tensorrt_converter 9 | ) 10 | 11 | 12 | def test_tensor_shape_view_trivial(): 13 | 14 | class TestModule(torch.nn.Module): 15 | def forward(self, x): 16 | size = x.size() 17 | return x.view(size) 18 | 19 | module = TestModule().cuda().eval() 20 | 21 | x = torch.randn(1, 3, 32, 32).cuda() 22 | 23 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 24 | 25 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 26 | 27 | x = torch.randn(1, 3, 32, 32).cuda() 28 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 29 | 30 | x = torch.randn(4, 3, 32, 32).cuda() 31 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 32 | 33 | 34 | def test_tensor_shape_view_mul(): 35 | 36 | class TestModule(torch.nn.Module): 37 | def forward(self, x): 38 | size = x.size() 39 | return x.view(size[0] * size[1], size[2] * size[3]) 40 | 41 | module = TestModule().cuda().eval() 42 | 43 | x = torch.randn(1, 3, 32, 32).cuda() 44 | 45 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 46 | 47 | x = torch.randn(1, 3, 32, 32).cuda() 48 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 49 | 50 | x = torch.randn(4, 3, 32, 32).cuda() 51 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 52 | 53 | 54 | def test_tensor_shape_view_mul(): 55 | 56 | class TestModule(torch.nn.Module): 57 | def forward(self, x): 58 | size = x.size() 59 | return x.view(size[0] * size[1], size[2] * size[3]) 60 | 61 | module = TestModule().cuda().eval() 62 | 63 | x = torch.randn(1, 3, 32, 32).cuda() 64 | 65 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 66 | 67 
| x = torch.randn(1, 3, 32, 32).cuda() 68 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 69 | 70 | x = torch.randn(4, 3, 32, 32).cuda() 71 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 72 | 73 | 74 | def test_tensor_shape_view_mul_cast(): 75 | 76 | class TestModule(torch.nn.Module): 77 | def forward(self, x): 78 | size = x.size() 79 | return x.view(size[0] * int(size[1]), int(size[2] * size[3])) 80 | 81 | module = TestModule().cuda().eval() 82 | 83 | x = torch.randn(1, 3, 32, 32).cuda() 84 | 85 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 86 | 87 | x = torch.randn(1, 3, 32, 32).cuda() 88 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 89 | 90 | x = torch.randn(4, 3, 32, 32).cuda() 91 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 92 | 93 | 94 | def test_tensor_shape_view_mul_const_lhs(): 95 | 96 | class TestModule(torch.nn.Module): 97 | def forward(self, x): 98 | size = x.size() 99 | return x.view(size[0] * 1, size[1], size[2] * size[3]) 100 | 101 | module = TestModule().cuda().eval() 102 | 103 | x = torch.randn(1, 3, 32, 32).cuda() 104 | 105 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 106 | 107 | x = torch.randn(1, 3, 32, 32).cuda() 108 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 109 | 110 | x = torch.randn(4, 3, 32, 32).cuda() 111 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 112 | 113 | 114 | def test_tensor_shape_view_mul_const_rhs(): 115 | 116 | class TestModule(torch.nn.Module): 117 | def forward(self, x): 118 | size = x.size() 119 | return x.view(1 * size[0], size[1], size[2] * size[3]) 120 | 121 | module = TestModule().cuda().eval() 122 | 123 | x = torch.randn(1, 3, 32, 32).cuda() 124 | 125 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 126 | 127 | x = torch.randn(1, 3, 32, 32).cuda() 128 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 129 | 130 | x = torch.randn(4, 3, 32, 32).cuda() 131 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 132 | 133 | 134 | def test_tensor_shape_view_static(): 135 | 136 | class TestModule(torch.nn.Module): 137 | def forward(self, x): 138 | size = x.size() 139 | return x.view(1, 3, 32, 32) 140 | 141 | module = TestModule().cuda().eval() 142 | 143 | x = torch.randn(1, 3, 32, 32).cuda() 144 | 145 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 146 | 147 | x = torch.randn(1, 3, 32, 32).cuda() 148 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 149 | 150 | # x = torch.randn(4, 3, 32, 32).cuda() 151 | # assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 152 | 153 | 154 | if __name__ == '__main__': 155 | 156 | test_tensor_shape_view_mul() -------------------------------------------------------------------------------- /tests/converter_tests/test_getitem.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | from torch2trt import torch2trt, trt 5 | 6 | 7 | class YOLOXFocusTestModule(nn.Module): 8 | 9 | 10 | def forward(self, x): 11 | patch_top_left = x[..., ::2, ::2] 12 | patch_top_right = x[..., ::2, 1::2] 13 | patch_bot_left = x[..., 1::2, ::2] 14 | patch_bot_right = x[..., 1::2, 1::2] 15 | x = torch.cat( 16 | ( 17 | patch_top_left, 18 | patch_bot_left, 19 | 
patch_top_right, 20 | patch_bot_right, 21 | ), 22 | dim=1, 23 | ) 24 | return x 25 | 26 | 27 | def test_getitem_dynamic_yolox_layer(): 28 | 29 | class YOLOXFocusTestModule(nn.Module): 30 | 31 | 32 | def forward(self, x): 33 | patch_top_left = x[..., ::2, ::2] 34 | patch_top_right = x[..., ::2, 1::2] 35 | patch_bot_left = x[..., 1::2, ::2] 36 | patch_bot_right = x[..., 1::2, 1::2] 37 | x = torch.cat( 38 | ( 39 | patch_top_left, 40 | patch_bot_left, 41 | patch_top_right, 42 | patch_bot_right, 43 | ), 44 | dim=1, 45 | ) 46 | return x 47 | 48 | module = YOLOXFocusTestModule().cuda().eval() 49 | 50 | data = torch.randn(1, 3, 112, 112).cuda() 51 | 52 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 53 | 54 | data = torch.randn(1, 3, 112, 112).cuda() 55 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 56 | 57 | data = torch.randn(4, 3, 112, 112).cuda() 58 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 59 | 60 | 61 | def test_getitem_dynamic_add_dim(): 62 | 63 | class TestModule(nn.Module): 64 | 65 | 66 | def forward(self, x): 67 | patch_top_left = x[..., None] 68 | patch_top_right = x[..., None] 69 | patch_bot_left = x[..., None] 70 | patch_bot_right = x[..., None] 71 | x = torch.cat( 72 | ( 73 | patch_top_left, 74 | patch_bot_left, 75 | patch_top_right, 76 | patch_bot_right, 77 | ), 78 | dim=1, 79 | ) 80 | return x 81 | 82 | module = TestModule().cuda().eval() 83 | 84 | data = torch.randn(1, 3, 112, 112).cuda() 85 | 86 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 87 | 88 | data = torch.randn(1, 3, 112, 112).cuda() 89 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 90 | 91 | data = torch.randn(4, 3, 112, 112).cuda() 92 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 93 | 94 | 95 | def test_getitem_dynamic_remove_dim(): 96 | 97 | class TestModule(nn.Module): 98 | 99 | 100 | def forward(self, x): 101 | patch_top_left = x[..., 0] 102 | patch_top_right = x[..., 0] 103 | patch_bot_left = x[..., 0] 104 | patch_bot_right = x[..., 0] 105 | x = torch.cat( 106 | ( 107 | patch_top_left, 108 | patch_bot_left, 109 | patch_top_right, 110 | patch_bot_right, 111 | ), 112 | dim=1, 113 | ) 114 | return x 115 | 116 | module = TestModule().cuda().eval() 117 | 118 | data = torch.randn(1, 3, 112, 112).cuda() 119 | 120 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 121 | 122 | data = torch.randn(1, 3, 112, 112).cuda() 123 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 124 | 125 | data = torch.randn(4, 3, 112, 112).cuda() 126 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 127 | 128 | 129 | def test_getitem_dynamic_remove_add_dim(): 130 | 131 | class TestModule(nn.Module): 132 | 133 | 134 | def forward(self, x): 135 | patch_top_left = x[..., 0, None] 136 | patch_top_right = x[..., 0, None] 137 | patch_bot_left = x[..., 0, None] 138 | patch_bot_right = x[..., 0, None] 139 | x = torch.cat( 140 | ( 141 | patch_top_left, 142 | patch_bot_left, 143 | patch_top_right, 144 | patch_bot_right, 145 | ), 146 | dim=1, 147 | ) 148 | return x 149 | 150 | module = TestModule().cuda().eval() 151 | 152 | data = torch.randn(1, 3, 112, 112).cuda() 153 | 154 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 155 | 156 | data = torch.randn(1, 3, 112, 112).cuda() 157 | 
assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 158 | 159 | data = torch.randn(4, 3, 112, 112).cuda() 160 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 161 | 162 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | import inspect 4 | 5 | from absl import logging 6 | 7 | from torch import nn 8 | 9 | from pytorch_quantization.nn import TensorQuantizer as TQ 10 | from pytorch_quantization.tensor_quant import QuantDescriptor, QUANT_DESC_8BIT_PER_TENSOR 11 | 12 | ''' 13 | Currently Nvidia quantization library quantizes the input of the conv layer as opposed to output of ReLU. 14 | utilities classes and functions mentioned below are going to help us map int8 layers correctly to TensorRT layers. 15 | ''' 16 | 17 | class QuantWeightMixin(): 18 | """Mixin class for adding basic quantization logic to quantized modules""" 19 | 20 | default_quant_desc_weight = QUANT_DESC_8BIT_PER_TENSOR 21 | 22 | @classmethod 23 | def set_default_quant_desc_input(cls, value): 24 | """ 25 | Args: 26 | value: An instance of :class:`QuantDescriptor ` 27 | """ 28 | if not isinstance(value, QuantDescriptor): 29 | raise ValueError("{} is not an instance of QuantDescriptor!") 30 | cls.default_quant_desc_weight = copy.deepcopy(value) 31 | 32 | def init_quantizer(self, quant_desc_weight): 33 | """Helper function for __init__ of simple quantized module 34 | 35 | Create weight quantizer based on quant_desc passed by kwargs, or default of the class. 36 | 37 | Args: 38 | quant_desc_weight: An instance of :class:`QuantDescriptor ` 39 | """ 40 | if not inspect.stack()[1].function == "__init__": 41 | raise TypeError("{} should be only called by __init__ of quantized module.".format(__name__)) 42 | self._fake_quant = True 43 | if not quant_desc_weight.fake_quant: 44 | raise ValueError("Only fake quantization is supported!") 45 | 46 | logging.info("Input is %squantized to %d bits in %s with axis %s!", "" 47 | if not quant_desc_weight.fake_quant else "fake ", 48 | quant_desc_weight.num_bits, self.__class__.__name__, quant_desc_weight.axis) 49 | 50 | self._weight_quantizer = TQ(quant_desc_weight) 51 | 52 | # pylint:disable=missing-docstring 53 | @property 54 | def weight_quantizer(self): 55 | return self._weight_quantizer 56 | # pylint:enable=missing-docstring 57 | 58 | 59 | def pop_quant_desc_in_kwargs(quant_cls, input_only=False,weight_only=False, **kwargs): 60 | """Pop quant descriptors in kwargs 61 | 62 | If there is no descriptor in kwargs, the default one in quant_cls will be used 63 | 64 | Arguments: 65 | quant_cls: A class that has default quantization descriptors 66 | input_only: A boolean. If True, pop quant_desc_input only, not quant_desc_weight. Default false. 67 | 68 | Keyword Arguments: 69 | quant_desc_input: An instance of :class:`QuantDescriptor `. 70 | Quantization descriptor of input. 71 | quant_desc_weight: An instance of :class:`QuantDescriptor `. 72 | Quantization descriptor of weight. 
73 | 74 | Note: Original function doesnt pop quant_desc_weight 75 | """ 76 | if input_only: 77 | quant_desc_input = kwargs.pop('quant_desc_input', quant_cls.default_quant_desc_input) 78 | elif weight_only: 79 | quant_desc_weight = kwargs.pop('quant_desc_weight', quant_cls.default_quant_desc_weight) 80 | else: 81 | quant_desc_input = kwargs.pop('quant_desc_input', quant_cls.default_quant_desc_input) 82 | quant_desc_weight = kwargs.pop('quant_desc_weight', quant_cls.default_quant_desc_weight) 83 | 84 | 85 | # Check if anything is left in **kwargs 86 | if kwargs: 87 | raise TypeError("Unused keys: {}".format(kwargs.keys())) 88 | 89 | if input_only: 90 | return quant_desc_input 91 | 92 | if weight_only: 93 | return quant_desc_weight 94 | 95 | return quant_desc_input, quant_desc_weight 96 | 97 | 98 | 99 | ''' 100 | Inference Layers: At inference time, we dont need to carry entire qat library. We only need dynamic range so that layers 101 | can be mapped to TRT layers at INT8. 102 | ''' 103 | 104 | class TensorQuantizer(torch.nn.Module): 105 | def __init__(self): 106 | super().__init__() 107 | self.register_buffer('learned_amax',torch.tensor(1.0)) 108 | 109 | class QuantMixin(): 110 | def init_quantizer(self): 111 | self._input_quantizer = TensorQuantizer() 112 | self._weight_quantizer = TensorQuantizer() 113 | 114 | @property 115 | def input_quantizer(self): 116 | return self._input_quantizer 117 | 118 | @property 119 | def weight_quantizer(self): 120 | return self._weight_quantizer 121 | 122 | class QuantMixinInput(): 123 | def init_quantizer(self): 124 | self._input_quantizer = TensorQuantizer() 125 | 126 | @property 127 | def input_quantizer(self): 128 | return self._input_quantizer 129 | 130 | class QuantMixinWeight(): 131 | def init_quantizer(self): 132 | self._weight_quantizer = TensorQuantizer() 133 | 134 | @property 135 | def weight_quantizer(self): 136 | return self._weight_quantizer 137 | 138 | 139 | -------------------------------------------------------------------------------- /examples/image_classification/live_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook will run a live demo on Jetson Nano using [JetCam](https://github.com/NVIDIA-AI-IOT/jetcam) to acquire images from the camera. First,\n", 8 | "let's start the camera. See the JetCam examples for details." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from jetcam.csi_camera import CSICamera\n", 18 | "# from jetcam.usb_camera import USBCamera\n", 19 | "\n", 20 | "camera = CSICamera(width=224, height=224)\n", 21 | "# camera = USBCamera(width=224, height=224)\n", 22 | "\n", 23 | "camera.running = True" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Now, let's connect the camera's value to a widget to display." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from jetcam.utils import bgr8_to_jpeg\n", 40 | "import traitlets\n", 41 | "import ipywidgets\n", 42 | "\n", 43 | "image_w = ipywidgets.Image()\n", 44 | "\n", 45 | "traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", 46 | "\n", 47 | "display(image_w)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Next, we'll load the TensorRT model. (We assume you followed the conversion notebook and saved to the path ``resnet18_trt.pth``)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import torch\n", 64 | "from torch2trt import TRTModule\n", 65 | "\n", 66 | "model_trt = TRTModule()\n", 67 | "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "The following function will be used to pre-process images from the camera" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import cv2\n", 84 | "import numpy as np\n", 85 | "import torchvision\n", 86 | "\n", 87 | "device = torch.device('cuda')\n", 88 | "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n", 89 | "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n", 90 | "\n", 91 | "normalize = torchvision.transforms.Normalize(mean, stdev)\n", 92 | "\n", 93 | "def preprocess(camera_value):\n", 94 | " global device, normalize\n", 95 | " x = camera_value\n", 96 | " x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n", 97 | " x = x.transpose((2, 0, 1))\n", 98 | " x = torch.from_numpy(x).float()\n", 99 | " x = normalize(x)\n", 100 | " x = x.to(device)\n", 101 | " x = x[None, ...]\n", 102 | " return x" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "This text area will be used to display the class predictions." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "text = ipywidgets.Textarea()\n", 119 | "display(text)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "We load the imagenet labels to associate the neural network output with a class name." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import json\n", 136 | "\n", 137 | "with open('imagenet_labels.json', 'r') as f:\n", 138 | " labels = json.load(f)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Finally, we create our execution function, which we attach as a callback to the camera's ``value`` attribute.\n", 146 | "\n", 147 | "Whenever the camera's value is updated (which it will be for each frame, since we set ``camera.running = True``). This function will be called\n", 148 | "describing how the value changed. The new camera value will be stored in ``change['new']``." 
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def execute(change):\n", 158 | " image = change['new']\n", 159 | " output = model_trt(preprocess(image).half()).detach().cpu().numpy().flatten()\n", 160 | " idx = output.argmax()\n", 161 | " text.value = labels[idx]\n", 162 | "\n", 163 | "camera.observe(execute, names='value')" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.6.7" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /examples/image_segmentation/conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torchvision\n", 11 | "import torch2trt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "model = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "model = model.cuda().eval().half()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "class ModelWrapper(torch.nn.Module):\n", 39 | " def __init__(self, model):\n", 40 | " super(ModelWrapper, self).__init__()\n", 41 | " self.model = model\n", 42 | " def forward(self, x):\n", 43 | " return self.model(x)['out']" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "model_w = ModelWrapper(model).half()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = torch.ones((1, 3, 224, 224)).cuda().half()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model_trt = torch2trt.torch2trt(model_w, [data], fp16_mode=True)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# Live demo" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# from jetcam.csi_camera import CSICamera\n", 87 | "from jetcam.usb_camera import USBCamera\n", 88 | "\n", 89 | "# camera = CSICamera(width=224, height=224)\n", 90 | "camera = USBCamera(width=224, height=224)\n", 91 | "\n", 92 | "camera.running = True" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from jetcam.utils import bgr8_to_jpeg\n", 102 | "import traitlets\n", 103 | "import ipywidgets\n", 104 | "\n", 105 | "image_w = ipywidgets.Image()\n", 106 | "\n", 107 | 
"traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", 108 | "\n", 109 | "display(image_w)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "import cv2\n", 119 | "import numpy as np\n", 120 | "import torchvision\n", 121 | "\n", 122 | "device = torch.device('cuda')\n", 123 | "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n", 124 | "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n", 125 | "\n", 126 | "normalize = torchvision.transforms.Normalize(mean, stdev)\n", 127 | "\n", 128 | "def preprocess(camera_value):\n", 129 | " global device, normalize\n", 130 | " x = camera_value\n", 131 | " x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n", 132 | " x = x.transpose((2, 0, 1))\n", 133 | " x = torch.from_numpy(x).float()\n", 134 | " x = normalize(x)\n", 135 | " x = x.to(device)\n", 136 | " x = x[None, ...]\n", 137 | " return x" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "seg_image = ipywidgets.Image()\n", 147 | "\n", 148 | "display(seg_image)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def execute(change):\n", 158 | " image = change['new']\n", 159 | " output = model_trt(preprocess(camera.value).half())[0].detach().cpu().float().numpy()\n", 160 | " mask = 1.0 * (output.argmax(0) == 15)\n", 161 | " seg_image.value = bgr8_to_jpeg(mask[:, :, None] * image)\n", 162 | " \n", 163 | " \n", 164 | "mask = execute({'new': camera.value})\n", 165 | "# camera.observe(execute, names='value')" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "camera.observe(execute, names='value')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "camera.unobserve(execute, names='value')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "import time\n", 193 | "\n", 194 | "torch.cuda.current_stream().synchronize()\n", 195 | "t0 = time.time()\n", 196 | "for i in range(100):\n", 197 | " output = model_w(preprocess(camera.value).half())\n", 198 | "torch.cuda.current_stream().synchronize()\n", 199 | "t1 = time.time()\n", 200 | "\n", 201 | "print(100.0 / (t1 - t0))" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.6.7" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /docs/usage/reduced_precision.md: -------------------------------------------------------------------------------- 1 | # Reduced Precision 2 | 3 | For certain platforms, reduced precision can result in substantial improvements in throughput, 4 | often with little impact on model accuracy. 
5 | 
6 | ## Support Matrix
7 | 
8 | Below is a table of layer precision support for various NVIDIA platforms.
9 | 
10 | | Platform | FP16 | INT8 |
11 | |----------|------|------|
12 | | Jetson Nano | ![X](../images/check.svg) | |
13 | | Jetson TX2 | ![X](../images/check.svg) | ![X](../images/check.svg) |
14 | | Jetson Xavier NX | ![X](../images/check.svg) | ![X](../images/check.svg) |
15 | | Jetson AGX Xavier | ![X](../images/check.svg) | ![X](../images/check.svg) |
16 | 
17 | !!! note
18 | 
19 |     If the platform you're using is missing from this table or you spot anything incorrect,
20 |     please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt).
21 | 
22 | ## FP16 Precision
23 | 
24 | To enable support for fp16 precision with TensorRT, torch2trt exposes the ``fp16_mode`` parameter.
25 | Converting a model with ``fp16_mode=True`` allows the TensorRT optimizer to select layers with fp16
26 | precision.
27 | 
28 | 
29 | ```python
30 | model_trt = torch2trt(model, [data], fp16_mode=True)
31 | ```
32 | 
33 | !!! note
34 | 
35 |     When ``fp16_mode=True``, this does not necessarily mean that TensorRT will select FP16 layers.
36 |     The optimizer attempts to automatically select tactics which result in the best performance.
37 | 
38 | ## INT8 Precision
39 | 
40 | torch2trt also supports int8 precision with TensorRT via the ``int8_mode`` parameter. Unlike fp16 and fp32 precision, switching
41 | to int8 precision often requires calibration to avoid a significant drop in accuracy.
42 | 
43 | ### Input Data Calibration
44 | 
45 | By default,
46 | torch2trt will calibrate using the input data provided. For example, if you wanted
47 | to calibrate on a set of 64 random normal images you could do the following
48 | 
49 | ```python
50 | data = torch.randn(64, 3, 224, 224).cuda()
51 | 
52 | model_trt = torch2trt(model, [data], int8_mode=True)
53 | ```
54 | 
55 | ### Dataset Calibration
56 | 
57 | In many instances, you may want to calibrate on more data than fits in memory. For this reason,
58 | torch2trt exposes the ``int8_calib_dataset`` parameter. This parameter takes an input
59 | dataset that is used for calibration. If this parameter is specified, the input data is
60 | ignored during calibration. You create an input dataset by defining
61 | a class which implements the ``__len__`` and ``__getitem__`` methods.
62 | 
63 | * The ``__len__`` method should return the number of calibration samples
64 | * The ``__getitem__`` method must return a single calibration sample. This is a list of input tensors to the model. Each tensor should match the shape
65 | you provide to the ``inputs`` parameter when calling ``torch2trt``.
66 | 
67 | For example, say you trained an image classification network using the PyTorch [``ImageFolder``](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder) dataset.
68 | You could wrap this dataset for calibration by defining a new dataset which returns only the images, without labels, in list format.
69 | 
70 | ```python
71 | from torchvision.datasets import ImageFolder
72 | from torchvision.transforms import ToTensor, Compose, Normalize, Resize
73 | 
74 | 
75 | class ImageFolderCalibDataset():
76 | 
77 |     def __init__(self, root):
78 |         self.dataset = ImageFolder(
79 |             root=root,
80 |             transform=Compose([
81 |                 Resize((224, 224)),
82 |                 ToTensor(),
83 |                 Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
84 |             ])
85 |         )
86 | 
87 |     def __len__(self):
88 |         return len(self.dataset)
89 | 
90 |     def __getitem__(self, idx):
91 |         image, _ = self.dataset[idx]
92 |         image = image[None, ...]  # add batch dimension
93 |         return [image]
94 | ```
95 | 
96 | You would then provide this calibration dataset to torch2trt as follows
97 | 
98 | ```python
99 | dataset = ImageFolderCalibDataset('images')
100 | 
101 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_dataset=dataset)
102 | ```
103 | 
104 | ### Calibration Algorithm
105 | 
106 | To override the default calibration algorithm that torch2trt uses, you can set the ``int8_calib_algorithm``
107 | to the [``tensorrt.CalibrationAlgoType``](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Int8/Calibrator.html#iint8calibrator)
108 | that you wish to use. For example, to use the minmax calibration algorithm you would do
109 | 
110 | ```python
111 | import tensorrt as trt
112 | 
113 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION)
114 | ```
115 | 
116 | ### Calibration Batch Size
117 | 
118 | During calibration, torch2trt pulls data in batches for the TensorRT calibrator. In some instances
119 | [developers have found](https://github.com/NVIDIA-AI-IOT/torch2trt/pull/398) that the calibration batch size can impact the calibrated model accuracy. To set the calibration batch size, you can set the ``int8_calib_batch_size``
120 | parameter. For example, to use a calibration batch size of 32 you could do
121 | 
122 | ```python
123 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_batch_size=32)
124 | ```
125 | 
126 | ## Binding Data Types
127 | 
128 | The data types of the input and output bindings in TensorRT are determined by the original
129 | PyTorch module's input and output data types.
130 | This does not directly impact whether the TensorRT optimizer will internally use fp16 or int8 precision.
131 | 
132 | For example, to create a model with fp32 precision bindings, you would do the following
133 | 
134 | ```python
135 | model = model.float()
136 | data = data.float()
137 | 
138 | model_trt = torch2trt(model, [data], fp16_mode=True)
139 | ```
140 | 
141 | In this instance, the optimizer may choose to use fp16 precision layers internally, but the
142 | input and output data types are fp32. To use fp16 precision input and output bindings you would do
143 | 
144 | ```python
145 | model = model.half()
146 | data = data.half()
147 | 
148 | model_trt = torch2trt(model, [data], fp16_mode=True)
149 | ```
150 | 
151 | Now, the input and output bindings of the model are half precision, and internally the optimizer may
152 | choose to select fp16 layers as well.
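
## Combining Options

The options above can be combined in a single ``torch2trt`` call. The sketch below is illustrative only: ``model`` is assumed to be the module you are converting, ``ImageFolderCalibDataset`` is the wrapper defined earlier, and ``'calib_images'`` is a hypothetical folder of calibration images.

```python
import tensorrt as trt
import torch
from torch2trt import torch2trt

# `model` is assumed to be the PyTorch module being converted.
model = model.cuda().eval()
data = torch.randn(1, 3, 224, 224).cuda()

calib_dataset = ImageFolderCalibDataset('calib_images')   # hypothetical image folder

model_trt = torch2trt(
    model, [data],
    fp16_mode=True,                      # allow fp16 tactics
    int8_mode=True,                      # allow int8 tactics (requires calibration)
    int8_calib_dataset=calib_dataset,    # calibrate from a dataset instead of `data`
    int8_calib_batch_size=32,            # batch size used by the calibrator
    int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION,
)
```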
153 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import torchvision 5 | import argparse 6 | import os,sys 7 | import torch.optim as optim 8 | from datasets.cifar10 import Cifar10Loaders 9 | from models.models import vanilla_cnn 10 | from models.resnet import resnet18 , resnet34 11 | from utils.utilities import calculate_accuracy , add_missing_keys, transfer_learning_resnet18,transfer_learning_resnet34, mapping_names 12 | from parser import parse_args 13 | import time 14 | from torch2trt import torch2trt 15 | import tensorrt as trt 16 | 17 | def main(): 18 | args = parse_args() 19 | 20 | ## Create an output dir 21 | output_dir_path = args.od + args.en 22 | if not os.path.exists(output_dir_path): 23 | os.makedirs(output_dir_path) 24 | dir_name=output_dir_path 25 | else: 26 | counter=1 27 | dir_name = output_dir_path 28 | new_dir_name = dir_name 29 | while os.path.exists(new_dir_name): 30 | new_dir_name = dir_name + "_" + str(counter) 31 | counter +=1 32 | os.makedirs(new_dir_name) 33 | dir_name=new_dir_name 34 | 35 | print("===>> Output folder = {}".format(dir_name)) 36 | 37 | args.cuda = not args.no_cuda and torch.cuda.is_available() 38 | torch.manual_seed(args.seed) 39 | 40 | if args.cuda: 41 | torch.backends.cudnn.benchmark = True 42 | torch.cuda.manual_seed(args.seed) 43 | 44 | loaders = Cifar10Loaders() 45 | train_loader = loaders.train_loader() 46 | test_loader = loaders.test_loader() 47 | 48 | if args.m =="resnet18": 49 | if args.netqat: 50 | model=resnet18(qat_mode=True) 51 | else: 52 | model=resnet18() 53 | elif args.m =="resnet34": 54 | if args.netqat: 55 | model=resnet34(qat_mode=True) 56 | else: 57 | model=resnet34() 58 | elif args.m == 'resnet34-tl': 59 | model = transfer_learning_resnet34() 60 | elif args.m == "resnet18-tl": ## resnet18 transfer learning 61 | model=transfer_learning_resnet18() 62 | else: 63 | raise NotImplementedError("model {} is not defined".format(args.m)) 64 | 65 | if args.cuda: 66 | model = model.cuda() 67 | 68 | best_test_accuracy=0 69 | if args.v: 70 | print("======>>> keys present in state dict at model creation") 71 | for k,_ in model.state_dict().items(): 72 | print(k) 73 | 74 | if args.load_ckpt: 75 | model.eval() 76 | checkpoint = torch.load(args.load_ckpt) 77 | if args.partial_ckpt: 78 | model_state = checkpoint['model_state_dict'] 79 | if args.v: 80 | print("====>>>>> keys present in the ckpt state dict") 81 | for k,_ in model_state.items(): 82 | print(k) 83 | if args.tl: 84 | model_state = mapping_names(model_state) 85 | new_state_dict = add_missing_keys(model.state_dict(),model_state) 86 | model.load_state_dict(new_state_dict,strict=True) 87 | else: 88 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 89 | 90 | criterion = nn.CrossEntropyLoss() 91 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=0.9) 92 | if args.load_ckpt: 93 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 94 | epoch = checkpoint['epoch'] 95 | loss = checkpoint['loss'] 96 | print("===>>> Checkpoint loaded successfully from {} at epoch {} ".format(args.load_ckpt,epoch)) 97 | 98 | print("===>> Training started") 99 | for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): 100 | running_loss=0.0 101 | start=time.time() 102 | model.train() 
103 | for i, data in enumerate(train_loader,0): 104 | inputs, labels = data 105 | 106 | if args.cuda: 107 | inputs = inputs.cuda() 108 | labels = labels.cuda() 109 | 110 | optimizer.zero_grad() 111 | 112 | outputs = model(inputs) 113 | loss = criterion(outputs,labels) 114 | loss.backward() 115 | optimizer.step() 116 | 117 | running_loss +=loss.item() 118 | 119 | if epoch > 0 and epoch % args.lrdt == 0: 120 | print("===>> decaying learning rate at epoch {}".format(epoch)) 121 | for param_group in optimizer.param_groups: 122 | param_group['lr'] = param_group['lr'] * 0.94 123 | 124 | running_loss /= len(train_loader) 125 | end = time.time() 126 | test_accuracy = calculate_accuracy(model,test_loader) 127 | 128 | print("Epoch: {0} | Loss: {1} | Test accuracy: {2}| Time Taken (sec): {3} ".format(epoch+1, np.around(running_loss,6), test_accuracy, np.around((end-start),4))) 129 | 130 | ##Save the best checkpoint 131 | if test_accuracy > best_test_accuracy: 132 | best_ckpt_filename = dir_name + "/ckpt_" + str(epoch) 133 | best_test_accuracy = test_accuracy 134 | torch.save({ 135 | 'epoch': epoch, 136 | 'model_state_dict': model.state_dict(), 137 | 'optimizer_state_dict': optimizer.state_dict(), 138 | 'loss': running_loss, 139 | }, best_ckpt_filename) 140 | print("Training finished") 141 | 142 | ## Running metrics 143 | if args.test_trt: 144 | if args.m == 'resnet34-tl' or args.m == 'resnet34': 145 | model = transfer_learning_resnet34(pretrained=False) 146 | elif args.m == 'resnet18-tl' or args.m == 'resnet18': 147 | model= transfer_learning_resnet18(pretrained=False) 148 | else: 149 | raise NotImplementedError("model {} is not defined".format(args.m)) 150 | 151 | model=model.cuda().eval() 152 | checkpoint = torch.load(best_ckpt_filename) 153 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 154 | 155 | pytorch_test_accuracy = calculate_accuracy(model,test_loader) 156 | rand_in = torch.randn([128,3,32,32],dtype=torch.float32).cuda() 157 | 158 | if args.FP16: 159 | trt_model_fp16 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,max_batch_size=128) 160 | trtfp16_test_accuracy = calculate_accuracy(trt_model_fp16,test_loader) 161 | 162 | if args.INT8PTC: 163 | ##preparing calib dataset 164 | calib_dataset = list() 165 | for i, sam in enumerate(test_loader): 166 | calib_dataset.extend(sam[0]) 167 | if i ==5: 168 | break 169 | 170 | trt_model_calib_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_calib_dataset=calib_dataset,int8_mode=True,max_batch_size=128) 171 | int8_test_accuracy = calculate_accuracy(trt_model_calib_int8,test_loader) 172 | 173 | print("Test Accuracy") 174 | print("Pytorch model :",pytorch_test_accuracy) 175 | print("TRT FP16 model :",trtfp16_test_accuracy) 176 | print("TRT INT8 PTC model :",int8_test_accuracy) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /torch2trt/trt_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import tensorrt as trt 3 | from .flattener import Flattener 4 | from .misc_utils import ( 5 | torch_dtype_from_trt, 6 | torch_device_from_trt 7 | ) 8 | from .version_utils import ( 9 | trt_version 10 | ) 11 | 12 | 13 | class TRTModule(torch.nn.Module): 14 | def __init__(self, engine=None, input_names=None, output_names=None, input_flattener=None, output_flattener=None): 15 | super(TRTModule, self).__init__() 16 | 
self._register_state_dict_hook(TRTModule._on_state_dict) 17 | 18 | if isinstance(engine, str): 19 | # assume filepath 20 | with open(engine, 'rb') as f: 21 | engine = f.read() 22 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 23 | engine = runtime.deserialize_cuda_engine(engine) 24 | elif isinstance(engine, trt.IHostMemory): 25 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 26 | engine = runtime.deserialize_cuda_engine(engine) 27 | 28 | self.engine = engine 29 | if self.engine is not None: 30 | self.context = self.engine.create_execution_context() 31 | self._update_name_binindgs_maps() 32 | self.input_names = input_names 33 | self.output_names = output_names 34 | self.input_flattener = input_flattener 35 | self.output_flattener = output_flattener 36 | 37 | def _update_name_binindgs_maps(self): 38 | if trt_version() >= "10.0": 39 | self._update_name_binding_maps_trt_10() 40 | else: 41 | self._update_name_binding_maps_pre_trt_10() 42 | 43 | def _update_name_binding_maps_trt_10(self): 44 | self._name_to_binding = {} 45 | self._binding_to_name = {} 46 | for i in range(self.engine.num_io_tensors): 47 | name_i = self.engine.get_tensor_name(i) 48 | self._name_to_binding[name_i] = i 49 | self._binding_to_name[i] = name_i 50 | 51 | def _update_name_binding_maps_pre_trt_10(self): 52 | self._name_to_binding = {} 53 | self._binding_to_name = {} 54 | for i in range(self.engine.num_bindings): 55 | name_i = self.engine.get_binding_name(i) 56 | self._name_to_binding[name_i] = i 57 | self._binding_to_name[i] = name_i 58 | 59 | def _on_state_dict(self, state_dict, prefix, local_metadata): 60 | state_dict[prefix + "engine"] = bytearray(self.engine.serialize()) 61 | state_dict[prefix + "input_names"] = self.input_names 62 | state_dict[prefix + "output_names"] = self.output_names 63 | state_dict[prefix + "input_flattener"] = self.input_flattener.dict() 64 | state_dict[prefix + "output_flattener"] = self.output_flattener.dict() 65 | 66 | def _load_from_state_dict( 67 | self, 68 | state_dict, 69 | prefix, 70 | local_metadata, 71 | strict, 72 | missing_keys, 73 | unexpected_keys, 74 | error_msgs, 75 | ): 76 | engine_bytes = state_dict[prefix + "engine"] 77 | 78 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 79 | self.engine = runtime.deserialize_cuda_engine(engine_bytes) 80 | self.context = self.engine.create_execution_context() 81 | 82 | self.input_names = state_dict[prefix + "input_names"] 83 | self.output_names = state_dict[prefix + "output_names"] 84 | 85 | if 'input_flattener' in state_dict: 86 | self.input_flattener = Flattener.from_dict(state_dict['input_flattener']) 87 | else: 88 | self.input_flattener = None 89 | 90 | if 'output_flattener' in state_dict: 91 | self.output_flattener = Flattener.from_dict(state_dict['output_flattener']) 92 | else: 93 | self.output_flattener = None 94 | 95 | self._update_name_binindgs_maps() 96 | 97 | def _forward_pre_10(self, *inputs): 98 | bindings = [None] * (len(self.input_names) + len(self.output_names)) 99 | 100 | if self.input_flattener is not None: 101 | inputs = self.input_flattener.flatten(inputs) 102 | 103 | for i, input_name in enumerate(self.input_names): 104 | idx = self.engine.get_binding_index(input_name) 105 | shape = tuple(inputs[i].shape) 106 | bindings[idx] = inputs[i].contiguous().data_ptr() 107 | self.context.set_binding_shape(idx, shape) 108 | 109 | # create output tensors 110 | outputs = [None] * len(self.output_names) 111 | for i, output_name in enumerate(self.output_names): 112 | idx = 
self.engine.get_binding_index(output_name) 113 | dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx)) 114 | shape = tuple(self.context.get_binding_shape(idx)) 115 | device = torch_device_from_trt(self.engine.get_location(idx)) 116 | output = torch.empty(size=shape, dtype=dtype, device=device) 117 | outputs[i] = output 118 | bindings[idx] = output.data_ptr() 119 | 120 | self.context.execute_async_v2( 121 | bindings, torch.cuda.current_stream().cuda_stream 122 | ) 123 | 124 | if self.output_flattener is not None: 125 | outputs = self.output_flattener.unflatten(outputs) 126 | else: 127 | outputs = tuple(outputs) 128 | if len(outputs) == 1: 129 | outputs = outputs[0] 130 | 131 | return outputs 132 | 133 | def _forward_post_10(self, *inputs): 134 | if self.input_flattener is not None: 135 | inputs = self.input_flattener.flatten(inputs) 136 | 137 | # set shapes 138 | for i, input_name in enumerate(self.input_names): 139 | shape = tuple(inputs[i].shape) 140 | data_ptr = inputs[i].contiguous().data_ptr() 141 | self.context.set_tensor_address(input_name, data_ptr) 142 | self.context.set_input_shape(input_name, shape) 143 | 144 | # execute 145 | outputs = [None] * len(self.output_names) 146 | for i, output_name in enumerate(self.output_names): 147 | dtype = torch_dtype_from_trt(self.engine.get_tensor_dtype(output_name)) 148 | shape = tuple(self.context.get_tensor_shape(output_name)) 149 | device = torch_device_from_trt(self.engine.get_tensor_location(output_name)) 150 | output = torch.empty(size=shape, dtype=dtype, device=device) 151 | outputs[i] = output 152 | self.context.set_tensor_address(output_name, output.data_ptr()) 153 | 154 | self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream) 155 | 156 | if self.output_flattener is not None: 157 | outputs = self.output_flattener.unflatten(outputs) 158 | else: 159 | outputs = tuple(outputs) 160 | if len(outputs) == 1: 161 | outputs = outputs[0] 162 | 163 | return outputs 164 | 165 | def forward(self, *inputs): 166 | if trt_version() < "10.0": 167 | return self._forward_pre_10(*inputs) 168 | else: 169 | return self._forward_post_10(*inputs) 170 | 171 | def enable_profiling(self): 172 | if not self.context.profiler: 173 | self.context.profiler = trt.Profiler() 174 | -------------------------------------------------------------------------------- /torch2trt/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import glob 4 | from uuid import uuid1 5 | from torch2trt.flattener import Flattener 6 | 7 | 8 | __all__ = [ 9 | 'DatasetRecorder', 10 | 'Dataset', 11 | 'ListDataset', 12 | 'TensorBatchDataset' 13 | ] 14 | 15 | 16 | class DatasetRecorder(object): 17 | 18 | def __init__(self, dataset, module): 19 | self.dataset = dataset 20 | self.module = module 21 | self.handle = None 22 | 23 | def __enter__(self, *args, **kwargs): 24 | 25 | if self.handle is not None: 26 | raise RuntimeError('DatasetRecorder is already active.') 27 | 28 | self.handle = self.module.register_forward_pre_hook(self._callback) 29 | 30 | return self 31 | 32 | def __exit__(self, *args, **kwargs): 33 | if self.handle is not None: 34 | self.handle.remove() 35 | self.handle = None 36 | 37 | def _callback(self, module, input): 38 | self.dataset.insert(input) 39 | 40 | 41 | class Dataset(object): 42 | 43 | def __len__(self): 44 | raise NotImplementedError 45 | 46 | def __getitem__(self, index): 47 | raise NotImplementedError 48 | 49 | def insert(self, item): 50 | raise NotImplementedError 51 | 
52 | def record(self, module): 53 | return DatasetRecorder(self, module) 54 | 55 | def num_inputs(self): 56 | return len(self.getitem_flat(0)) 57 | 58 | @property 59 | def flattener(self): 60 | if not hasattr(self, '_flattener') or self._flattener is None: 61 | assert(len(self) > 0, 'Cannot create default flattener without input data.') 62 | value = self[0] 63 | self._flattener = Flattener.from_value(value) 64 | return self._flattener 65 | 66 | def getitem_flat(self, index): 67 | return self.flattener.flatten(self[index]) 68 | 69 | def shapes_for_index(self, index, flat=False): 70 | shapes = [None for i in range(self.num_inputs())] 71 | tensors = self.getitem_flat(index) 72 | for j in range(len(tensors)): 73 | shapes[j] = torch.Size(tuple(tensors[j].shape)) 74 | 75 | if flat: 76 | return shapes 77 | else: 78 | return self.flattener.unflatten(shapes) 79 | 80 | def shapes(self, flat=False): 81 | shapes = [[] for i in range(self.num_inputs())] 82 | for i in range(len(self)): 83 | tensors = self.getitem_flat(i) 84 | for j in range(len(tensors)): 85 | shapes[j].append(torch.Size(tuple(tensors[j].shape))) 86 | 87 | if flat: 88 | return shapes 89 | else: 90 | return self.flattener.unflatten(shapes) 91 | 92 | def _shape_stats(self, stat_fn, flat=False): 93 | shapes = [] 94 | for s in self.shapes(flat=True): 95 | shape_tensor = [] 96 | for si in s: 97 | shape_tensor.append(tuple(si)) 98 | shape_tensor = torch.LongTensor(shape_tensor) 99 | shapes.append(shape_tensor) 100 | 101 | stat_shapes = [] 102 | for shape in shapes: 103 | stat_shape = torch.Size(stat_fn(shape)) 104 | stat_shapes.append(stat_shape) 105 | if flat: 106 | return stat_shapes 107 | else: 108 | return self.flattener.unflatten(stat_shapes) 109 | 110 | def min_shapes(self, flat=False): 111 | return self._shape_stats(lambda x: torch.min(x, dim=0)[0], flat) 112 | 113 | def max_shapes(self, flat=False): 114 | return self._shape_stats(lambda x: torch.max(x, dim=0)[0], flat) 115 | 116 | def item_numel(self, index): 117 | tensors = self.getitem_flat(index) 118 | return sum([t.numel() for t in tensors]) 119 | 120 | def median_numel_shapes(self, flat=False): 121 | numels = torch.LongTensor([self.item_numel(i) for i in range(len(self))]) 122 | median_index = int(torch.argsort(numels)[len(numels) // 2]) 123 | return self.shapes_for_index(median_index, flat=flat) 124 | 125 | def infer_dynamic_axes(self, flat=False): 126 | min_shapes = self.min_shapes(flat=True) 127 | max_shapes = self.max_shapes(flat=True) 128 | dynamic_axes = [[] for i in range(self.num_inputs())] 129 | for i, (mins, maxs) in enumerate(zip(min_shapes, max_shapes)): 130 | for j, (mins_i, maxs_i) in enumerate(zip(mins, maxs)): 131 | if mins_i != maxs_i: 132 | dynamic_axes[i].append(j) 133 | if flat: 134 | return dynamic_axes 135 | else: 136 | return self.flattener.unflatten(dynamic_axes) 137 | 138 | 139 | class ListDataset(Dataset): 140 | 141 | def __init__(self, items=None): 142 | if items is None: 143 | items = [] 144 | self.items = [t for t in items] 145 | 146 | def __len__(self): 147 | return len(self.items) 148 | 149 | def __getitem__(self, index): 150 | return self.items[index] 151 | 152 | def insert(self, item): 153 | self.items.append(item) 154 | 155 | 156 | class TensorBatchDataset(Dataset): 157 | 158 | def __init__(self, tensors=None): 159 | if tensors is not None: 160 | self._flattener = Flattener.from_value(tensors) 161 | self.tensors = self._flattener.flatten(tensors) 162 | else: 163 | self._flattener = None 164 | self.tensors = None 165 | 166 | def __len__(self): 167 
| if self.tensors is None: 168 | return 0 169 | else: 170 | return len(self.tensors[0]) 171 | 172 | def __getitem__(self, idx): 173 | if self.tensors is None: 174 | raise IndexError('Dataset is empty.') 175 | return self.flattener.unflatten([t[idx:idx+1] for t in self.tensors]) 176 | 177 | def insert(self, tensors): 178 | if self._flattener is None: 179 | self._flattener = Flattener.from_value(tensors) 180 | 181 | tensors = self.flattener.flatten(tensors) 182 | 183 | if self.tensors is None: 184 | self.tensors = tensors 185 | else: 186 | if len(self.tensors) != len(tensors): 187 | raise ValueError('Number of inserted tensors does not match the number of tensors in the current dataset.') 188 | 189 | self.tensors = tuple([ 190 | torch.cat((self.tensors[index], tensors[index]), dim=0) 191 | for index in range(len(tensors)) 192 | ]) 193 | 194 | 195 | class FolderDataset(Dataset): 196 | 197 | def __init__(self, folder): 198 | super().__init__() 199 | if not os.path.exists(folder): 200 | os.makedirs(folder) 201 | self.folder = folder 202 | 203 | def file_paths(self): 204 | return sorted(glob.glob(os.path.join(self.folder, '*.pth'))) 205 | 206 | def __len__(self): 207 | return len(self.file_paths()) 208 | 209 | def __getitem__(self, index): 210 | return torch.load(self.file_paths()[index]) 211 | 212 | def insert(self, tensors): 213 | i = 0 214 | file_paths = [os.path.basename(path) for path in self.file_paths()] 215 | while ('input_%d.pth' % i) in file_paths: 216 | i += 1 217 | torch.save(tensors, os.path.join(self.folder, 'input_%d.pth' % i)) --------------------------------------------------------------------------------
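
To make the dataset utilities above concrete, here is a minimal sketch of recording inputs into a `ListDataset` and querying its shape statistics. The module and input sizes are hypothetical placeholders, not taken from the repository.

```python
import torch
from torch2trt.dataset import ListDataset

# Hypothetical module, used only for illustration.
model = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1).eval()

dataset = ListDataset()

# While the recorder is active, a forward pre-hook captures each call's inputs.
with dataset.record(model):
    for size in (112, 224):
        model(torch.randn(1, 3, size, size))

print(len(dataset))                   # 2 recorded samples
print(dataset.min_shapes())           # smallest shape seen for each input
print(dataset.max_shapes())           # largest shape seen for each input
print(dataset.infer_dynamic_axes())   # axes that varied across the records
```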