├── tests ├── __init__.py ├── converter_tests │ ├── __init__.py │ └── test_getitem.py ├── feature_tests │ ├── __init__.py │ ├── test_version_utils.py │ ├── test_contiguous.py │ ├── test_tensor_ne.py │ ├── test_flatten_module.py │ ├── test_tensor_shape_div_batch.py │ ├── test_save_load.py │ ├── test_legacy_max_batch_size.py │ ├── test_interpolate_dynamic.py │ ├── test_flatten_dynamic.py │ ├── test_dynamic_shape.py │ ├── test_dataset_calibrator.py │ ├── test_flattener.py │ ├── test_dataset.py │ └── test_tensor_shape.py └── model_tests │ ├── __init__.py │ ├── timm │ ├── __init__.py │ └── test_maxvit.py │ └── torchvision │ ├── __init__.py │ ├── test_segmentation_models.py │ └── test_classification_models.py ├── docs ├── CHANGELOG.md ├── CONTRIBUTING.md ├── css │ └── version-select.css ├── images │ └── check.svg ├── index.md ├── getting_started.md ├── usage │ ├── basic_usage.md │ ├── custom_converter.md │ └── reduced_precision.md ├── js │ └── version-select.js ├── benchmarks │ ├── jetson_nano.md │ └── jetson_xavier.md └── see_also.md ├── torch2trt ├── contrib │ ├── __init__.py │ └── qat │ │ ├── __init__.py │ │ ├── converters │ │ ├── __init__.py │ │ ├── QuantRelu.py │ │ ├── QuantConv.py │ │ └── QuantConvBN.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── README.md │ │ ├── quant_activation.py │ │ └── _utils.py │ │ └── README.md ├── converters │ ├── __init__.py │ ├── unimplemented_converters.py │ └── plugin_converters.py ├── test.py ├── __init__.py ├── version_utils.py ├── flatten_module.py ├── plugins │ └── plugins.cpp ├── dataset_calibrator.py ├── misc_utils.py ├── utils.py ├── flattener.py ├── trt_module.py └── dataset.py ├── examples ├── contrib │ ├── quantization_aware_training │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── models.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── pytorch_nvidia_quantization.patch │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── cifar10.py │ │ ├── __init__.py │ │ ├── setup.py │ │ ├── parser.py │ │ ├── README.md │ │ ├── infer.py │ │ └── train.py │ └── pre_py3.7 │ │ └── fix-getitem.patch ├── easyocr │ ├── download_images.sh │ ├── README.md │ ├── generate_data.py │ ├── optimize_detector.py │ ├── run_end2end.py │ └── optimize_recognizer.py ├── image_classification │ ├── conversion.ipynb │ └── live_demo.ipynb └── image_segmentation │ └── conversion.ipynb ├── CLA.pdf ├── scripts ├── test_docs.sh ├── release_test_docs.sh ├── push_docs.sh ├── release_build_docs.sh ├── build_docs.sh ├── release_push_docs.sh ├── build_pre_py3.7.sh ├── build_contrib.sh ├── dump_converters.py └── profile_timm_models.sh ├── requirements ├── requirements_8.txt └── requirements_10.txt ├── docker ├── 21-06 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── 21-09 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── l4t-35.1.0 │ ├── Dockerfile │ ├── build.sh │ └── run.sh └── 21-08 │ ├── build.sh │ ├── run.sh │ └── Dockerfile ├── plugins └── src │ ├── tests.cpp │ ├── example_plugin.h │ ├── reflection_pad_2d_plugin.h │ └── reflection_pad_2d_plugin_test.cpp ├── .gitignore ├── LICENSE.md ├── mkdocs.yml ├── CMakeLists.txt ├── CONTRIBUTORS.md ├── setup.py ├── benchmarks ├── JETSON_NANO.md └── JETSON_XAVIER.md ├── test.sh ├── CHANGELOG.md └── CONTRIBUTING.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md 
-------------------------------------------------------------------------------- /tests/converter_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/feature_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /tests/model_tests/timm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model_tests/torchvision/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /torch2trt/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | from .qat import * 2 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/torch2trt/HEAD/CLA.pdf -------------------------------------------------------------------------------- /scripts/test_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdocs serve --dev-addr=0.0.0.0:8000 -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/__init__.py: -------------------------------------------------------------------------------- 1 | from .layers import * 2 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/__init__.py: -------------------------------------------------------------------------------- 1 | from .converters import * 2 | from .layers import * 3 | -------------------------------------------------------------------------------- /requirements/requirements_8.txt: -------------------------------------------------------------------------------- 1 | tensorrt==8.6.1 2 | torch 3 | torchvision 4 | timm 5 | onnx_graphsurgeon -------------------------------------------------------------------------------- /requirements/requirements_10.txt: 
-------------------------------------------------------------------------------- 1 | tensorrt==10.0.1 2 | torch 3 | torchvision 4 | timm 5 | onnx_graphsurgeon -------------------------------------------------------------------------------- /docker/21-06/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-06 -f $(pwd)/docker/21-06/Dockerfile . -------------------------------------------------------------------------------- /docker/21-06/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it --rm -v $(pwd):/torch2trt torch2trt:21-06 -------------------------------------------------------------------------------- /docker/21-09/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-09 -f $(pwd)/docker/21-09/Dockerfile . -------------------------------------------------------------------------------- /docker/l4t-35.1.0/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/l4t-pytorch:r35.1.0-pth1.12-py3 2 | 3 | RUN pip install timm -------------------------------------------------------------------------------- /docker/21-08/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t torch2trt:21-08 -f $(pwd)/docker/21-08/Dockerfile . 4 | -------------------------------------------------------------------------------- /docker/21-09/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it -d --rm -v $(pwd):/torch2trt torch2trt:21-09 -------------------------------------------------------------------------------- /docker/21-08/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | docker run --gpus all -it -d --rm -v $(pwd):/torch2trt torch2trt:21-08 5 | -------------------------------------------------------------------------------- /scripts/release_test_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | mike set-default $TAG 6 | mike serve --dev-addr=0.0.0.0:8000 -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from .QuantConv import * 2 | from .QuantConvBN import * 3 | from .QuantRelu import * 4 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .quant_conv import * 2 | from .quant_activation import * 3 | from ._utils import * 4 | -------------------------------------------------------------------------------- /docs/css/version-select.css: -------------------------------------------------------------------------------- 1 | @media only screen and (max-width:76.1875em) { 2 | #version-selector { 3 | padding: .6rem .8rem; 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /torch2trt/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.unimplemented_converters import * 2 | from .plugin_converters import * 3 | from .native_converters import * -------------------------------------------------------------------------------- /scripts/push_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py > docs/converters.md 6 | 7 | mike deploy $TAG --push 8 | -------------------------------------------------------------------------------- /plugins/src/tests.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 2 | #include -------------------------------------------------------------------------------- /scripts/release_build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md 6 | 7 | mike deploy $TAG -------------------------------------------------------------------------------- /scripts/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GITHUB=$1 4 | TAG=$2 5 | 6 | python3 scripts/dump_converters.py --github=$GITHUB --tag=$TAG > docs/converters.md 7 | -------------------------------------------------------------------------------- /torch2trt/test.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | 3 | print("torch2trt.test is no longer supported. Please implement unit tests in the tests directory instead.") -------------------------------------------------------------------------------- /docker/l4t-35.1.0/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=l4t-35.1.0 4 | 5 | docker build -t torch2trt:$VERSION -f $(pwd)/docker/$VERSION/Dockerfile $(pwd)/docker/$VERSION -------------------------------------------------------------------------------- /scripts/release_push_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAG=$1 4 | 5 | python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md 6 | 7 | mike deploy $TAG --push 8 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='qat', 5 | version="1", 6 | packages=find_packages() 7 | ) 8 | 9 | -------------------------------------------------------------------------------- /docs/images/check.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docker/21-06/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.06-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. 
-DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install -------------------------------------------------------------------------------- /docker/21-09/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.09-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. -DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install -------------------------------------------------------------------------------- /docker/l4t-35.1.0/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=l4t-35.1.0 4 | 5 | 6 | docker run \ 7 | --network host \ 8 | --ipc host \ 9 | --gpus all \ 10 | -it \ 11 | -d \ 12 | --rm \ 13 | --name=torch2trt \ 14 | -v $(pwd):/torch2trt \ 15 | torch2trt:$VERSION -------------------------------------------------------------------------------- /docker/21-08/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.08-py3 2 | 3 | 4 | RUN pip3 install termcolor 5 | 6 | RUN git clone https://github.com/catchorg/Catch2.git && \ 7 | cd Catch2 && \ 8 | cmake -Bbuild -H. -DBUILD_TESTING=OFF && \ 9 | cmake --build build/ --target install 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ninja_deps 2 | .ninja_log 3 | build.ninja 4 | tags 5 | *.o 6 | *.pb.o 7 | torch2trt.egg-info 8 | build/ 9 | dist/ 10 | __pycache__/ 11 | *.so 12 | *.pb.h 13 | *.pb.cc 14 | *_pb2.py 15 | *.pyc 16 | *.ipynb_checkpoints 17 | *.pth 18 | docs/converters.md 19 | site 20 | ToJetsonGrp 21 | .vscode 22 | data -------------------------------------------------------------------------------- /scripts/build_pre_py3.7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exu 2 | 3 | PATCH_DIR="examples/contrib/pre_py3.7/" 4 | PATCH_FILES=( 5 | "fix-getitem.patch" 6 | ) 7 | 8 | for patch_file in "${PATCH_FILES[@]}"; do 9 | patch_file="${PATCH_DIR}""${patch_file}" 10 | git apply "${patch_file}" 11 | done 12 | 13 | python3 setup.py install 14 | -------------------------------------------------------------------------------- /tests/feature_tests/test_version_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch2trt.version_utils 4 | 5 | def test_version_utils(): 6 | 7 | a = torch2trt.version_utils.Version("10.1") 8 | 9 | assert a >= "10.1" 10 | assert a >= "10.0" 11 | assert a > "7.0" 12 | assert a < "11.0" 13 | assert a == "10.1" 14 | assert a <= "10.1" 15 | assert a <= "10.2" -------------------------------------------------------------------------------- /torch2trt/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch2trt import * 2 | from .converters import * 3 | import tensorrt as trt 4 | 5 | def load_plugins(): 6 | import torch2trt.torch_plugins 7 | registry = trt.get_plugin_registry() 8 | torch2trt_creators = [c for c in registry.plugin_creator_list if c.plugin_namespace == 'torch2trt'] 9 | for c in torch2trt_creators: 10 | registry.register_creator(c, 'torch2trt') 11 | 12 | try: 13 | load_plugins() 14 | except: 15 | pass 16 | 
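The ``load_plugins()`` helper in ``torch2trt/__init__.py`` above registers any compiled torch2trt plugin creators under the ``torch2trt`` namespace, and the bare ``try/except`` means importing the package still succeeds when the plugins were never built. Below is a minimal sketch (not part of the repository) of how one might check from user code whether that registration took effect, using only the TensorRT APIs that appear in the file above:

```python
# Hypothetical check, not part of the torch2trt source tree: list the plugin
# creators registered under the 'torch2trt' namespace after importing the package.
import tensorrt as trt
import torch2trt  # importing torch2trt runs load_plugins() if the compiled plugins are available

registry = trt.get_plugin_registry()
creator_names = [c.name for c in registry.plugin_creator_list if c.plugin_namespace == 'torch2trt']
print(creator_names or "no torch2trt plugins registered (package likely installed without --plugins)")
```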
-------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # torch2trt 2 | 3 | 4 | 5 | torch2trt is a PyTorch to TensorRT converter which utilizes the 6 | TensorRT Python API. The converter is 7 | 8 | * Easy to use - Convert modules with a single function call ``torch2trt`` 9 | 10 | * Easy to extend - Write your own layer converter in Python and register it with ``@tensorrt_converter`` 11 | 12 | If you find an issue, please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt/issues)! -------------------------------------------------------------------------------- /tests/feature_tests/test_contiguous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch2trt import torch2trt 3 | 4 | 5 | def test_contiguous(): 6 | 7 | torch.manual_seed(0) 8 | 9 | net = torch.nn.Conv2d(3, 10, kernel_size=3) 10 | net.eval().cuda() 11 | 12 | test_tensor = torch.randn((1, 25, 25, 3)).cuda().permute((0, 3, 1, 2)) 13 | 14 | with torch.no_grad(): 15 | test_out = net(test_tensor) 16 | 17 | with torch.no_grad(): 18 | trt_net = torch2trt(net, [test_tensor]) 19 | test_trt_out = trt_net(test_tensor) 20 | 21 | delta = torch.max((test_out.contiguous() - test_trt_out.contiguous()).abs()) 22 | assert delta < 1e-3, f"Delta: {delta}" 23 | 24 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/README.md: -------------------------------------------------------------------------------- 1 | ## Quantization Aware Training 2 | 3 | This contrib folder provides layers and converters for Quantization Aware Training to convert layers into INT8. 4 | 5 | ### Supported Layers 6 | 7 | - Conv2d 8 | - Conv2d + fused BN 9 | - ReLU 10 | 11 | ### Future Support for Layers 12 | 13 | - Pooling layers 14 | - Linear layer 15 | 16 | ### Supported Quantization Techniques 17 | 18 | - per tensor quantization 19 | - symmetric quantization 20 | 21 | ### Future Support for Quantization Techniques 22 | 23 | - per channel quantization 24 | - asymmetric quantization 25 | 26 | ### Working example 27 | 28 | Please see `examples/quantization_aware_training` 29 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_ne.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt import torch2trt, trt 4 | 5 | def test_tensor_ne(): 6 | 7 | class NotEqual(torch.nn.Module): 8 | def __init__(self): 9 | super(NotEqual, self).__init__() 10 | 11 | def forward(self, x, y): 12 | return x != y 13 | 14 | module = NotEqual().cuda().eval() 15 | 16 | x = torch.randn(1, 3, 40, 20).cuda() 17 | y = torch.randn(1, 3, 1, 20).cuda() 18 | 19 | module_trt = torch2trt(module, [x, y], log_level=trt.Logger.VERBOSE) 20 | 21 | assert torch.all(module_trt(x, y) == module(x, y)) 22 | 23 | 24 | if __name__ == "__main__": 25 | test_tensor_ne() -------------------------------------------------------------------------------- /examples/easyocr/download_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p images 4 | 5 | wget https://user-images.githubusercontent.com/4212806/180578035-e39cae5d-db18-4941-98a8-d697a1ba2336.jpg -O images/image_0.jpg 6 | wget https://user-images.githubusercontent.com/4212806/180578037-98d81133-0e05-4bdf-ac2b-9918cacc8e64.jpg
-O images/image_1.jpg 7 | wget https://user-images.githubusercontent.com/4212806/180578039-ce315b8a-6678-4f25-aa8e-e35e4a5e63dc.jpg -O images/image_2.jpg 8 | wget https://user-images.githubusercontent.com/4212806/180578040-f1d34f29-ce3f-4fc8-9e58-1d009df84959.jpg -O images/image_3.jpg 9 | wget https://user-images.githubusercontent.com/4212806/180578041-25919c7b-f520-4782-8351-fc8de9ffd016.jpg -O images/image_4.jpg -------------------------------------------------------------------------------- /tests/feature_tests/test_flatten_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch2trt import torch2trt 4 | 5 | 6 | def test_flatten_nested_tuple_args(): 7 | 8 | class TestModule(nn.Module): 9 | 10 | def forward(self, x, yz): 11 | return torch.cat([x, yz[0], yz[1]], dim=-1) 12 | 13 | module = TestModule().cuda().eval() 14 | 15 | data = ( 16 | torch.randn(1, 3, 32, 32).cuda(), 17 | ( 18 | torch.randn(1, 3, 32, 32).cuda(), 19 | torch.randn(1, 3, 32, 32).cuda() 20 | ) 21 | ) 22 | 23 | module_trt = torch2trt(module, data) 24 | 25 | out = module(*data) 26 | out_trt = module_trt(*data) 27 | 28 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 29 | 30 | -------------------------------------------------------------------------------- /scripts/build_contrib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone https://github.com/NVIDIA/TensorRT.git /tmp/TensorRT/ 4 | 5 | parentdir="$(dirname "$(pwd)")" 6 | patch="examples/contrib/quantization_aware_training/utils/pytorch_nvidia_quantization.patch" 7 | patch_file="$parentdir/$patch" 8 | 9 | pushd /tmp/TensorRT 10 | cp $patch_file . 11 | git checkout e724d31ab84626ca334b4284703b5048eb698c98 ## keeping this for versioning control 12 | git sparse-checkout init --cone 13 | git sparse-checkout set /tools/pytorch-quantization/ 14 | git apply --reject --whitespace=fix pytorch_nvidia_quantization.patch 15 | cd tools/pytorch-quantization/ 16 | python setup.py install 17 | popd 18 | 19 | pushd $parentdir 20 | python3 setup.py install --plugins --contrib 21 | popd 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_shape_div_batch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt import torch2trt, trt 4 | 5 | def test_div_constant_batch(): 6 | 7 | class DivConstantBatch(torch.nn.Module): 8 | def __init__(self): 9 | super(DivConstantBatch, self).__init__() 10 | self.register_buffer('y', torch.ones((1, 3, 10, 10))) 11 | 12 | def forward(self, x): 13 | return x / self.y 14 | 15 | module = DivConstantBatch().cuda().eval() 16 | 17 | x = torch.randn(1, 3, 10, 10).cuda() 18 | 19 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE) 20 | 21 | assert torch.allclose(module_trt(x), module(x), atol=1e-3, rtol=1e-3) 22 | 23 | 24 | if __name__ == "__main__": 25 | test_div_constant_batch() 26 | -------------------------------------------------------------------------------- /tests/feature_tests/test_save_load.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch2trt 3 | import torchvision 4 | import torch 5 | 6 | 7 | def test_save_load(): 8 | model = torch.nn.Conv2d(3, 3, 1).cuda().eval().half() 9 | data = torch.randn((1, 3, 224, 224)).cuda().half() 10 | 11 | 
print('Running torch2trt...') 12 | model_trt = torch2trt.torch2trt(model, [data], fp16_mode=True, max_workspace_size=1<<25) 13 | 14 | print('Saving model...') 15 | torch.save(model_trt.state_dict(), '.test_model.pth') 16 | 17 | print('Loading model...') 18 | model_trt_2 = torch2trt.TRTModule() 19 | model_trt_2.load_state_dict(torch.load('.test_model.pth')) 20 | 21 | assert(model_trt_2.engine is not None) 22 | 23 | print(torch.max(torch.abs(model_trt_2(data) - model(data)))) 24 | print(torch.max(torch.abs(model_trt_2(data) - model_trt(data)))) -------------------------------------------------------------------------------- /tests/model_tests/timm/test_maxvit.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch2trt 3 | from timm.models.maxxvit import ( 4 | maxvit_tiny_rw_224, 5 | maxvit_rmlp_pico_rw_256, 6 | maxvit_rmlp_small_rw_224 7 | ) 8 | import torch 9 | 10 | 11 | def _cross_validate_module(model, shape=(224, 224)): 12 | model = model.cuda() 13 | data = torch.randn(1, 3, *shape).cuda() 14 | model_trt = torch2trt.torch2trt(model, [data]) 15 | out = model(data) 16 | out_trt = model_trt(data) 17 | assert torch.allclose(out, out_trt, rtol=1e-2, atol=1e-2) 18 | 19 | 20 | def test_maxvit_tiny_rw_224(): 21 | _cross_validate_module(maxvit_tiny_rw_224().cuda().eval(), (224, 224)) 22 | 23 | 24 | def test_maxvit_rmlp_small_rw_224(): 25 | _cross_validate_module(maxvit_rmlp_small_rw_224().cuda().eval(), (224, 224)) 26 | 27 | 28 | if __name__ == "__main__": 29 | test_maxvit_tiny_rw_224() -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantRelu.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | import tensorrt as trt 3 | 4 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_activation.IQuantReLU.forward',enabled=trt_version() >= '7.0') 5 | def convert_QuantReLU(ctx): 6 | module = ctx.method_args[0] 7 | input = ctx.method_args[1] 8 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 9 | output = ctx.method_return 10 | layer = ctx.network.add_activation( 11 | input=input_trt, type=trt.ActivationType.RELU) 12 | 13 | ## int 8 precision 14 | if 'qat_mode' in ctx.torch2trt_kwargs: 15 | amax = module._input_quantizer.learned_amax 16 | layer.precision = trt.int8 17 | layer.set_output_type(0,trt.int8) 18 | out = layer.get_output(0) 19 | out.dynamic_range=(-amax,amax) 20 | 21 | output._trt = layer.get_output(0) 22 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/README.md: -------------------------------------------------------------------------------- 1 | ## Layers 2 | 3 | - Every layer has two implementations (Training and Inference). This is required because the quantization-aware layers quantize the weights / activations in the forward pass. 4 | - If we try to convert the layers into a TRT engine (with quantization happening in the forward pass), then a lot of unwanted ops will be present in the final TRT engine, as Torch2TRT converts all the ops into their TRT equivalent layers. 5 | - Therefore, an inference version of the layer is created so that only the learned parameters (zero point / scale) are carried with the layer for converting the layer into a TRT engine. 6 | 7 | ## Quantization Type 8 | 9 | Currently, TRT7 only supports per-tensor symmetric quantization.
Other quantization techniques (such as per-channel and asymmetric quantization) will be supported once newer versions of TensorRT support them. 10 | 11 | ## Working example 12 | 13 | Please refer to `examples/quantization_aware_training/` for a working example. 14 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /torch2trt/version_utils.py: -------------------------------------------------------------------------------- 1 | import packaging.version 2 | import tensorrt as trt 3 | import torch 4 | 5 | 6 | def trt_version(): 7 | return Version(trt.__version__) 8 | 9 | 10 | def torch_version(): 11 | return Version(torch.__version__) 12 | 13 | 14 | class Version(packaging.version.Version): 15 | 16 | def __ge__(self, other): 17 | if isinstance(other, str): 18 | other = Version(other) 19 | return super().__ge__(other) 20 | 21 | def __le__(self, other): 22 | if isinstance(other, str): 23 | other = Version(other) 24 | return super().__le__(other) 25 | 26 | def __eq__(self, other): 27 | if isinstance(other, str): 28 | other = Version(other) 29 | return super().__eq__(other) 30 | 31 | def __gt__(self, other): 32 | if isinstance(other, str): 33 | other = Version(other) 34 | return super().__gt__(other) 35 | 36 | def __lt__(self, other): 37 | if isinstance(other, str): 38 | other = Version(other) 39 | return super().__lt__(other) 40 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/models/models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Contains basic model definitions 3 | ''' 4 | 5 | import torch 6 | import torch.nn as nn 7 | from utils.utilities import qrelu,qconv2d 8 | 9 | class vanilla_cnn(nn.Module): 10 | def __init__(self,qat_mode=False,infer=False): 11 | super().__init__() 12 | self.qat = qat_mode 13 | self.layer1=qconv2d(3,32,padding=1,qat=qat_mode,infer=infer) 14 | self.layer2=qconv2d(32,64,padding=1,qat=qat_mode,infer=infer) 15 | self.layer3=qconv2d(64,128,padding=1,qat=qat_mode,infer=infer) 16 | self.layer4=qconv2d(128,256,padding=1,qat=qat_mode,infer=infer) 17 | self.layer5 = nn.MaxPool2d(kernel_size=2,stride=8) 18 | self.fcs = nn.Sequential( 19 |
nn.Linear(4096,1024), 20 | nn.ReLU(), 21 | nn.Linear(1024,512), 22 | nn.ReLU(), 23 | nn.Linear(512,10)) 24 | 25 | def forward(self,x): 26 | x = self.layer1(x) 27 | x = self.layer2(x) 28 | x = self.layer3(x) 29 | x = self.layer4(x) 30 | x = self.layer5(x) 31 | x = x.view(x.size(0),-1) 32 | x = self.fcs(x) 33 | return x 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Follow these steps to get started using torch2trt. 4 | 5 | !!! note 6 | 7 | torch2trt depends on the TensorRT Python API. On Jetson, this is included with the latest JetPack. For desktop, please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html). You may also try installing torch2trt inside one of the NGC PyTorch docker containers for [Desktop](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or [Jetson](https://ngc.nvidia.com/catalog/containers/nvidia:l4t-pytorch). 8 | 9 | ### Install Without plugins 10 | 11 | To install without compiling plugins, call the following 12 | 13 | ```bash 14 | git clone https://github.com/NVIDIA-AI-IOT/torch2trt 15 | cd torch2trt 16 | python setup.py install 17 | ``` 18 | 19 | ### Install With plugins 20 | 21 | To install with plugins to support some operations in PyTorch that are not natively supported with TensorRT, call the following 22 | 23 | !!! note 24 | 25 | Please note, this currently only includes the interpolate plugin. This plugin requires PyTorch 1.3+ for serialization. 26 | 27 | ```bash 28 | git clone https://github.com/NVIDIA-AI-IOT/torch2trt 29 | cd torch2trt 30 | sudo python setup.py install --plugins 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /torch2trt/converters/unimplemented_converters.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | 3 | 4 | def is_private(method): 5 | method = method.split('.')[-1] # remove prefix 6 | return method[0] == '_' and method[1] != '_' 7 | 8 | def is_function_type(method): 9 | fntype = eval(method + '.__class__.__name__') 10 | return fntype == 'function' or fntype == 'builtin_function_or_method' or fntype == 'method_descriptor' 11 | 12 | def get_methods(namespace): 13 | methods = [] 14 | for method in dir(eval(namespace)): 15 | full_method = namespace + '.' + method 16 | if not is_private(full_method) and is_function_type(full_method): 17 | methods.append(full_method) 18 | return methods 19 | 20 | 21 | TORCH_METHODS = [] 22 | TORCH_METHODS += get_methods('torch') 23 | TORCH_METHODS += get_methods('torch.Tensor') 24 | TORCH_METHODS += get_methods('torch.nn.functional') 25 | 26 | 27 | for method in TORCH_METHODS: 28 | 29 | @tensorrt_converter(method, is_real=False) 30 | def warn_method(ctx): 31 | print('Warning: Encountered known unsupported method %s' % ctx.method_str) 32 | 33 | 34 | @tensorrt_converter('torch.Tensor.dim', is_real=False) 35 | @tensorrt_converter('torch.Tensor.size', is_real=False) 36 | def dont_warn(ctx): 37 | pass 38 | -------------------------------------------------------------------------------- /examples/easyocr/README.md: -------------------------------------------------------------------------------- 1 | # torch2trt EasyOCR Example 2 | 3 | This example uses torch2trt to optimize EasyOCR.
EasyOCR is split into 4 | two TensorRT engines, one for the detector, one for the recognizer. 5 | 6 | To run the example, follow these steps 7 | 8 | 1. Download example images 9 | 10 | ```bash 11 | ./download_images.sh 12 | ``` 13 | 14 | 2. Generate data for shape inference and calibration. By default this script will look in the ``images`` directory. 15 | 16 | ```bash 17 | python3 generate_data.py 18 | ``` 19 | 20 | 3. Optimize the Text Detector. This will use the data from step 2 for shape inference and calibration. It creates a file ``detector_trt.pth``. 21 | 22 | ```bash 23 | python3 optimize_detector.py 24 | ``` 25 | 26 | 4. Optimize the Text Recognizer. This also uses the data generated in step 2. It creates a file ``recognizer_trt.pth``. 27 | 28 | ```bash 29 | python3 optimize_recognizer.py 30 | ``` 31 | 5. Run the pipeline end to end and compare the performance to the original PyTorch model. 32 | 33 | ```bash 34 | python3 run_end2end.py 35 | ``` 36 | 37 | That's it! To use the model in your application, reference these scripts for more details. Specifically, reference 38 | ``run_end2end.py`` to see how to create and execute the full model pipeline. -------------------------------------------------------------------------------- /torch2trt/flatten_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .flattener import Flattener 4 | 5 | 6 | class Unflatten(nn.Module): 7 | 8 | def __init__(self, module, input_flattener=None, output_flattener=None): 9 | super().__init__() 10 | self.module = module 11 | self.input_flattener = input_flattener 12 | self.output_flattener = output_flattener 13 | 14 | def forward(self, *args): 15 | if self.input_flattener is not None: 16 | args = self.input_flattener.flatten(args) 17 | output = self.module(*args) 18 | if self.output_flattener is not None: 19 | output = self.output_flattener.unflatten(output) 20 | return output 21 | 22 | 23 | class Flatten(nn.Module): 24 | 25 | def __init__(self, module, input_flattener=None, output_flattener=None): 26 | super().__init__() 27 | self.module = module 28 | self.input_flattener = input_flattener 29 | self.output_flattener = output_flattener 30 | 31 | def forward(self, *args): 32 | if self.input_flattener is not None: 33 | args = self.input_flattener.unflatten(*args) 34 | output = self.module(*args) 35 | if self.output_flattener is not None: 36 | output = self.output_flattener.flatten(output) 37 | return output -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: torch2trt 2 | theme: 3 | name: "material" 4 | palette: 5 | primary: green 6 | secondary: light green 7 | 8 | repo_url: https://github.com/NVIDIA-AI-IOT/torch2trt 9 | 10 | plugins: 11 | - search 12 | 13 | use_directory_urls: False 14 | 15 | edit_uri: blob/master/docs 16 | markdown_extensions: 17 | - pymdownx.tabbed 18 | - pymdownx.keys 19 | - pymdownx.snippets 20 | - pymdownx.inlinehilite 21 | - pymdownx.highlight: 22 | use_pygments: true 23 | - admonition 24 | - pymdownx.details 25 | - pymdownx.superfences 26 | - attr_list 27 | 28 | # use_directory_urls - False to fix broken raw html image links 29 | # https://github.com/mkdocs/mkdocs/issues/991 30 | 31 | 32 | nav: 33 | 34 | - Home: index.md 35 | - Getting Started: getting_started.md 36 | - Usage: 37 | - Basic Usage: usage/basic_usage.md 38 | - Reduced Precision:
usage/reduced_precision.md 39 | - Custom Converter: usage/custom_converter.md 40 | - Converters: converters.md 41 | - Benchmarks: 42 | - Jetson Nano: benchmarks/jetson_nano.md 43 | - Jetson Xavier: benchmarks/jetson_xavier.md 44 | - Contributing: CONTRIBUTING.md 45 | - See Also: see_also.md 46 | 47 | extra_css: 48 | - css/version-select.css 49 | extra_javascript: 50 | - js/version-select.js 51 | 52 | google_analytics: 53 | - UA-135919510-3 54 | - auto 55 | 56 | -------------------------------------------------------------------------------- /tests/feature_tests/test_legacy_max_batch_size.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch2trt import torch2trt 4 | 5 | 6 | def test_legacy_max_batch_size(): 7 | 8 | model = nn.Conv2d(3, 6, kernel_size=1).cuda().eval() 9 | 10 | data = torch.randn(1, 3, 32, 32).cuda() 11 | 12 | model_trt = torch2trt(model, [data], max_batch_size=4) 13 | 14 | 15 | data = torch.randn(1, 3, 32, 32).cuda() 16 | out = model(data) 17 | out_trt = model_trt(data) 18 | 19 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 20 | 21 | 22 | data = torch.randn(4, 3, 32, 32).cuda() 23 | out = model(data) 24 | out_trt = model_trt(data) 25 | 26 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 27 | 28 | def test_legacy_max_batch_size_conv1d(): 29 | 30 | model = nn.Conv1d(10, 20, kernel_size=1).cuda().eval() 31 | 32 | data = torch.randn(1, 10, 32).cuda() 33 | 34 | model_trt = torch2trt(model, [data], max_batch_size=4, use_onnx=False) 35 | 36 | 37 | data = torch.randn(1, 10, 32).cuda() 38 | out = model(data) 39 | out_trt = model_trt(data) 40 | 41 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 42 | 43 | 44 | data = torch.randn(4, 10, 32).cuda() 45 | out = model(data) 46 | out_trt = model_trt(data) 47 | 48 | assert(torch.allclose(out, out_trt, atol=1e-3, rtol=1e-3)) 49 | 50 | if __name__ == '__main__': 51 | test_legacy_max_batch_size_conv1d() -------------------------------------------------------------------------------- /tests/model_tests/torchvision/test_segmentation_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch2trt 4 | 5 | 6 | class ModelWrapper(torch.nn.Module): 7 | def __init__(self, model): 8 | super(ModelWrapper, self).__init__() 9 | self.model = model 10 | def forward(self, x): 11 | return self.model(x)['out'] 12 | 13 | 14 | def _cross_validate_module(model, shape=(224, 224)): 15 | model = model.cuda().eval() 16 | data = torch.randn(1, 3, *shape).cuda() 17 | model_trt = torch2trt.torch2trt(model, [data]) 18 | data = torch.randn(1, 3, *shape).cuda() 19 | out = model(data) 20 | out_trt = model_trt(data) 21 | assert torch.allclose(out, out_trt, rtol=1e-2, atol=1e-2) 22 | 23 | 24 | 25 | def test_deeplabv3_resnet50(): 26 | bb = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=False) 27 | model = ModelWrapper(bb) 28 | _cross_validate_module(model) 29 | 30 | 31 | def test_deeplabv3_resnet101(): 32 | bb = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=False) 33 | model = ModelWrapper(bb) 34 | _cross_validate_module(model) 35 | 36 | 37 | def test_fcn_resnet50(): 38 | bb = torchvision.models.segmentation.fcn_resnet50(pretrained=False) 39 | model = ModelWrapper(bb) 40 | _cross_validate_module(model) 41 | 42 | 43 | def test_fcn_resnet101(): 44 | bb = torchvision.models.segmentation.fcn_resnet101(pretrained=False) 45 | model = 
ModelWrapper(bb) 46 | _cross_validate_module(model) -------------------------------------------------------------------------------- /examples/contrib/pre_py3.7/fix-getitem.patch: -------------------------------------------------------------------------------- 1 | From d9b35495da58038fd5045cc0e2c1f0416f8e62f0 Mon Sep 17 00:00:00 2001 2 | From: Chao Zhang 3 | Date: Tue, 21 Jun 2022 15:38:23 +0000 4 | Subject: [PATCH] Fix getitem for Py<3.7 5 | 6 | --- 7 | torch2trt/torch2trt.py | 13 ++++++++++++- 8 | 1 file changed, 12 insertions(+), 1 deletion(-) 9 | 10 | diff --git a/torch2trt/torch2trt.py b/torch2trt/torch2trt.py 11 | index 3aa6946..9528f1a 100644 12 | --- a/torch2trt/torch2trt.py 13 | +++ b/torch2trt/torch2trt.py 14 | @@ -310,6 +310,14 @@ def attach_converter(ctx, method, converter, method_str): 15 | return wrapper 16 | 17 | 18 | +def _getitem_wrapper(method=torch.Tensor.__getitem__): 19 | + def wrapper(arg0, arg1): 20 | + if type(arg1) is torch.Tensor: 21 | + arg1 = (arg1, ) 22 | + return method(arg0, arg1) 23 | + return wrapper 24 | + 25 | + 26 | class ConversionHook(object): 27 | """Attaches TensorRT converter to PyTorch method call""" 28 | 29 | @@ -330,7 +338,10 @@ class ConversionHook(object): 30 | ) 31 | 32 | def __exit__(self, type, val, tb): 33 | - self._set_method(self.converter['method_impl']) 34 | + if '__getitem__' in self.converter['method_str']: 35 | + self._set_method(_getitem_wrapper()) 36 | + else: 37 | + self._set_method(self.converter['method_impl']) 38 | 39 | def default_input_names(num_inputs): 40 | return ["input_%d" % i for i in range(num_inputs)] 41 | -- 42 | 2.32.0 43 | 44 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0.0) 2 | project(torch2trt_plugins VERSION 0.1.0) 3 | 4 | # VARIABLES 5 | set(CUDA_ARCHITECTURES 53 62 72 87) 6 | 7 | # BUILD PLUGINS LIBRARY 8 | find_package(CUDA REQUIRED) 9 | 10 | enable_language(CUDA) 11 | 12 | include_directories("${CUDA_INCLUDE_DIRS}") 13 | 14 | add_library(torch2trt_plugins SHARED 15 | plugins/src/example_plugin.cu 16 | plugins/src/reflection_pad_2d_plugin.cu 17 | ) 18 | set_property(TARGET torch2trt_plugins PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) 19 | 20 | target_link_libraries( 21 | torch2trt_plugins 22 | nvinfer 23 | ${CUDA_LIBRARIES} 24 | ) 25 | 26 | install (TARGETS torch2trt_plugins 27 | LIBRARY DESTINATION lib) 28 | 29 | # BUILD TESTS 30 | find_package(Catch2 QUIET) 31 | 32 | if(Catch2_FOUND) 33 | include(CTest) 34 | include(CPack) 35 | include(Catch) 36 | enable_testing() 37 | 38 | add_executable(torch2trt_plugins_test 39 | plugins/src/tests.cpp 40 | plugins/src/example_plugin_test.cpp 41 | plugins/src/reflection_pad_2d_plugin_test.cpp 42 | ) 43 | 44 | set_property(TARGET torch2trt_plugins_test PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) 45 | 46 | target_link_libraries(torch2trt_plugins_test 47 | PRIVATE 48 | Catch2::Catch2WithMain 49 | torch2trt_plugins 50 | nvinfer 51 | ${CUDA_LIBRARIES} 52 | ) 53 | 54 | set(CPACK_PROJECT_NAME ${PROJECT_NAME}) 55 | set(CPACK_PROJECT_VERSION ${PROJECT_VERSION}) 56 | catch_discover_tests(torch2trt_plugins_test) 57 | endif() -------------------------------------------------------------------------------- /torch2trt/plugins/plugins.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "interpolate.cpp" 3 | #include "group_norm.cpp" 4 | 5 | 6 
| using namespace nvinfer1; 7 | 8 | namespace torch2trt { 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | py::class_(m, "InterpolatePlugin") 11 | .def(py::init, std::string, bool>(), py::arg("size"), py::arg("mode"), py::arg("align_corners")) 12 | .def(py::init(), py::arg("data")) 13 | .def("getSerializationSize", &InterpolatePlugin::getSerializationSize) 14 | .def("deserializeFromString", &InterpolatePlugin::deserializeFromString) 15 | .def("serializeToString", [](const InterpolatePlugin& plugin) { 16 | std::string data = plugin.serializeToString(); 17 | return py::bytes(data); 18 | }); 19 | py::class_(m, "GroupNormPlugin") 20 | .def(py::init(), py::arg("num_groups"), py::arg("weight"), py::arg("bias"), py::arg("eps")) 21 | .def(py::init(), py::arg("data")) 22 | .def("getSerializationSize", &GroupNormPlugin::getSerializationSize) 23 | .def("deserializeFromString", &GroupNormPlugin::deserializeFromString) 24 | .def("serializeToString", [](const GroupNormPlugin& plugin) { 25 | std::string data = plugin.serializeToString(); 26 | return py::bytes(data); 27 | }); 28 | 29 | } 30 | } // namespace torch2trt 31 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/quant_activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from . import _utils 3 | from pytorch_quantization import tensor_quant 4 | from pytorch_quantization.nn.modules import _utils as utils 5 | 6 | class QuantReLU(torch.nn.ReLU,utils.QuantInputMixin): 7 | """ 8 | Quantized ReLU. However, the output of the ReLU needs to be quantized for it to correctly map to a TRT layer 9 | """ 10 | default_quant_desc_input = tensor_quant.QUANT_DESC_8BIT_PER_TENSOR 11 | 12 | def __init__(self,inplace=False,**kwargs): 13 | super(QuantReLU,self).__init__(inplace) 14 | quant_desc_input = _utils.pop_quant_desc_in_kwargs(self.__class__, input_only=True, **kwargs) 15 | self.init_quantizer(quant_desc_input) 16 | 17 | def forward(self,input): 18 | output = super(QuantReLU,self).forward(input) 19 | ## Although o/p of relu is being quantized, terminology still says input quantizer, will change later 20 | output = self._input_quantizer(output) 21 | return output 22 | 23 | ## Inference class for quantized relu 24 | class IQuantReLU(torch.nn.ReLU,_utils.QuantMixinInput): 25 | ''' 26 | Mimicking inference side for relu followed by a quantized layer 27 | ''' 28 | def __init__(self,inplace=False): 29 | super().__init__(inplace) 30 | self.init_quantizer() 31 | 32 | def __repr__(self): 33 | s = super().__repr__() 34 | s = "(" + s + "dynamic_range amax {0:.4f})".format(self._input_quantizer.learned_amax) 35 | return s 36 | 37 | 38 | def forward(self,inputs): 39 | return super(IQuantReLU,self).forward(inputs) 40 | 41 | -------------------------------------------------------------------------------- /docs/usage/basic_usage.md: -------------------------------------------------------------------------------- 1 | # Basic Usage 2 | 3 | This page demonstrates basic torch2trt usage. 4 | 5 | ## Conversion 6 | 7 | You can easily convert a PyTorch module by calling ``torch2trt``, passing example data as input, for example to convert ``alexnet`` we call 8 | 9 | ```python 10 | import torch 11 | from torch2trt import torch2trt 12 | from torchvision.models.alexnet import alexnet 13 | 14 | # create some regular pytorch model...
15 | model = alexnet(pretrained=True).eval().cuda() 16 | 17 | # create example data 18 | x = torch.ones((1, 3, 224, 224)).cuda() 19 | 20 | # convert to TensorRT feeding sample data as input 21 | model_trt = torch2trt(model, [x]) 22 | ``` 23 | 24 | !!! note 25 | 26 | Currently with torch2trt, once the model is converted, you must use the same input shapes during 27 | execution. The exception is 28 | the batch size, which can vary up to the value specified by the ``max_batch_size`` parameter. 29 | 30 | ## Execution 31 | 32 | We can execute the returned ``TRTModule`` just like the original PyTorch model. Here we 33 | execute the model and print the maximum absolute error. 34 | 35 | ```python 36 | y = model(x) 37 | y_trt = model_trt(x) 38 | 39 | # check the output against PyTorch 40 | print(torch.max(torch.abs(y - y_trt))) 41 | ``` 42 | 43 | ## Saving and loading 44 | 45 | We can save the model as a ``state_dict``. 46 | 47 | ```python 48 | torch.save(model_trt.state_dict(), 'alexnet_trt.pth') 49 | ``` 50 | 51 | We can load the saved model into a ``TRTModule``. 52 | 53 | ```python 54 | from torch2trt import TRTModule 55 | 56 | model_trt = TRTModule() 57 | 58 | model_trt.load_state_dict(torch.load('alexnet_trt.pth')) 59 | ``` 60 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/utils/pytorch_nvidia_quantization.patch: -------------------------------------------------------------------------------- 1 | From fa12201005e221fc6de8b0d836fdd60c0a107aaa Mon Sep 17 00:00:00 2001 2 | From: Kshitij Srivastava 3 | Date: Wed, 4 Nov 2020 18:01:14 -0500 4 | Subject: [PATCH] saving learned amax as a part of state dict 5 | 6 | --- 7 | .../pytorch_quantization/nn/modules/tensor_quantizer.py | 5 +++++ 8 | 1 file changed, 5 insertions(+) 9 | 10 | diff --git a/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py b/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 11 | index fd3f32c..d26c585 100644 12 | --- a/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 13 | +++ b/tools/pytorch-quantization/pytorch_quantization/nn/modules/tensor_quantizer.py 14 | @@ -87,6 +87,10 @@ class TensorQuantizer(nn.Module): 15 | 16 | if quant_desc.amax is not None: 17 | self.register_buffer('_amax', torch.tensor(quant_desc.amax)) 18 | + 19 | + ##dynamic amax needs to be stored as a part of state dict to be used at inference time to map dynamic range to 20 | + # TRT layer 21 | + self.register_buffer('learned_amax',torch.tensor(1)) 22 | 23 | # Clip module consumes a lot of memory, so only create it if learn_amax is True 24 | if self._learn_amax: 25 | @@ -273,6 +277,7 @@ class TensorQuantizer(nn.Module): 26 | if self._scale_amax is not None: 27 | amax = amax.detach() * self._scale_amax 28 | 29 | + self.learned_amax = amax 30 | return amax 31 | 32 | def _fb_fake_quant(self, inputs, amax): 33 | -- 34 | 2.29.2 35 | 36 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | Below is a list of developers who have contributed to torch2trt. This is also used to track contributors 4 | who have agreed to torch2trt's Contributor License Agreement.
5 | 6 | - [John Welsh](https://github.com/jaybdub) (CLA) 7 | - John Welsh 8 | 9 | ## Becoming a Contributor 10 | 11 | If you've made a notable contribution to torch2trt and wish to be listed as a contributor, simply do the following. 12 | 13 | 1. Modify ``CONTRIBUTORS.md`` and add your name with a hyperlink to your GitHub account to the end of the contributors list. 14 | 15 | ```md 16 | - [](https://github.com/) 17 | ``` 18 | 19 | 2. Stage the changes in a standalone commit 20 | 21 | ```md 22 | git add CONTRIBUTORS.md 23 | ``` 24 | 25 | 3. Make a signed commit with the following message text 26 | 27 | ```md 28 | git commit -m "Added to CONTRIBUTORS.md" 29 | ``` 30 | 31 | ## Signing Contributor License Agreement (CLA) 32 | 33 | In some instances, you may be requested to sign torch2trt's Contributor License Agreement (CLA). To do so, 34 | 35 | 1. If you're not already listed as a contributor in CONTRIBUTORS.md, make a commit as described above to add yourself to CONTRIBUTORS.md 36 | 37 | 2. Add the text ``(CLA)`` after your name in ``CONTRIBUTORS.md`` 38 | 3. Stage the changes in a standalone commit 39 | 40 | ```md 41 | git add CONTRIBUTORS.md 42 | ``` 43 | 4. Make a signed commit with the following text 44 | 45 | ```md 46 | git commit -S -m "I have read and agree to the Contributor License Agreement as written in the file CLA.md of this project. Signed, " 47 | ``` 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorrt 3 | import torch 4 | from setuptools import setup, find_packages 5 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension 6 | from packaging import version 7 | 8 | 9 | def trt_inc_dir(): 10 | return "/usr/include/aarch64-linux-gnu" 11 | 12 | def trt_lib_dir(): 13 | return "/usr/lib/aarch64-linux-gnu" 14 | 15 | ext_modules = [] 16 | exclude_dir = ["torch2trt/contrib","torch2trt/contrib.*"] 17 | 18 | compile_args_cxx = [] 19 | if version.parse(torch.__version__) < version.parse('1.5'): 20 | compile_args_cxx.append('-DUSE_DEPRECATED_INTLIST') 21 | if version.parse(tensorrt.__version__) < version.parse('8'): 22 | compile_args_cxx.append('-DPRE_TRT8') 23 | 24 | plugins_ext_module = CUDAExtension( 25 | name='plugins', 26 | sources=[ 27 | 'torch2trt/plugins/plugins.cpp' 28 | ], 29 | include_dirs=[ 30 | trt_inc_dir() 31 | ], 32 | library_dirs=[ 33 | trt_lib_dir() 34 | ], 35 | libraries=[ 36 | 'nvinfer' 37 | ], 38 | extra_compile_args={ 39 | 'cxx': compile_args_cxx, 40 | 'nvcc': [] 41 | } 42 | ) 43 | 44 | if '--plugins' in sys.argv: 45 | ext_modules.append(plugins_ext_module) 46 | sys.argv.remove('--plugins') 47 | 48 | if '--contrib' in sys.argv: 49 | exclude_dir=[] 50 | sys.argv.remove('--contrib') 51 | 52 | setup( 53 | name='torch2trt', 54 | version='0.5.0', 55 | description='An easy to use PyTorch to TensorRT converter', 56 | packages=find_packages(exclude=exclude_dir), 57 | ext_package='torch2trt', 58 | ext_modules=ext_modules, 59 | cmdclass={'build_ext': BuildExtension} 60 | ) 61 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torchvision.transforms as transforms 4 | 5 | class Cifar10Loaders: 6 | """ 7 | Data loaders for cifar 10 dataset 8 | """ 9 | def 
__init__(self, data_dir='/tmp/cifar10', download=True, batch_size=128, pin_memory=True, num_workers=4): 10 | self.data_dir = data_dir 11 | self.download = download 12 | self.batch_size= batch_size 13 | self.pin_memory = pin_memory 14 | self.num_workers = num_workers 15 | self.train_transform = transforms.Compose([ 16 | transforms.RandomCrop(32, padding=4), 17 | transforms.RandomHorizontalFlip(), 18 | transforms.ToTensor(), 19 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 20 | ]) 21 | self.test_transform = transforms.Compose([ 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 24 | ]) 25 | 26 | def train_loader(self,shuffle=True): 27 | trainset = torchvision.datasets.CIFAR10(root=self.data_dir, train=True, download=True, transform=self.train_transform) 28 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers, pin_memory=self.pin_memory) 29 | return trainloader 30 | 31 | def test_loader(self,shuffle=False): 32 | testset = torchvision.datasets.CIFAR10(root=self.data_dir, train=False, download=True, transform=self.test_transform) 33 | testloader = torch.utils.data.DataLoader(testset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers, pin_memory=self.pin_memory) 34 | return testloader 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/easyocr/generate_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import cv2 3 | import torch 4 | import glob 5 | from easyocr import Reader 6 | from torch2trt.dataset import FolderDataset 7 | from torch2trt import torch2trt, TRTModule 8 | import math 9 | import os 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument('--images', type=str, default='images') 13 | parser.add_argument('--detector_data', type=str, default='detector_data') 14 | parser.add_argument('--recognizer_data', type=str, default='recognizer_data') 15 | parser.add_argument('--max_image_area', type=int, default=1280*720) 16 | parser.add_argument('--recognizer_batch_size', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | 20 | reader = Reader(['en']) 21 | 22 | 23 | detector_dataset = FolderDataset(args.detector_data) 24 | recognizer_dataset = FolderDataset(args.recognizer_data) 25 | 26 | 27 | def shrink_to_area(image, area): 28 | height = image.shape[0] 29 | width = image.shape[1] 30 | 31 | if height * width > area: 32 | ar = width / height 33 | new_height = math.sqrt(area / ar) 34 | new_width = ar * new_height 35 | new_height = math.floor(new_height) 36 | new_width = math.floor(new_width) 37 | print(f'Resizing {width}x{height} to {new_width}x{new_height}') 38 | image = cv2.resize(image, (new_width, new_height)) 39 | 40 | return image 41 | 42 | 43 | with detector_dataset.record(reader.detector.module): 44 | with recognizer_dataset.record(reader.recognizer.module): 45 | 46 | for path in glob.glob(os.path.join(args.images, '*.jpg')): 47 | print(path) 48 | 49 | image = cv2.imread(path) 50 | 51 | image = shrink_to_area(image, args.max_image_area) 52 | 53 | reader.readtext(image, batch_size=args.recognizer_batch_size) 54 | -------------------------------------------------------------------------------- /docs/js/version-select.js: -------------------------------------------------------------------------------- 1 | window.addEventListener("DOMContentLoaded", 
function() { 2 | // This is a bit hacky. Figure out the base URL from a known CSS file the 3 | // template refers to... 4 | var ex = new RegExp("/?css/version-select.css$"); 5 | var sheet = document.querySelector('link[href$="version-select.css"]'); 6 | 7 | var ABS_BASE_URL = sheet.href.replace(ex, ""); 8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); 9 | 10 | function makeSelect(options, selected) { 11 | var select = document.createElement("select"); 12 | select.classList.add("form-control"); 13 | 14 | options.forEach(function(i) { 15 | var option = new Option(i.text, i.value, undefined, 16 | i.value === selected); 17 | select.add(option); 18 | }); 19 | 20 | return select; 21 | } 22 | 23 | var xhr = new XMLHttpRequest(); 24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json"); 25 | xhr.onload = function() { 26 | var versions = JSON.parse(this.responseText); 27 | 28 | var realVersion = versions.find(function(i) { 29 | return i.version === CURRENT_VERSION || 30 | i.aliases.includes(CURRENT_VERSION); 31 | }).version; 32 | 33 | var select = makeSelect(versions.map(function(i) { 34 | return {text: i.title, value: i.version}; 35 | }), realVersion); 36 | select.addEventListener("change", function(event) { 37 | window.location.href = ABS_BASE_URL + "/../" + this.value; 38 | }); 39 | 40 | var container = document.createElement("div"); 41 | container.id = "version-selector"; 42 | container.className = "md-nav__item"; 43 | container.appendChild(select); 44 | 45 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); 46 | sidebar.parentNode.insertBefore(container, sidebar); 47 | }; 48 | xhr.send(); 49 | }); 50 | -------------------------------------------------------------------------------- /tests/feature_tests/test_interpolate_dynamic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from torch2trt import ( 5 | torch2trt, 6 | trt 7 | ) 8 | 9 | 10 | def test_interpolate_dynamic_size(): 11 | 12 | class TestModule(torch.nn.Module): 13 | def forward(self, x): 14 | size = x.size() 15 | return F.interpolate(x, size=(size[2]*2, size[3]*3)) 16 | 17 | module = TestModule().cuda().eval() 18 | 19 | x = torch.randn(1, 3, 32, 32).cuda() 20 | 21 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, min_shapes=[(1, 3, 32, 32)], max_shapes=[(4, 3, 64, 64)], opt_shapes=[(1, 3, 32, 32)]) 22 | 23 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 24 | 25 | x = torch.randn(1, 3, 32, 32).cuda() 26 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 27 | 28 | x = torch.randn(4, 3, 64, 64).cuda() 29 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 30 | 31 | 32 | def test_interpolate_dynamic_shape(): 33 | 34 | class TestModule(torch.nn.Module): 35 | def forward(self, x): 36 | size = x.shape 37 | return F.interpolate(x, size=(size[2]*2, size[3]*3)) 38 | 39 | module = TestModule().cuda().eval() 40 | 41 | x = torch.randn(1, 3, 32, 32).cuda() 42 | 43 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, min_shapes=[(1, 3, 32, 32)], max_shapes=[(4, 3, 64, 64)], opt_shapes=[(1, 3, 32, 32)]) 44 | 45 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 46 | 47 | x = torch.randn(1, 3, 32, 32).cuda() 48 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 49 | 50 | x = torch.randn(4, 3, 64, 64).cuda() 51 | assert(torch.allclose(module_trt(x), module(x), 
atol=1e-2, rtol=1e-2)) 52 | -------------------------------------------------------------------------------- /torch2trt/dataset_calibrator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import tensorrt as trt 3 | import os 4 | from .flattener import Flattener 5 | 6 | __all__ = [ 7 | 'DEFAULT_CALIBRATION_ALGORITHM', 8 | 'DatasetCalibrator' 9 | ] 10 | 11 | 12 | if trt.__version__ >= '5.1': 13 | DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2 14 | else: 15 | DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION 16 | 17 | 18 | class DatasetCalibrator(trt.IInt8Calibrator): 19 | 20 | def __init__(self, dataset, algorithm=DEFAULT_CALIBRATION_ALGORITHM, cache_file=None, flattener=None): 21 | super(DatasetCalibrator, self).__init__() 22 | self.dataset = dataset 23 | self.algorithm = algorithm 24 | self.count = 0 25 | self.cache_file = cache_file 26 | if flattener is None: 27 | flattener = Flattener.from_value(dataset[0]) 28 | self.flattener = flattener 29 | 30 | def get_batch(self, *args, **kwargs): 31 | if self.count < len(self.dataset): 32 | tensors = self.flattener.flatten(self.dataset[self.count]) 33 | bindings = [int(t.data_ptr()) for t in tensors] 34 | self.count += 1 35 | return bindings 36 | else: 37 | return [] 38 | 39 | def get_algorithm(self): 40 | return self.algorithm 41 | 42 | def get_batch_size(self): 43 | return 1 44 | 45 | def read_calibration_cache(self, *args, **kwargs): 46 | if (self.cache_file is not None) and os.path.exists(self.cache_file): 47 | with open(self.cache_file, 'rb') as f: 48 | return f.read() 49 | 50 | def write_calibration_cache(self, cache, *args, **kwargs): 51 | if self.cache_file is not None: 52 | with open(self.cache_file, 'wb') as f: 53 | f.write(cache) 54 | -------------------------------------------------------------------------------- /torch2trt/misc_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import tensorrt as trt 4 | 5 | from .version_utils import ( 6 | trt_version 7 | ) 8 | 9 | 10 | def torch_dtype_to_trt(dtype): 11 | if trt_version() >= '7.0' and dtype == torch.bool: 12 | return trt.bool 13 | elif dtype == torch.int8: 14 | return trt.int8 15 | elif dtype == torch.int32: 16 | return trt.int32 17 | elif dtype == torch.float16: 18 | return trt.float16 19 | elif dtype == torch.float32: 20 | return trt.float32 21 | else: 22 | raise TypeError("%s is not supported by tensorrt" % dtype) 23 | 24 | 25 | def torch_dtype_from_trt(dtype): 26 | if dtype == trt.int8: 27 | return torch.int8 28 | elif trt_version() >= '7.0' and dtype == trt.bool: 29 | return torch.bool 30 | elif dtype == trt.int32: 31 | return torch.int32 32 | elif dtype == trt.float16: 33 | return torch.float16 34 | elif dtype == trt.float32: 35 | return torch.float32 36 | else: 37 | raise TypeError("%s is not supported by torch" % dtype) 38 | 39 | 40 | def torch_device_to_trt(device): 41 | if device.type == torch.device("cuda").type: 42 | return trt.TensorLocation.DEVICE 43 | elif device.type == torch.device("cpu").type: 44 | return trt.TensorLocation.HOST 45 | else: 46 | return TypeError("%s is not supported by tensorrt" % device) 47 | 48 | 49 | def torch_device_from_trt(device): 50 | if device == trt.TensorLocation.DEVICE: 51 | return torch.device("cuda") 52 | elif device == trt.TensorLocation.HOST: 53 | return torch.device("cpu") 54 | else: 55 | return TypeError("%s is not supported by torch" % 
device) 56 | 57 | 58 | def trt_int_dtype(): 59 | if trt_version() >= "10.0": 60 | return np.int64 61 | else: 62 | return np.int32 63 | 64 | -------------------------------------------------------------------------------- /torch2trt/converters/plugin_converters.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch2trt.torch2trt import * 4 | import numpy as np 5 | import ctypes 6 | 7 | 8 | try: 9 | ctypes.CDLL('libtorch2trt_plugins.so') 10 | 11 | def create_reflection_pad_2d_plugin(paddingLeft, paddingRight, paddingTop, paddingBottom): 12 | 13 | registry = trt.get_plugin_registry() 14 | creator = registry.get_plugin_creator('ReflectionPad2dPlugin', '1', '') 15 | 16 | fc = trt.PluginFieldCollection([ 17 | trt.PluginField( 18 | 'paddingLeft', 19 | np.array([paddingLeft]).astype(np.int32), 20 | trt.PluginFieldType.INT32 21 | ), 22 | trt.PluginField( 23 | 'paddingRight', 24 | np.array([paddingRight]).astype(np.int32), 25 | trt.PluginFieldType.INT32 26 | ), 27 | trt.PluginField( 28 | 'paddingTop', 29 | np.array([paddingTop]).astype(np.int32), 30 | trt.PluginFieldType.INT32 31 | ), 32 | trt.PluginField( 33 | 'paddingBottom', 34 | np.array([paddingBottom]).astype(np.int32), 35 | trt.PluginFieldType.INT32 36 | ) 37 | ]) 38 | 39 | return creator.create_plugin('', fc) 40 | @tensorrt_converter(nn.ReflectionPad2d.forward) 41 | def convert_reflection_pad(ctx): 42 | module = get_arg(ctx, 'self', pos=0, default=None) 43 | input = get_arg(ctx, 'x', pos=1, default=None) 44 | output = ctx.method_return 45 | input_trt = input._trt 46 | plugin = create_reflection_pad_2d_plugin( 47 | module.padding[0], 48 | module.padding[1], 49 | module.padding[2], 50 | module.padding[3] 51 | ) 52 | layer = ctx.network.add_plugin_v2([input_trt], plugin) 53 | output._trt = layer.get_output(0) 54 | 55 | except: 56 | pass -------------------------------------------------------------------------------- /scripts/dump_converters.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import subprocess 4 | import os 5 | from importlib.machinery import SourceFileLoader 6 | 7 | torch2trt = SourceFileLoader("torch2trt", "torch2trt/__init__.py").load_module() # to load relative to root 8 | 9 | HEADER = """ 10 | # Converters 11 | 12 | This table contains a list of supported PyTorch methods and their associated converters. 13 | 14 | If your model is not converting, a good start in debugging would be to see if it contains a method not listed 15 | in this table. You may also find these a useful reference when writing your own converters. 
16 | 17 | | Method | Converter | 18 | |--------|-----------|""" 19 | 20 | if __name__ == '__main__': 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--github', 24 | type=str, 25 | default='https://github.com/NVIDIA-AI-IOT/torch2trt') 26 | parser.add_argument('--tag', type=str, default='master') 27 | args = parser.parse_args() 28 | 29 | print(HEADER) 30 | 31 | for method, entry in torch2trt.CONVERTERS.items(): 32 | 33 | if not entry['is_real']: 34 | continue 35 | 36 | converter = entry['converter'] 37 | 38 | # get commit hash 39 | # p = subprocess.Popen(['git', 'rev-parse', 'HEAD'], 40 | # stdout=subprocess.PIPE, 41 | # stderr=subprocess.PIPE) 42 | # commit, err = p.communicate() 43 | # commit = commit.decode('utf-8').strip('\n') 44 | 45 | # get github URL 46 | url = '{github}/blob/{commit}/{relpath}#L{lineno}'.format( 47 | github=args.github, 48 | commit=args.tag, 49 | relpath=os.path.relpath(converter.__code__.co_filename, 50 | os.path.abspath('.')), 51 | lineno=converter.__code__.co_firstlineno) 52 | 53 | print('| ``{method}`` | [``{converter}``]({url}) |'.format( 54 | method=method, converter=converter.__name__, url=url)) 55 | -------------------------------------------------------------------------------- /tests/feature_tests/test_flatten_dynamic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch2trt import torch2trt, trt 3 | import torch 4 | 5 | 6 | class FlattenModule(torch.nn.Module): 7 | def __init__(self, start_dim, end_dim): 8 | super().__init__() 9 | self.start_dim = start_dim 10 | self.end_dim = end_dim 11 | 12 | def forward(self, x): 13 | return torch.flatten(x, self.start_dim, self.end_dim) 14 | 15 | 16 | def test_flatten_dynamic_0_n1(): 17 | 18 | # 0, -1 19 | module = FlattenModule(start_dim=0, end_dim=-1).cuda().eval() 20 | 21 | x = torch.randn(1, 4, 5).cuda() 22 | 23 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 24 | 25 | x = torch.randn(1, 4, 5).cuda() 26 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 27 | 28 | x = torch.randn(4, 4, 5).cuda() 29 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 30 | 31 | 32 | def test_flatten_dynamic_1_n1(): 33 | # 1, -1 34 | module = FlattenModule(start_dim=1, end_dim=-1).cuda().eval() 35 | 36 | x = torch.randn(1, 4, 5).cuda() 37 | 38 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 39 | 40 | x = torch.randn(1, 4, 5).cuda() 41 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 42 | 43 | x = torch.randn(4, 4, 5).cuda() 44 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 45 | 46 | 47 | def test_flatten_dynamic_0_1(): 48 | # 0, 1 49 | module = FlattenModule(start_dim=0, end_dim=1).cuda().eval() 50 | 51 | x = torch.randn(1, 4, 5).cuda() 52 | 53 | module_trt = torch2trt(module, [x], max_batch_size=4, log_level=trt.Logger.VERBOSE) 54 | 55 | x = torch.randn(1, 4, 5).cuda() 56 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 57 | 58 | x = torch.randn(4, 4, 5).cuda() 59 | assert(torch.allclose(module(x), module_trt(x), atol=1e-2, rtol=1e-2)) 60 | 61 | 62 | if __name__ == '__main__': 63 | 64 | test_flatten_dynamic_0_1() -------------------------------------------------------------------------------- /torch2trt/utils.py: -------------------------------------------------------------------------------- 1 | import graphviz 2 | import tensorrt as trt 3 | 4 | 5 | def 
trt_network_to_dot_graph(network): 6 | dot = graphviz.Digraph(comment="Network") 7 | 8 | # add nodes (layers) 9 | for i in range(network.num_layers): 10 | layer = network.get_layer(i) 11 | dot.node(layer.name) 12 | 13 | # add nodes (inputs) 14 | for i in range(network.num_inputs): 15 | dot.node(network.get_input(i).name) 16 | 17 | # add nodes (outputs) 18 | for i in range(network.num_outputs): 19 | dot.node(network.get_output(i).name) 20 | 21 | # add layer->layer edges 22 | for a in range(network.num_layers): 23 | layer_a = network.get_layer(a) 24 | 25 | for b in range(network.num_layers): 26 | layer_b = network.get_layer(b) 27 | 28 | for i in range(layer_a.num_outputs): 29 | output_i = layer_a.get_output(i) 30 | 31 | for j in range(layer_b.num_inputs): 32 | input_j = layer_b.get_input(j) 33 | 34 | if output_i == input_j: 35 | dot.edge(layer_a.name, layer_b.name, label=str(input_j.shape)) 36 | 37 | # add input->layer edges 38 | for i in range(network.num_inputs): 39 | input_i = network.get_input(i) 40 | 41 | for b in range(network.num_layers): 42 | layer_b = network.get_layer(b) 43 | 44 | for j in range(layer_b.num_inputs): 45 | input_j = layer_b.get_input(j) 46 | 47 | if input_i == input_j: 48 | dot.edge(input_i.name, layer_b.name, label=str(input_j.shape)) 49 | 50 | # add layer->output edges 51 | for i in range(network.num_outputs): 52 | input_i = network.get_output(i) 53 | 54 | for b in range(network.num_layers): 55 | layer_b = network.get_layer(b) 56 | 57 | for j in range(layer_b.num_outputs): 58 | input_j = layer_b.get_output(j) 59 | 60 | if input_i == input_j: 61 | dot.edge(layer_b.name, input_i.name, label=str(input_j.shape)) 62 | 63 | return dot 64 | -------------------------------------------------------------------------------- /scripts/profile_timm_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | profile_timm() { 4 | python3 scripts/profile_timm.py --model $1 --size $2 5 | python3 scripts/profile_timm.py --model $1 --use-onnx --size $2 6 | } 7 | 8 | profile_timm beit_base_patch16_224 224 9 | profile_timm botnet26t_256 256 10 | profile_timm gernet_s 224 11 | profile_timm cait_xxs24_224 224 12 | profile_timm coat_tiny 224 13 | profile_timm convit_tiny 224 14 | profile_timm convmixer_768_32 224 15 | profile_timm convnext_tiny 288 16 | profile_timm crossvit_15_240 240 17 | profile_timm cspresnet50 256 18 | profile_timm deit_tiny_patch16_224 224 19 | profile_timm densenet121 224 20 | profile_timm dla34 224 21 | profile_timm dpn68 224 22 | profile_timm edgenext_xx_small 288 23 | profile_timm efficientformer_l1 224 24 | profile_timm gcvit_xxtiny 224 25 | profile_timm ghostnet_050 224 26 | profile_timm gluon_resnet18_v1b 224 27 | profile_timm gluon_xception65 299 28 | profile_timm hardcorenas_a 224 29 | profile_timm hrnet_w18_small 224 30 | profile_timm inception_resnet_v2 299 31 | profile_timm inception_v3 299 32 | profile_timm inception_v4 299 33 | profile_timm levit_128s 224 34 | profile_timm maxvit_tiny_224 224 35 | profile_timm coatnet_0_224 224 36 | profile_timm mixer_s32_224 224 37 | profile_timm mobilenetv3_small_050 224 38 | profile_timm mobilevit_xs 256 39 | profile_timm mvitv2_tiny 224 40 | profile_timm nasnetalarge 331 41 | profile_timm nest_tiny 224 42 | profile_timm dm_nfnet_f0 256 43 | profile_timm pit_ti_224 224 44 | profile_timm pnasnet5large 331 45 | profile_timm poolformer_s12 224 46 | profile_timm pvt_v2_b0 224 47 | profile_timm regnetx_040 224 48 | profile_timm res2net50_26w_4s 224 49 | profile_timm 
resnest14d 224 50 | profile_timm resnet10t 224 51 | profile_timm resnetv2_50 224 52 | profile_timm rexnet_100 224 53 | profile_timm selecsls42 224 54 | profile_timm legacy_senet154 224 55 | profile_timm sequencer2d_s 224 56 | profile_timm skresnet18 224 57 | profile_timm swin_tiny_patch4_window7_224 224 58 | profile_timm swinv2_tiny_window8_256 256 59 | profile_timm swinv2_cr_tiny_224 224 60 | profile_timm tnt_s_patch16_224 224 61 | profile_timm tresnet_m 224 62 | profile_timm twins_pcpvt_small 224 63 | profile_timm vgg11 224 64 | profile_timm visformer_tiny 224 65 | profile_timm volo_d1_224 224 66 | profile_timm vovnet39a 224 67 | profile_timm xception 299 68 | profile_timm xception41 299 69 | profile_timm xcit_nano_12_p16_224 224 -------------------------------------------------------------------------------- /tests/feature_tests/test_dynamic_shape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | import tensorrt as trt 5 | from torch2trt import torch2trt 6 | from torch2trt.dataset import ListDataset 7 | 8 | 9 | def test_dynamic_shape_conv2d(): 10 | 11 | torch.manual_seed(0) 12 | 13 | module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 14 | 15 | dataset = ListDataset() 16 | dataset.insert((torch.randn(1, 3, 224, 224).cuda(),)) 17 | dataset.insert((torch.randn(1, 3, 64, 64).cuda(),)) 18 | dataset.insert((torch.randn(1, 3, 128, 128).cuda(),)) 19 | dataset.insert((torch.randn(4, 3, 32, 32).cuda(),)) 20 | 21 | module_trt = torch2trt( 22 | module, 23 | dataset, 24 | log_level=trt.Logger.INFO 25 | ) 26 | 27 | inputs = dataset[0] 28 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 29 | inputs = dataset[1] 30 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 31 | inputs = dataset[2] 32 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 33 | inputs = dataset[3] 34 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 35 | 36 | 37 | def test_dynamic_shape_conv2d_onnx(): 38 | 39 | torch.manual_seed(0) 40 | 41 | module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 42 | 43 | dataset = ListDataset() 44 | dataset.insert((torch.randn(1, 3, 224, 224).cuda(),)) 45 | dataset.insert((torch.randn(1, 3, 64, 64).cuda(),)) 46 | dataset.insert((torch.randn(1, 3, 128, 128).cuda(),)) 47 | dataset.insert((torch.randn(4, 3, 32, 32).cuda(),)) 48 | 49 | module_trt = torch2trt( 50 | module, 51 | dataset, 52 | use_onnx=True, 53 | log_level=trt.Logger.INFO 54 | ) 55 | 56 | inputs = dataset[0] 57 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 58 | inputs = dataset[1] 59 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 60 | inputs = dataset[2] 61 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 62 | inputs = dataset[3] 63 | assert(torch.allclose(module(*inputs), module_trt(*inputs), rtol=1e-3, atol=1e-3)) 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | test_dynamic_shape_conv2d() -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | """ 5 | """ 6 | parser = argparse.ArgumentParser(description='PyTorch QAT') 7 | 
parser.add_argument('--tl','--transfer_learning',action='store_true',help='used to map weights correctly') 8 | parser.add_argument('--iter',default=300, type=int, help='no of iterations') 9 | parser.add_argument('--m','--model_name',default=None,help="Name of the model") 10 | parser.add_argument('--b', '--batch_size', default=128, type=int, help='mini-batch size (default: 32)') 11 | parser.add_argument('--optimizer', default='Adam', type=str,help='type of optimizer (default=Adam)') 12 | parser.add_argument( '--wd','--weight-decay', default=1e-5, type=float, help='weight decay (default: 1e-5)') 13 | parser.add_argument('--start_epoch','--s_ep', default=0, type=int, help='starting epoch') 14 | parser.add_argument('--num_epochs',default=30,type=int, help='no of epochs') 15 | parser.add_argument('--no_cuda', action='store_true',help='disables cuda training') 16 | parser.add_argument('--seed', type=int, default=12345,help='random seed for experiments. [default: 12345]') 17 | parser.add_argument('--lr', '--learning_rate', default=1e-3, type=float, help='initial learning rate') 18 | parser.add_argument('--lrdt', '--learning_rate_decay_interval', default=30, type=int, help='initial learning rate decay after n epochs') 19 | parser.add_argument('--od','--output_dir', default='/tmp/',help='output path') 20 | parser.add_argument('--en','--exp_name', default='pytorch_exp',help = 'experiment name to create output dir') 21 | parser.add_argument('--load_ckpt', default = None, help = "path to ckpt") 22 | parser.add_argument('--netqat',action='store_true',help = 'quantize model using custom layer') 23 | parser.add_argument('--partial_ckpt',action='store_true',help = 'load_partial checkpoint') 24 | parser.add_argument('--v','--verbose',action='store_true') 25 | parser.add_argument('--FP16',action='store_true',help='run TRT engine at FP16') 26 | parser.add_argument('--test_trt',action='store_true',help='gather metrics using trt') 27 | parser.add_argument('--INT8PTC',action='store_true',help='run TRT engine at INT8 with Post Training Cal') 28 | parser.add_argument('--INT8QAT',action='store_true',help='run TRT engine at INT8 with QAT') 29 | args = parser.parse_args() 30 | return args 31 | 32 | -------------------------------------------------------------------------------- /docs/usage/custom_converter.md: -------------------------------------------------------------------------------- 1 | # Custom Converter 2 | 3 | This page details how to extend or modify the behavior of torch2trt by implementing and registering 4 | custom converters. 5 | 6 | ## Background 7 | 8 | torch2trt works by attaching conversion functions (like ``convert_ReLU``) to the original 9 | PyTorch functional calls (like ``torch.nn.ReLU.forward``). The sample input data is passed 10 | through the network, just as before, except now whenever a registered function (``torch.nn.ReLU.forward``) 11 | is encountered, the corresponding converter (``convert_ReLU``) is also called afterwards. The converter 12 | is passed the arguments and return statement of the original PyTorch function, as well as the TensorRT 13 | network that is being constructed. The input tensors to the original PyTorch function are modified to 14 | have an attribute ``_trt``, which is the TensorRT counterpart to the PyTorch tensor. The conversion function 15 | uses this ``_trt`` to add layers to the TensorRT network, and then sets the ``_trt`` attribute for 16 | relevant output tensors. 
Once the model is fully executed, the final tensors returned are marked as outputs 17 | of the TensorRT network, and the optimized TensorRT engine is built. 18 | 19 | ## Add a custom converter 20 | 21 | Here we show how to add a converter for the ``ReLU`` module using the TensorRT 22 | Python API. 23 | 24 | ```python 25 | import tensorrt as trt 26 | from torch2trt import tensorrt_converter 27 | 28 | @tensorrt_converter('torch.nn.ReLU.forward') 29 | def convert_ReLU(ctx): 30 | input = ctx.method_args[1] 31 | output = ctx.method_return 32 | layer = ctx.network.add_activation(input=input._trt, type=trt.ActivationType.RELU) 33 | output._trt = layer.get_output(0) 34 | ``` 35 | 36 | The converter takes one argument, a ``ConversionContext``, which will contain 37 | the following 38 | 39 | * ``ctx.network`` - The TensorRT network that is being constructed. 40 | 41 | * ``ctx.method_args`` - Positional arguments that were passed to the specified PyTorch function. The ``_trt`` attribute is set for relevant input tensors. 42 | * ``ctx.method_kwargs`` - Keyword arguments that were passed to the specified PyTorch function. 43 | * ``ctx.method_return`` - The value returned by the specified PyTorch function. The converter must set the ``_trt`` attribute where relevant. 44 | 45 | Please see the [converters](../converters.md) page for a list of implemented converters and links to their source code. These may help 46 | in learning how to write converters. 47 | -------------------------------------------------------------------------------- /examples/easyocr/optimize_detector.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from torch2trt.dataset import FolderDataset, ListDataset 3 | from torch2trt import torch2trt, TRTModule 4 | from easyocr import Reader 5 | import tensorrt as trt 6 | import torch 7 | import time 8 | from tempfile import mkdtemp 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument('--detector_data', type=str, default='detector_data') 12 | parser.add_argument('--output', type=str, default='detector_trt.pth') 13 | parser.add_argument('--int8', action='store_true') 14 | parser.add_argument('--fp16', action='store_true') 15 | parser.add_argument('--dla', action='store_true') 16 | parser.add_argument('--dla_core', type=int, default=0) 17 | args = parser.parse_args() 18 | 19 | detector_dataset = FolderDataset(args.detector_data) 20 | 21 | if len(detector_dataset) == 0: 22 | raise ValueError('Detector dataset is empty, make sure to run generate_data.py first.') 23 | 24 | reader = Reader(['en']) 25 | detector_torch = reader.detector.module 26 | 27 | if args.int8: 28 | num_calib = 5 29 | calib_dataset = FolderDataset(mkdtemp()) 30 | for i in range(num_calib): 31 | calib_dataset.insert(tuple([t + 0.2 * torch.randn_like(t) for t in detector_dataset[i % len(detector_dataset)]])) 32 | 33 | print('Running torch2trt...') 34 | detector_trt = torch2trt( 35 | detector_torch, 36 | detector_dataset, 37 | int8_mode=args.int8, 38 | fp16_mode=args.fp16, 39 | default_device_type=trt.DeviceType.DLA if args.dla else trt.DeviceType.GPU, 40 | max_workspace_size=1 << 26, 41 | log_level=trt.Logger.VERBOSE, 42 | int8_calib_dataset=calib_dataset if args.int8 else None, 43 | int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION, 44 | use_onnx=True 45 | ) 46 | 47 | torch.save(detector_trt.state_dict(), args.output) 48 | 49 | def profile_module(module, dataset, count=None): 50 | 51 | if count is None: 52 | count = len(dataset) 53 
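# The first call below is a warmup: it runs any lazy CUDA / TensorRT initialization
# (context creation, memory allocation) outside of the timed region. The explicit
# stream synchronizations around the timed loop matter because CUDA launches are
# asynchronous; without them the loop would mostly measure kernel launch overhead
# rather than actual execution time.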
| 54 | output = module(*dataset[0]) # warmup 55 | 56 | torch.cuda.current_stream().synchronize() 57 | t0 = time.monotonic() 58 | for i in range(count): 59 | output = module(*dataset[i % len(dataset)]) 60 | torch.cuda.current_stream().synchronize() 61 | t1 = time.monotonic() 62 | 63 | return count / (t1 - t0) 64 | 65 | print('Profiling PyTorch...') 66 | fps_torch = profile_module(detector_torch, detector_dataset, 30) 67 | print(f'FPS Torch: {fps_torch}') 68 | 69 | print('Profiling TensorRT') 70 | fps_trt = profile_module(detector_trt, detector_dataset, 30) 71 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /benchmarks/JETSON_NANO.md: -------------------------------------------------------------------------------- 1 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 2 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 3 | | torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | 4 | | torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | 5 | | torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | 6 | | torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | 7 | | torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | 8 | | torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | 9 | | torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | 10 | | torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | 11 | | torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | 12 | | torchvision.models.densenet.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | 13 | | torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | 14 | | torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | 15 | | torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | 16 | | torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | 17 | | torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | 18 | | torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | 19 | | torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | 20 | | torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | 21 | 
-------------------------------------------------------------------------------- /docs/benchmarks/jetson_nano.md: -------------------------------------------------------------------------------- 1 | # Jetson Nano 2 | 3 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 4 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 5 | | torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | 6 | | torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | 7 | | torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | 8 | | torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | 9 | | torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | 10 | | torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | 11 | | torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | 12 | | torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | 13 | | torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | 14 | | torchvision.models.densenet.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | 15 | | torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | 16 | | torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | 17 | | torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | 18 | | torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | 19 | | torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | 20 | | torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | 21 | | torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | 22 | | torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | 23 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT_FILE=$1 4 | 5 | touch $OUTPUT_FILE 6 | 7 | echo "| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) |" >> $OUTPUT_FILE 8 | echo 
"|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------|" >> $OUTPUT_FILE 9 | 10 | python3 -m torch2trt.test -o $OUTPUT_FILE --name alexnet --include=torch2trt.tests.torchvision.classification 11 | python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_0 --include=torch2trt.tests.torchvision.classification 12 | python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_1 --include=torch2trt.tests.torchvision.classification 13 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet18 --include=torch2trt.tests.torchvision.classification 14 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet34 --include=torch2trt.tests.torchvision.classification 15 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet50 --include=torch2trt.tests.torchvision.classification 16 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet101 --include=torch2trt.tests.torchvision.classification 17 | python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet152 --include=torch2trt.tests.torchvision.classification 18 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet121 --include=torch2trt.tests.torchvision.classification 19 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet169 --include=torch2trt.tests.torchvision.classification 20 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet201 --include=torch2trt.tests.torchvision.classification 21 | python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet161 --include=torch2trt.tests.torchvision.classification 22 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11$ --include=torch2trt.tests.torchvision.classification 23 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13$ --include=torch2trt.tests.torchvision.classification 24 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16$ --include=torch2trt.tests.torchvision.classification 25 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19$ --include=torch2trt.tests.torchvision.classification 26 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11_bn --include=torch2trt.tests.torchvision.classification 27 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13_bn --include=torch2trt.tests.torchvision.classification 28 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16_bn --include=torch2trt.tests.torchvision.classification 29 | python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19_bn --include=torch2trt.tests.torchvision.classification 30 | python3 -m torch2trt.test -o $OUTPUT_FILE --name mobilenet_v2 --include=torch2trt.tests.torchvision.classification 31 | -------------------------------------------------------------------------------- /examples/easyocr/run_end2end.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import cv2 3 | import torch 4 | import glob 5 | from easyocr import Reader 6 | from torch2trt.dataset import FolderDataset 7 | from torch2trt import torch2trt, TRTModule 8 | import math 9 | import time 10 | import os 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--images', type=str, default='images') 14 | parser.add_argument('--detector_trt', type=str, default='detector_trt.pth') 15 | parser.add_argument('--recognizer_trt', type=str, default='recognizer_trt.pth') 16 | parser.add_argument('--max_image_area', type=int, default=1280*720) 17 | parser.add_argument('--count', type=int, default=None) 18 | parser.add_argument('--recognizer_batch_size', 
type=int, default=1) 19 | args = parser.parse_args() 20 | 21 | 22 | def shrink_to_area(image, area): 23 | height = image.shape[0] 24 | width = image.shape[1] 25 | 26 | if height * width > area: 27 | ar = width / height 28 | new_height = math.sqrt(area / ar) 29 | new_width = ar * new_height 30 | new_height = math.floor(new_height) 31 | new_width = math.floor(new_width) 32 | print(f'Resizing {width}x{height} to {new_width}x{new_height}') 33 | image = cv2.resize(image, (new_width, new_height)) 34 | 35 | return image 36 | 37 | image_paths = glob.glob(os.path.join(args.images, '*.jpg')) 38 | 39 | def profile_reader(reader): 40 | 41 | cumulative_execution_time = 0 42 | 43 | if args.count is None: 44 | count = len(image_paths) 45 | else: 46 | count = args.count 47 | 48 | for i in range(count): 49 | 50 | path = image_paths[i % len(image_paths)] 51 | image = cv2.imread(path) 52 | 53 | image = shrink_to_area(image, args.max_image_area) 54 | 55 | t0 = time.monotonic() 56 | reader.readtext(image, batch_size=args.recognizer_batch_size) 57 | t1 = time.monotonic() 58 | 59 | cumulative_execution_time += (t1 - t0) 60 | 61 | return count / cumulative_execution_time 62 | 63 | 64 | reader = Reader(['en']) 65 | 66 | detector_trt = TRTModule() 67 | detector_trt.load_state_dict(torch.load(args.detector_trt)) 68 | 69 | recognizer_trt = TRTModule() 70 | recognizer_trt.load_state_dict(torch.load(args.recognizer_trt)) 71 | 72 | test_image = shrink_to_area(cv2.imread(image_paths[0]), args.max_image_area) 73 | 74 | print('Dumping torch output...') 75 | print(reader.readtext(test_image, batch_size=args.recognizer_batch_size)) 76 | 77 | print('Profiling torch...') 78 | fps_torch = profile_reader(reader) 79 | 80 | reader.detector.module = detector_trt 81 | reader.recognizer.module = recognizer_trt 82 | 83 | 84 | print('Dumping TensorRT output...') 85 | print(reader.readtext(test_image, batch_size=args.recognizer_batch_size)) 86 | 87 | print('Profiling torch...') 88 | fps_trt = profile_reader(reader) 89 | 90 | 91 | print(f'FPS Torch: {fps_torch}') 92 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /tests/feature_tests/test_dataset_calibrator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorrt as trt 3 | import torch 4 | import torch.nn as nn 5 | from torch2trt.dataset import ( 6 | TensorBatchDataset, 7 | ListDataset 8 | ) 9 | from torch2trt import torch2trt 10 | 11 | 12 | def test_dataset_calibrator_batch_dataset(): 13 | 14 | torch.manual_seed(0) 15 | 16 | 17 | class TestModule(nn.Module): 18 | def __init__(self): 19 | super().__init__() 20 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 21 | 22 | def forward(self, x, y): 23 | a = self.conv(x) 24 | b = self.conv(y) 25 | return torch.cat([a, b], dim=0) 26 | 27 | inputs = [ 28 | torch.randn(1, 3, 32, 32).cuda(), 29 | torch.randn(1, 3, 32, 32).cuda() 30 | ] 31 | 32 | module = TestModule().cuda().eval() 33 | 34 | dataset = TensorBatchDataset() 35 | 36 | with dataset.record(module): 37 | for i in range(50): 38 | module(*inputs) 39 | 40 | module_trt = torch2trt( 41 | module, 42 | dataset[0], 43 | int8_mode=True, 44 | int8_calib_dataset=dataset, 45 | log_level=trt.Logger.INFO 46 | ) 47 | 48 | inputs = [ 49 | torch.randn(1, 3, 32, 32).cuda(), 50 | torch.randn(1, 3, 32, 32).cuda() 51 | ] 52 | output = module(*inputs) 53 | output_trt = module_trt(*inputs) 54 | 55 | assert(torch.allclose(output, output_trt, 
rtol=1e-3, atol=1e-3)) 56 | 57 | 58 | def test_dataset_calibrator_list_dataset(): 59 | 60 | torch.manual_seed(0) 61 | 62 | 63 | class TestModule(nn.Module): 64 | def __init__(self): 65 | super().__init__() 66 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 67 | 68 | def forward(self, x, y): 69 | a = self.conv(x) 70 | b = self.conv(y) 71 | return torch.cat([a, b], dim=0) 72 | 73 | inputs = [ 74 | torch.randn(1, 3, 32, 32).cuda(), 75 | torch.randn(1, 3, 32, 32).cuda() 76 | ] 77 | 78 | module = TestModule().cuda().eval() 79 | 80 | dataset = ListDataset() 81 | 82 | with dataset.record(module): 83 | for i in range(50): 84 | module(*inputs) 85 | 86 | module_trt = torch2trt( 87 | module, 88 | dataset[0], 89 | int8_mode=True, 90 | int8_calib_dataset=dataset, 91 | log_level=trt.Logger.INFO 92 | ) 93 | 94 | inputs = [ 95 | torch.randn(1, 3, 32, 32).cuda(), 96 | torch.randn(1, 3, 32, 32).cuda() 97 | ] 98 | output = module(*inputs) 99 | output_trt = module_trt(*inputs) 100 | 101 | assert(torch.allclose(output, output_trt, rtol=1e-3, atol=1e-3)) 102 | 103 | 104 | if __name__ == '__main__': 105 | test_dataset_calibrator_list_dataset() -------------------------------------------------------------------------------- /docs/see_also.md: -------------------------------------------------------------------------------- 1 | # See Also 2 | 3 | !!! note 4 | 5 | The state of these converters may change over time. We provide this information here with the hope that it will help shed light on the landscape of tools available for optimizing PyTorch models with TensorRT. 6 | If you find this information helpful or outdated / misleading, please let us know. 7 | 8 | In addition to torch2trt, there are other workflows for optimizing your PyTorch model with TensorRT. 9 | 10 | The other converters we are aware of are 11 | 12 | * [ONNX to TensorRT](https://github.com/onnx/onnx-tensorrt) 13 | 14 | !!! tip 15 | 16 | Since the ONNX parser ships with TensorRT, we have included a convenience method for using this 17 | workflow with torch2trt. If you want to quickly try the ONNX method using the torch2trt interface, just call ``torch2trt(..., use_onnx=True)``. 18 | This will perform conversion on the module by exporting the model using PyTorch's JIT tracer, 19 | and parsing with TensorRT's ONNX parser. 20 | 21 | * [TRTorch](https://github.com/NVIDIA/TRTorch) 22 | 23 | Which one you use depends largely on your use case. The differences often come down to 24 | 25 | ## Layer support 26 | 27 | Modern deep learning frameworks are large, and there often arise 28 | caveats converting between frameworks using a given workflow. These could include 29 | limitations in serialization or parsing formats. Or in some instances, it may be possible 30 | the layer could be supported, but it has just not been done yet. TRTorch is strong 31 | in the sense that it will default to the original PyTorch method for layers 32 | which are not converted to TensorRT. The best way to know 33 | which conversion method works for you is to try converting your model. 34 | 35 | ## Feature support 36 | 37 | TensorRT is evolving and the conversion workflows may have varying level 38 | of feature support. In some instances, you may wish to use a latest feature of TensorRT, like dynamic shapes, 39 | but it is not supported in torch2trt or the interface has not yet been exposed. In this 40 | instance, we recommend checking to see if it is supported by one of the other workflows. 
The ONNX 41 | converter is typically strong in this regard, since the parser is distributed with TensorRT. 42 | 43 | !!! note 44 | 45 | If there is a TensorRT feature you wish to see in torch2trt, please let us know. We cannot guarantee this will be done, but it helps us gauge interest. 46 | 47 | ## Extensibility / Ease of Use 48 | 49 | In case none of the converters satisfies your use case, you may find it necessary to adapt 50 | the converter to fit your needs. This is very intuitive with torch2trt, 51 | since it is done inline with Python, and there are many [examples](converters.md) to reference. If you know 52 | how the original PyTorch method works, and have the TensorRT Python API on hand, it is relatively straightforward to adapt torch2trt to your needs. 53 | The extensibility is often helpful when you want to implement a converter that is specific to the 54 | context the layer appears in. 55 | 56 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/README.md: -------------------------------------------------------------------------------- 1 | ## QAT working example 2 | 3 | This example uses the QAT library open-sourced by NVIDIA. [Github link](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization) 4 | 5 | ## Directory overview 6 | 7 | 1. This directory contains 8 | 1. `dataset` : contains code for the CIFAR-10 dataset 9 | 2. `layers` : contains the implementation used for inference. More details under `layers/README.md` 10 | 3. `models`: contains two models: `resnet18` and `vanilla_cnn` 11 | 4. `utils` : contains various utility functions for loading state dicts, a custom wrapper for training and inference, and calculating accuracy during training 12 | 5. `train.py` and `infer.py` : contain the code for training and inference (including TRT conversion) 13 | 14 | 2. The NVIDIA quantization library does not usually provide per-layer control over quantization. The custom wrapper under `utils/utilities.py` lets us quantize selected layers in our model. 15 | 16 | ## Environment 17 | 18 | **Filename** : pytorch_ngc_container_20.09 19 | 20 | ``` 21 | FROM nvcr.io/nvidia/pytorch:20.09-py3 22 | RUN apt-get update && apt-get install -y software-properties-common && apt-get update 23 | RUN add-apt-repository ppa:git-core/ppa && \ 24 | apt install -y git 25 | 26 | RUN pip install termcolor graphviz 27 | 28 | RUN git clone https://github.com/NVIDIA-AI-IOT/torch2trt.git /sw/torch2trt/ && \ 29 | cd /sw/torch2trt/scripts && \ 30 | bash build_contrib.sh 31 | 32 | ``` 33 | 34 | Docker build: `docker build -f pytorch_ngc_container_20.09 -t pytorch_ngc_container_20.09 .` 35 | 36 | `docker_image=pytorch_ngc_container_20.09` 37 | 38 | Docker run : `docker run -e NVIDIA_VISIBLE_DEVICES=0 --gpus 0 -it --shm-size=1g --ulimit memlock=-1 --rm -v $PWD:/workspace/work $docker_image` 39 | 40 | **Important Notes** : 41 | 42 | - Sparse checkout helps us check out only part of the GitHub repo. 43 | - The patch file can be found under `examples/quantization_aware_training/utils` 44 | 45 | ## Workflow 46 | 47 | The workflow consists of three parts. 48 | 1. Train without quantization: 49 | 50 | Here, pretrained ImageNet weights are used. 51 | 52 | `python train.py --m resnet34-tl / resnet18-tl --num_epochs 45 --test_trt --FP16 --INT8PTC` 53 | 54 | 2.
Train with quantization (weights are mapped using a custom function to make sure that each weight is loaded correctly) 55 | 56 | `python train.py --m resnet34/ resnet18 --netqat --partial_ckpt --tl --load_ckpt /tmp/pytorch_exp/{} --num_epochs 25 --lr 1e-4 --lrdt 10` 57 | 58 | 3. Infer with and without TRT 59 | 60 | `python infer.py --m resnet34/resnet18 --load_ckpt /tmp/pytorch_exp_1/ckpt_{} --netqat --INT8QAT` 61 | 62 | 63 | ## Accuracy Results 64 | 65 | | Model | FP32 | FP16 | INT8 (QAT) | INT(PTC) | 66 | |-------|------|------|------------|----------| 67 | | Resnet18 | 83.08 | 83.12 | 83.12 | 83.06 | 68 | | Resnet34 | 84.65 | 84.65 | 83.26 | 84.5 | 69 | 70 | 71 | **Please note that the idea behind these experiments is to see if TRT conversion is working properly rather than achieving industry standard accuracy results** 72 | 73 | ## Future Work 74 | 75 | - Add results for Resnet50, EfficientNet and Mobilenet 76 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/infer.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import torchvision 6 | import argparse 7 | import os,sys 8 | from datasets.cifar10 import Cifar10Loaders 9 | from utils.utilities import calculate_accuracy, timeGraph,printStats 10 | from models.resnet import resnet18,resnet34 11 | from parser import parse_args 12 | from torch2trt import torch2trt 13 | import tensorrt as trt 14 | torch.set_printoptions(precision=5) 15 | 16 | def main(): 17 | args = parse_args() 18 | 19 | args.cuda = not args.no_cuda and torch.cuda.is_available() 20 | torch.manual_seed(78543) 21 | 22 | if args.cuda: 23 | torch.backends.cudnn.benchmark = True 24 | torch.cuda.manual_seed(args.seed) 25 | 26 | loaders = Cifar10Loaders() 27 | train_loader = loaders.train_loader() 28 | test_loader = loaders.test_loader() 29 | 30 | if args.m == "resnet18": 31 | if args.netqat: 32 | model=resnet18(qat_mode=True,infer=True) 33 | else: 34 | model=resnet18() 35 | elif args.m == "resnet34": 36 | if args.netqat: 37 | model=resnet34(qat_mode=True,infer=True) 38 | else: 39 | model=resnet34() 40 | else: 41 | raise NotImplementedError("{} model not found".format(args.m)) 42 | 43 | 44 | model = model.cuda().eval() 45 | 46 | if args.load_ckpt: 47 | checkpoint = torch.load(args.load_ckpt) 48 | if not args.netqat: 49 | checkpoint = mapping_names_resnets(checkpoint) 50 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 51 | print("===>>> Checkpoint loaded successfully from {} ".format(args.load_ckpt)) 52 | 53 | test_accuracy = calculate_accuracy(model,test_loader) 54 | print(" Test accuracy for Pytorch model: {0} ".format(test_accuracy)) 55 | rand_in = torch.randn([128,3,32,32],dtype=torch.float32).cuda() 56 | 57 | #Converting the model to TRT 58 | if args.FP16: 59 | trt_model_fp16 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,max_batch_size=128) 60 | test_accuracy = calculate_accuracy(trt_model_fp16,test_loader) 61 | print(" TRT test accuracy at FP16: {0}".format(test_accuracy)) 62 | 63 | if args.INT8QAT: 64 | trt_model_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_mode=True,max_batch_size=128,qat_mode=True) 65 | test_accuracy = calculate_accuracy(trt_model_int8,test_loader) 66 | print(" TRT test accuracy at INT8 QAT: {0}".format(test_accuracy)) 67 | 68 | if args.INT8PTC: 69 | ##preparing calib dataset 70 | 
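# Post-training calibration (PTC) needs a small set of representative inputs: the loop
# below collects roughly six batches of CIFAR-10 test images (sam[0] holds the image
# tensors) and passes them to torch2trt via int8_calib_dataset so TensorRT can estimate
# the INT8 scales. Unlike the QAT path above, no learned quantization ranges are used here.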
calib_dataset = list() 71 | for i, sam in enumerate(test_loader): 72 | calib_dataset.extend(sam[0]) 73 | if i == 5: 74 | break 75 | 76 | trt_model_calib_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_calib_dataset=calib_dataset,int8_mode=True,max_batch_size=128) 77 | test_accuracy = calculate_accuracy(trt_model_calib_int8,test_loader) 78 | print(" TRT test accuracy at INT8 PTC: {0}".format(test_accuracy)) 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | ## [master](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/master) 4 | 5 | - Added inference and conversion support for TensorRT 10 6 | - Removed redundant converters, and merged converters for ND convolutions, pooling, etc. 7 | - Migrated test cases to use PyTest 8 | - Added unique axis names when using ONNX to support mismatched dynamic axes (needed for Whisper) 9 | 10 | ## [v0.5.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.5.0) - 05/3/2024 11 | 12 | - Added tensor shape tracking to support dynamic shapes for flatten, squeeze, unsqueeze, view, reshape, interpolate, and getitem methods 13 | - Added EasyOCR example 14 | - Added the ``DatasetRecorder`` context manager, making it easy to capture module inputs from a larger pipeline for calibration and shape inference 15 | - Added support for legacy max_batch_size using optimization profiles 16 | - Added support for nested tuple, dict and list module inputs and outputs via the ``Flattener`` class 17 | - Added the ability to accept a dataset as the ``inputs`` argument, and infer optimization profiles from the data 18 | - Added Dataset, TensorBatchDataset, ListDataset, and FolderDataset classes 19 | - Added support for dynamic shapes 20 | - Known limitation: Currently some converters (i.e. View) may have unexpected behavior if their arguments are defined with dynamic Tensor shapes.
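  A minimal sketch of the dataset-driven dynamic-shape workflow described above, mirroring ``tests/feature_tests/test_dynamic_shape.py`` (the convolution module is only a placeholder):

```python
import torch
import torch.nn as nn
from torch2trt import torch2trt
from torch2trt.dataset import ListDataset

module = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval()

# Record example inputs that span the shape range the engine should support.
dataset = ListDataset()
dataset.insert((torch.randn(1, 3, 224, 224).cuda(),))
dataset.insert((torch.randn(4, 3, 32, 32).cuda(),))

# Passing the dataset as ``inputs`` lets torch2trt infer the min/opt/max
# optimization profiles from the recorded shapes, yielding a dynamic-shape engine.
module_trt = torch2trt(module, dataset)
```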
21 | 22 | ## [0.4.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.4.0) - 07/22/2022 23 | 24 | - Added converter for ``torch.nn.functional.group_norm`` using native TensorRT layers 25 | - Added converter for ``torch.nn.ReflectionPad2d`` using plugin layer 26 | - Added torch2trt_plugins library 27 | - Added support for Deep Learning Accelerator (DLA) 28 | - Added support for explicit batch 29 | - Added support for TensorRT 8 30 | 31 | ## [0.3.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.3.0) - 07/15/2021 32 | 33 | - Added converter for ``torch.nn.functional.adaptive_avg_pool3d`` 34 | - Added converter for ``torch.nn.functional.adaptive_max_pool3d`` 35 | - Added converter for ``torch.maxpool3d`` and ``torch.nn.functional.max_pool3d`` 36 | - Added Quantization Aware Training (QAT) workflow to contrib 37 | - Added converter for ``torch.roll`` 38 | - Added converter for ``torch.nn.functional.layer_norm`` 39 | - Added converter for ``torch.nn.functional.gelu`` 40 | - Added converter for ``torch.nn.functional.linear`` 41 | - Added converter for ``torch.nn.functional.silu`` 42 | 43 | ## [0.2.0](https://github.com/NVIDIA-AI-IOT/torch2trt/tree/v0.2.0) - 03/02/2021 44 | 45 | - Added converter for ``torch.Tensor.flatten`` 46 | - Added converter for ``torch.nn.functional.conv2d`` and ``torch.nn.functional.conv3d`` 47 | - Added converter for ``torch.Tensor.expand`` 48 | - Added support for custom converters for methods defined outside of ``torch`` module 49 | - Added names for TensorRT layers 50 | - Added GroupNorm plugin which internally uses PyTorch aten::group_norm 51 | - Replaced Tensor.ndim references with len(tensor.shape) to support older pytorch versions 52 | - Added reduced precision documentation page 53 | - Added converters for ``floordiv``, ``mod``, ``ne``, and ``torch.tensor`` operations 54 | - Extended ``relu`` converter to support ``Tensor.relu`` operation 55 | - Extended ``sigmoid`` converter to support ``Tensor.sigmoid`` operation 56 | -------------------------------------------------------------------------------- /examples/easyocr/optimize_recognizer.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from torch2trt.dataset import FolderDataset 3 | from torch2trt import torch2trt, TRTModule 4 | from easyocr import Reader 5 | import tensorrt as trt 6 | import torch 7 | import time 8 | from tempfile import mkdtemp 9 | 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument('--detector_data', type=str, default='detector_data') 13 | parser.add_argument('--recognizer_data', type=str, default='recognizer_data') 14 | parser.add_argument('--output', type=str, default='recognizer_trt.pth') 15 | parser.add_argument('--int8', action='store_true') 16 | parser.add_argument('--fp16', action='store_true') 17 | parser.add_argument('--max_workspace_size', type=int, default=1<<28) 18 | args = parser.parse_args() 19 | 20 | detector_dataset = FolderDataset(args.detector_data) 21 | recognizer_dataset = FolderDataset(args.recognizer_data) 22 | 23 | if len(detector_dataset) == 0: 24 | raise ValueError('Detector dataset is empty, make sure to run generate_data.py first.') 25 | 26 | if len(recognizer_dataset) == 0: 27 | raise ValueError('Recognizer dataset is empty, make sure to run generate_data.py first.') 28 | 29 | 30 | if args.int8: 31 | num_calib = 200 32 | calib_dataset = FolderDataset(mkdtemp()) 33 | for i in range(num_calib): 34 | calib_dataset.insert(tuple([t.float() + 0.2 * torch.randn_like(t.float()) for 
t in recognizer_dataset[i % len(recognizer_dataset)]])) 35 | 36 | reader = Reader(['en']) 37 | module_torch = reader.detector.module 38 | 39 | max_shapes = list(recognizer_dataset.max_shapes()) 40 | 41 | # override default max shape to use full image width 42 | max_shapes[0] = torch.Size(( 43 | recognizer_dataset.max_shapes()[0][0], 44 | recognizer_dataset.max_shapes()[0][1], 45 | recognizer_dataset.max_shapes()[0][2], 46 | detector_dataset.max_shapes()[0][3] 47 | )) 48 | max_shapes = tuple(max_shapes) 49 | 50 | class PoolFix(torch.nn.Module): 51 | def forward(self, x): 52 | return torch.mean(x, dim=-1, keepdim=True) 53 | 54 | if isinstance(reader.recognizer.module.AdaptiveAvgPool, torch.nn.AdaptiveAvgPool2d): 55 | reader.recognizer.module.AdaptiveAvgPool = PoolFix() 56 | 57 | recognizer_torch = reader.recognizer.module 58 | 59 | print('Running torch2trt...') 60 | recognizer_trt = torch2trt( 61 | reader.recognizer.module, 62 | recognizer_dataset, 63 | max_shapes=max_shapes, 64 | use_onnx=True, # LSTM currently only implemented in ONNX workflow 65 | fp16_mode=args.fp16, 66 | int8_mode=args.int8, 67 | max_workspace_size=args.max_workspace_size, 68 | log_level=trt.Logger.VERBOSE 69 | ) 70 | 71 | # recognizer_trt.ignore_inputs = [1] 72 | 73 | torch.save(recognizer_trt.state_dict(), args.output) 74 | 75 | def profile_module(module, dataset, count=None): 76 | 77 | if count is None: 78 | count = len(dataset) 79 | 80 | output = module(*dataset[0]) # warmup 81 | 82 | torch.cuda.current_stream().synchronize() 83 | t0 = time.monotonic() 84 | for i in range(count): 85 | output = module(*dataset[i % len(dataset)]) 86 | torch.cuda.current_stream().synchronize() 87 | t1 = time.monotonic() 88 | 89 | return count / (t1 - t0) 90 | 91 | print('Profiling PyTorch...') 92 | fps_torch = profile_module(recognizer_torch, recognizer_dataset, 50) 93 | print(f'FPS Torch: {fps_torch}') 94 | 95 | print('Profiling TensorRT') 96 | fps_trt = profile_module(recognizer_trt, recognizer_dataset, 30) 97 | print(f'FPS TensorRT: {fps_trt}') -------------------------------------------------------------------------------- /torch2trt/flattener.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | 4 | 5 | def _default_condition(x): 6 | return isinstance(x, torch.Tensor) and (x.dtype is torch.half or x.dtype is torch.float or x.dtype == torch.bool) 7 | 8 | 9 | def _make_schema_from_value(value, condition=_default_condition, size=0): 10 | if condition(value): 11 | return size, size + 1 12 | elif isinstance(value, list) or isinstance(value, tuple): 13 | schema = [] 14 | for child_value in value: 15 | child_schema, size = _make_schema_from_value(child_value, condition, size) 16 | schema.append(child_schema) 17 | if isinstance(value, tuple): 18 | schema = tuple(schema) 19 | return schema, size 20 | elif isinstance(value, dict): 21 | schema = {} 22 | for child_key in sorted(value.keys()): 23 | child_value = value[child_key] 24 | child_schema, size = _make_schema_from_value(child_value, condition, size) 25 | schema[child_key] = child_schema 26 | return schema, size 27 | else: 28 | return None, size 29 | 30 | 31 | class Flattener(object): 32 | 33 | def __init__(self, schema, size): 34 | self._schema = schema 35 | self._size = size 36 | 37 | @staticmethod 38 | def from_value(value, condition=_default_condition): 39 | return Flattener(*_make_schema_from_value(value, condition)) 40 | 41 | @staticmethod 42 | def from_dict(x): 43 | return Flattener(x['schema'], x['size']) 44 | 
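    # Usage sketch (illustrative, mirroring the behavior exercised in
    # tests/feature_tests/test_flattener.py): a Flattener maps a nested
    # structure of tensors (tuples/lists/dicts) to a flat list and back.
    #
    #   import torch
    #   value = {'image': torch.ones(1, 3, 4, 4), 'meta': (torch.ones(2), torch.ones(3))}
    #   flattener = Flattener.from_value(value)
    #   flat = flattener.flatten(value)        # [image, meta[0], meta[1]] -- dict keys visited in sorted order
    #   restored = flattener.unflatten(flat)   # same nested structure, containing the same tensor objects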
45 | def dict(self): 46 | return {'schema': self.schema, 'size': self.size} 47 | 48 | @property 49 | def schema(self): 50 | return self._schema 51 | 52 | @property 53 | def size(self): 54 | return self._size 55 | 56 | def __len__(self): 57 | return self._size 58 | 59 | def _flatten(self, value, result): 60 | if isinstance(self._schema, int): 61 | result[self._schema] = value 62 | elif isinstance(self._schema, list) or isinstance(self._schema, tuple): 63 | for child_value, child_schema in zip(value, self._schema): 64 | Flattener(child_schema, self.size)._flatten(child_value, result) 65 | elif isinstance(self._schema, dict): 66 | for key in sorted(self._schema.keys()): 67 | child_value = value[key] 68 | child_schema = self._schema[key] 69 | Flattener(child_schema, self.size)._flatten(child_value, result) 70 | 71 | def flatten(self, value): 72 | result = [None for i in range(self.size)] 73 | self._flatten(value, result) 74 | return result 75 | 76 | def unflatten(self, flattened): 77 | if isinstance(self._schema, int): 78 | return flattened[self._schema] 79 | elif isinstance(self._schema, list) or isinstance(self._schema, tuple): 80 | result = [] 81 | for child_schema in self._schema: 82 | result.append(Flattener(child_schema, self.size).unflatten(flattened)) 83 | if isinstance(self._schema, tuple): 84 | result = tuple(result) 85 | return result 86 | elif isinstance(self._schema, dict): 87 | result = {} 88 | for child_key in sorted(self._schema.keys()): 89 | child_schema = self._schema[child_key] 90 | result[child_key] = Flattener(child_schema, self.size).unflatten(flattened) 91 | return result 92 | else: 93 | return None -------------------------------------------------------------------------------- /plugins/src/example_plugin.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCH2TRT_PLUGIN_EXAMPLE 2 | #define TORCH2TRT_PLUGIN_EXAMPLE 3 | 4 | 5 | #include "NvInfer.h" 6 | #include "NvInferPlugin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #define EXAMPLE_PLUGIN_NAME "ExamplePlugin" 12 | #define EXAMPLE_PLUGIN_VERSION "1" 13 | 14 | 15 | using namespace nvinfer1; 16 | 17 | 18 | namespace torch2trt_plugins { 19 | 20 | 21 | template 22 | void exampleFuncton(T *x, T *y, float scale, int size, cudaStream_t stream=0); 23 | void exampleFunctonHalf(__half *x, __half *y, float scale, int size, cudaStream_t stream=0); 24 | 25 | 26 | class ExamplePlugin : public IPluginV2Ext { 27 | public: 28 | int32_t inputSize; 29 | DataType dataType; 30 | float scale; 31 | std::string pluginNamespace; 32 | 33 | ExamplePlugin(float scale=2.0); 34 | ExamplePlugin(float scale, int32_t inputSize, DataType dataType); 35 | ExamplePlugin(void const* serialData, size_t serialLength); 36 | ~ExamplePlugin(); 37 | 38 | /* IPluginV2 methods */ 39 | 40 | AsciiChar const* getPluginType() const noexcept override; 41 | 42 | AsciiChar const* getPluginVersion() const noexcept override; 43 | 44 | int32_t getNbOutputs() const noexcept override; 45 | 46 | Dims getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept override; 47 | 48 | bool supportsFormat(DataType type, PluginFormat format) const noexcept; 49 | 50 | int32_t initialize() noexcept override; 51 | 52 | void terminate() noexcept override; 53 | 54 | size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override; 55 | 56 | int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, 57 | cudaStream_t stream) noexcept 58 | override; 59 | 60 | size_t 
getSerializationSize() const noexcept override; 61 | 62 | void serialize(void* buffer) const noexcept override; 63 | 64 | void destroy() noexcept override; 65 | 66 | 67 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 68 | 69 | AsciiChar const* getPluginNamespace() const noexcept override; 70 | 71 | // IPluginV2Ext methods 72 | IPluginV2Ext* clone() const noexcept override; 73 | DataType getOutputDataType(int32_t index, DataType const* inputTypes, int32_t nbInputs) const noexcept override; 74 | bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; 75 | bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override; 76 | void configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs, 77 | DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast, 78 | bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept override; 79 | }; 80 | 81 | class ExamplePluginCreator : public IPluginCreator { 82 | private: 83 | PluginFieldCollection fieldCollection; 84 | std::vector fields; 85 | std::string pluginNamespace; 86 | 87 | public: 88 | ExamplePluginCreator(); 89 | 90 | /* IPluginCreator methods */ 91 | AsciiChar const* getPluginName() const noexcept override; 92 | 93 | AsciiChar const* getPluginVersion() const noexcept override; 94 | 95 | PluginFieldCollection const* getFieldNames() noexcept override; 96 | 97 | IPluginV2* createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept override; 98 | 99 | IPluginV2* deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept override; 100 | 101 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 102 | 103 | 104 | AsciiChar const* getPluginNamespace() const noexcept override; 105 | }; 106 | 107 | } 108 | 109 | #endif -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantConv.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | from torch2trt.module_test import add_module_test 3 | import tensorrt as trt 4 | 5 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_conv.IQuantConv2d.forward', enabled=trt_version() >= '7.0') 6 | def convert_QuantConv(ctx): 7 | module = ctx.method_args[0] 8 | input = ctx.method_args[1] 9 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 10 | output = ctx.method_return 11 | 12 | input_dim = input.dim() - 2 13 | 14 | kernel_size = module.kernel_size 15 | if not isinstance(kernel_size, tuple): 16 | kernel_size = (kernel_size, ) * input_dim 17 | 18 | stride = module.stride 19 | if not isinstance(stride, tuple): 20 | stride = (stride, ) * input_dim 21 | 22 | padding = module.padding 23 | if not isinstance(padding, tuple): 24 | padding = (padding, ) * input_dim 25 | 26 | dilation = module.dilation 27 | if not isinstance(dilation, tuple): 28 | dilation = (dilation, ) * input_dim 29 | 30 | kernel = module.weight.detach().cpu().numpy() 31 | 32 | bias = None #trt.Weights(torch_dtype_to_trt(module.weight.dtype)) 33 | if module.bias is not None: 34 | bias = module.bias.detach().cpu().numpy() 35 | 36 | layer = ctx.network.add_convolution_nd( 37 | input=input_trt, 38 | num_output_maps=module.out_channels, 39 | kernel_shape=kernel_size, 40 | kernel=kernel, 41 | bias=bias) 42 | layer.stride_nd = 
stride 43 | layer.padding_nd = padding 44 | layer.dilation_nd = dilation 45 | 46 | if module.groups is not None: 47 | layer.num_groups = module.groups 48 | 49 | if 'qat_mode' in ctx.torch2trt_kwargs: 50 | #Setting dynamic range for conv 51 | w_quant_amax = module._weight_quantizer.learned_amax 52 | layer.precision = trt.int8 53 | layer.set_output_type(0,trt.int8) 54 | conv_out = layer.get_output(0) 55 | conv_out.dynamic_range=(-w_quant_amax,w_quant_amax) 56 | 57 | 58 | output._trt = layer.get_output(0) 59 | 60 | 61 | 62 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 63 | def test_Conv2d_basic_trt7(): 64 | return IQuantConv2d(10, 5, kernel_size=1, stride=1, padding=0) 65 | 66 | ''' 67 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 68 | def test_Conv2d_stride2_trt7(): 69 | return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) 70 | 71 | 72 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 73 | def test_Conv2d_kernel3_trt7(): 74 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) 75 | 76 | 77 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 78 | def test_Conv2d_dilation2_trt7(): 79 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 80 | 81 | 82 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 83 | def test_Conv3d_basic_trt7(): 84 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=1, padding=0) 85 | 86 | 87 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 88 | def test_Conv3d_stride2_trt7(): 89 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=2, padding=0) 90 | 91 | 92 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 93 | def test_Conv3d_kernel3_trt7(): 94 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=2, padding=1) 95 | 96 | 97 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 98 | def test_Conv3d_dilation2_trt7(): 99 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 100 | 101 | ''' 102 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/converters/QuantConvBN.py: -------------------------------------------------------------------------------- 1 | from torch2trt.torch2trt import * 2 | from torch2trt.module_test import add_module_test 3 | import tensorrt as trt 4 | 5 | @tensorrt_converter('torch2trt.contrib.qat.layers.quant_conv.IQuantConvBN2d.forward', enabled=trt_version() >= '7.0') 6 | def convert_QuantConv(ctx): 7 | module = ctx.method_args[0] 8 | input = ctx.method_args[1] 9 | input_trt = add_missing_trt_tensors(ctx.network, [input])[0] 10 | output = ctx.method_return 11 | 12 | input_dim = input.dim() - 2 13 | 14 | kernel_size = module.kernel_size 15 | if not isinstance(kernel_size, tuple): 16 | kernel_size = (kernel_size, ) * input_dim 17 | 18 | stride = module.stride 19 | if not isinstance(stride, tuple): 20 | stride = (stride, ) * input_dim 21 | 22 | padding = module.padding 23 | if not isinstance(padding, tuple): 24 | padding = (padding, ) * input_dim 25 | 26 | dilation = module.dilation 27 | if not isinstance(dilation, tuple): 28 | dilation = 
(dilation, ) * input_dim 29 | 30 | kernel = module.folded_weight.detach().cpu().numpy() 31 | 32 | bias = None #trt.Weights(torch_dtype_to_trt(module.weight.dtype)) 33 | if hasattr(module,'folded_bias'): 34 | bias = module.folded_bias.detach().cpu().numpy() 35 | 36 | layer = ctx.network.add_convolution_nd( 37 | input=input_trt, 38 | num_output_maps=module.out_channels, 39 | kernel_shape=kernel_size, 40 | kernel=kernel, 41 | bias=bias) 42 | layer.stride_nd = stride 43 | layer.padding_nd = padding 44 | layer.dilation_nd = dilation 45 | 46 | if module.groups is not None: 47 | layer.num_groups = module.groups 48 | 49 | if 'qat_mode' in ctx.torch2trt_kwargs: 50 | #Setting dynamic range for conv 51 | w_quant_amax = module._weight_quantizer.learned_amax 52 | layer.precision = trt.int8 53 | layer.set_output_type(0,trt.int8) 54 | conv_out = layer.get_output(0) 55 | conv_out.dynamic_range=(-w_quant_amax,w_quant_amax) 56 | 57 | 58 | output._trt = layer.get_output(0) 59 | 60 | 61 | 62 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 63 | def test_Conv2d_basic_trt7(): 64 | return IQuantConv2d(10, 5, kernel_size=1, stride=1, padding=0) 65 | 66 | ''' 67 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 68 | def test_Conv2d_stride2_trt7(): 69 | return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) 70 | 71 | 72 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 73 | def test_Conv2d_kernel3_trt7(): 74 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) 75 | 76 | 77 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') 78 | def test_Conv2d_dilation2_trt7(): 79 | return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 80 | 81 | 82 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 83 | def test_Conv3d_basic_trt7(): 84 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=1, padding=0) 85 | 86 | 87 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 88 | def test_Conv3d_stride2_trt7(): 89 | return torch.nn.Conv3d(10, 5, kernel_size=1, stride=2, padding=0) 90 | 91 | 92 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 93 | def test_Conv3d_kernel3_trt7(): 94 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=2, padding=1) 95 | 96 | 97 | @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') 98 | def test_Conv3d_dilation2_trt7(): 99 | return torch.nn.Conv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) 100 | 101 | ''' 102 | -------------------------------------------------------------------------------- /plugins/src/reflection_pad_2d_plugin.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCH2TRT_PLUGIN_EXAMPLE 2 | #define TORCH2TRT_PLUGIN_EXAMPLE 3 | 4 | 5 | #include "NvInfer.h" 6 | #include "NvInferPlugin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | #define REFLECTION_PAD_2D_PLUGIN_NAME "ReflectionPad2dPlugin" 12 | #define REFLECTION_PAD_2D_PLUGIN_VERSION "1" 13 | 14 | 15 | using namespace nvinfer1; 16 | 17 | 18 | namespace torch2trt_plugins { 19 | 20 | 21 | template 22 | void reflectionPad2dFunction( 23 | T *x, T *y, 24 | int N, int C, 
int H, int W, 25 | int paddingLeft, int paddingRight, int paddingTop, int paddingBottom, 26 | cudaStream_t stream=0); 27 | 28 | 29 | class ReflectionPad2dPlugin : public IPluginV2Ext { 30 | public: 31 | int32_t outputSize; 32 | DataType dataType; 33 | int32_t paddingLeft; 34 | int32_t paddingRight; 35 | int32_t paddingTop; 36 | int32_t paddingBottom; 37 | std::string pluginNamespace; 38 | Dims3 outputDims; 39 | 40 | ReflectionPad2dPlugin(int32_t paddingLeft, int32_t paddingRight, int32_t paddingTop, int32_t paddingBottom); 41 | ~ReflectionPad2dPlugin(); 42 | 43 | // IPluginV2 methods 44 | 45 | AsciiChar const* getPluginType() const noexcept override; 46 | 47 | AsciiChar const* getPluginVersion() const noexcept override; 48 | 49 | int32_t getNbOutputs() const noexcept override; 50 | 51 | Dims getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept override; 52 | 53 | bool supportsFormat(DataType type, PluginFormat format) const noexcept; 54 | 55 | int32_t initialize() noexcept override; 56 | 57 | void terminate() noexcept override; 58 | 59 | size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override; 60 | 61 | int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, 62 | cudaStream_t stream) noexcept 63 | override; 64 | 65 | size_t getSerializationSize() const noexcept override; 66 | 67 | void serialize(void* buffer) const noexcept override; 68 | 69 | void destroy() noexcept override; 70 | 71 | IPluginV2Ext* clone() const noexcept override; 72 | 73 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 74 | 75 | AsciiChar const* getPluginNamespace() const noexcept override; 76 | 77 | // IPluginV2Ext methods 78 | DataType getOutputDataType(int32_t index, DataType const* inputTypes, int32_t nbInputs) const noexcept override; 79 | bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; 80 | bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override; 81 | void configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs, 82 | DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast, 83 | bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept override; 84 | }; 85 | 86 | class ReflectionPad2dPluginCreator : public IPluginCreator { 87 | private: 88 | PluginFieldCollection fieldCollection; 89 | std::vector fields; 90 | std::string pluginNamespace; 91 | 92 | public: 93 | ReflectionPad2dPluginCreator(); 94 | 95 | /* IPluginCreator methods */ 96 | AsciiChar const* getPluginName() const noexcept override; 97 | 98 | AsciiChar const* getPluginVersion() const noexcept override; 99 | 100 | PluginFieldCollection const* getFieldNames() noexcept override; 101 | 102 | IPluginV2* createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept override; 103 | 104 | IPluginV2* deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept override; 105 | 106 | void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override; 107 | 108 | 109 | AsciiChar const* getPluginNamespace() const noexcept override; 110 | }; 111 | 112 | } 113 | 114 | #endif -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Forms of 
contribution 4 | 5 | ### Submit an Issue 6 | 7 | torch2trt is use case driven. We originally created it to solve 8 | use cases related to NVIDIA Jetson, but the layer support has grown 9 | significantly since its release and we've found that it has 10 | helped many other developers as well. 11 | 12 | The growth of torch2trt has been largely driven by issues submitted on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). 13 | We learn a lot from the reported issues. Submitting an issue is one of the best ways to begin contributing to torch2trt. 14 | 15 | The reported issues are typically one of the following: 16 | 17 | * A bug or unexpected result 18 | * A model with unsupported layers 19 | 20 | If you report an issue, we typically find the following information helpful: 21 | 22 | * PyTorch version 23 | * TensorRT version 24 | * Platform (e.g. Jetson Nano) 25 | * The PyTorch Module you're attempting to convert 26 | * The steps taken to convert the PyTorch module 27 | 28 | If you're not sure how to provide any of these pieces of information, don't worry. Just open the issue 29 | and we're happy to discuss and help work out the details. 30 | 31 | ### Ask a Question 32 | 33 | Another great way to contribute is to ask a question on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). 34 | There are often other developers who share your question, and they may find the discussion helpful. This also 35 | helps us gauge feature interest and identify gaps in documentation. 36 | 37 | ### Submit a Pull Request 38 | 39 | torch2trt is use case driven and has limited maintenance; for this reason, we value community contributions greatly. 40 | Another great way to contribute is by submitting a pull request. The pull requests most likely to be accepted are: 41 | 42 | * A new converter 43 | * A test case 44 | * A bug fix 45 | 46 | If you add a new converter, it is best to include a few test 47 | cases that cross validate the converter against the original PyTorch. We provide a utility function to do this, 48 | as described in the [Custom Converter](usage/custom_converter.md) usage guide. 49 | 50 | Ideally, pull requests solve one thing at a time. This makes it easy 51 | to evaluate the impact that the changes have on the project step-by-step. The more confident we are that 52 | the changes will not adversely impact the experience of other developers, the more likely we are to accept them. 53 | 54 | ## Running module test cases 55 | 56 | Before any change is accepted, we run the test cases on at least one platform. This performs a large number 57 | of cross-validation checks against PyTorch. To do this, run 58 | 59 | ```bash 60 | python3 -m torch2trt.test --name=converters --tolerance=1e-2 61 | ``` 62 | 63 | This will not hard-fail, but it will highlight any build errors or max-error check failures. It is helpful if you include 64 | the status of this command in any pull request, as well as system information like 65 | 66 | * PyTorch version 67 | * TensorRT version 68 | * Platform (e.g. Jetson Nano) 69 | 70 | ## Testing documentation 71 | 72 | If you have a change that modifies the documentation, it is relatively straightforward to test. We 73 | use ``mkdocs-material`` for documentation, which parses markdown files in the ``docs`` folder. 74 | 75 | To view the docs, simply call 76 | 77 | ``` 78 | ./scripts/test_docs.sh 79 | ``` 80 | 81 | And then navigate to ``https://<ip_address>:8000``. 82 | 83 | Please note, this will not include dynamically generated documentation pages like the converters page. 
84 | These contain cross reference links to the GitHub source code. If you want to test these 85 | you can call 86 | 87 | ```bash 88 | ./scripts/build_docs.sh 89 | ``` 90 | 91 | Pointing to the public reflection 92 | of your local repository. For example, if we're working off the upstream master branch, we 93 | would call 94 | 95 | ```bash 96 | ./scripts/build_docs.sh https://github.com/NVIDIA-AI-IOT/torch2trt master 97 | ``` 98 | 99 | If your changes are pushed to your fork, you would do 100 | 101 | ```bash 102 | ./scripts/build_docs.sh https://github.com//torch2trt my_branch 103 | ``` 104 | 105 | -------------------------------------------------------------------------------- /tests/feature_tests/test_flattener.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch2trt.flattener import Flattener 4 | 5 | 6 | def test_flattener_from_value(): 7 | 8 | x = (torch.ones(3), torch.ones(3)) 9 | 10 | flattener = Flattener.from_value(x) 11 | 12 | assert(isinstance(flattener.schema, tuple)) 13 | assert(flattener.schema[0] == 0) 14 | assert(flattener.schema[1] == 1) 15 | 16 | 17 | def test_flattener_tuple(): 18 | 19 | x = (torch.ones(3), torch.ones(3)) 20 | 21 | flattener = Flattener.from_value(x) 22 | 23 | y = flattener.flatten(x) 24 | 25 | assert(len(y) == len(x)) 26 | assert(y[0] is x[0]) 27 | assert(y[1] is x[1]) 28 | 29 | z = flattener.unflatten(y) 30 | 31 | assert(isinstance(z, tuple)) 32 | assert(z[0] is x[0]) 33 | assert(z[1] is x[1]) 34 | 35 | 36 | def test_flattener_list(): 37 | 38 | x = [torch.ones(3), torch.ones(3)] 39 | 40 | flattener = Flattener.from_value(x) 41 | 42 | y = flattener.flatten(x) 43 | 44 | assert(len(y) == len(x)) 45 | assert(y[0] is x[0]) 46 | assert(y[1] is x[1]) 47 | 48 | z = flattener.unflatten(y) 49 | 50 | assert(isinstance(z, list)) 51 | assert(z[0] is x[0]) 52 | assert(z[1] is x[1]) 53 | 54 | 55 | def test_flattener_dict(): 56 | 57 | x = {'a': torch.ones(3), 'b': torch.ones(3)} 58 | 59 | flattener = Flattener.from_value(x) 60 | 61 | y = flattener.flatten(x) 62 | 63 | assert(len(y) == len(x)) 64 | assert((y[0] is x['a'] and y[1] is x['b']) or (y[1] is x['a'] and y[0] is x['b'])) 65 | 66 | z = flattener.unflatten(y) 67 | 68 | assert(isinstance(z, dict)) 69 | assert(z['a'] is x['a']) 70 | assert(z['b'] is x['b']) 71 | 72 | 73 | def test_flattener_nested_tuple(): 74 | 75 | x = (torch.ones(1), (torch.ones(2), torch.ones(3))) 76 | 77 | flattener = Flattener.from_value(x) 78 | 79 | y = flattener.flatten(x) 80 | 81 | assert(len(y) == 3) 82 | 83 | z = flattener.unflatten(y) 84 | 85 | assert(isinstance(z, tuple)) 86 | assert(isinstance(z[1], tuple)) 87 | assert(z[0] is x[0]) 88 | assert(z[1][0] is x[1][0]) 89 | assert(z[1][1] is x[1][1]) 90 | 91 | 92 | def test_flattener_nested_list(): 93 | 94 | x = [torch.ones(1), [torch.ones(2), torch.ones(3)]] 95 | 96 | flattener = Flattener.from_value(x) 97 | 98 | y = flattener.flatten(x) 99 | 100 | assert(len(y) == 3) 101 | 102 | z = flattener.unflatten(y) 103 | 104 | assert(isinstance(z, list)) 105 | assert(isinstance(z[1], list)) 106 | assert(z[0] is x[0]) 107 | assert(z[1][0] is x[1][0]) 108 | assert(z[1][1] is x[1][1]) 109 | assert(z[0] is x[0]) 110 | assert(z[1][0] is x[1][0]) 111 | assert(z[1][1] is x[1][1]) 112 | 113 | 114 | def test_flattener_nested_dict(): 115 | 116 | x = {'a': torch.ones(1), 'b': {'a': torch.ones(2), 'b': torch.ones(3)}} 117 | 118 | flattener = Flattener.from_value(x) 119 | 120 | y = flattener.flatten(x) 121 | 122 | assert(len(y) == 
3) 123 | 124 | z = flattener.unflatten(y) 125 | 126 | assert(isinstance(z, dict)) 127 | assert(isinstance(z['b'], dict)) 128 | assert(z['a'] is x['a']) 129 | assert(z['b']['a'] is x['b']['a']) 130 | assert(z['b']['b'] is x['b']['b']) 131 | 132 | 133 | def test_flattener_heterogeneous(): 134 | 135 | x = { 136 | 'a': (torch.ones(1), {'a': torch.ones(2)}), 137 | 'b': [torch.ones(3), torch.ones(4), (torch.ones(5), {'a': torch.ones(6)})] 138 | } 139 | 140 | flattener = Flattener.from_value(x) 141 | 142 | y = flattener.flatten(x) 143 | 144 | assert(len(y) == 6) 145 | 146 | z = flattener.unflatten(y) 147 | 148 | assert(isinstance(z, dict)) 149 | assert(isinstance(z['a'], tuple)) 150 | assert(z['a'][0] is x['a'][0]) 151 | assert(isinstance(z['a'][1], dict)) 152 | assert(z['a'][1]['a'] is x['a'][1]['a']) 153 | assert(isinstance(z['b'], list)) 154 | assert(z['b'][0] is x['b'][0]) 155 | assert(z['b'][1] is x['b'][1]) 156 | assert(isinstance(z['b'][2], tuple)) 157 | assert(z['b'][2][0] is x['b'][2][0]) 158 | assert(isinstance(z['b'][2][1], dict)) 159 | assert(z['b'][2][1]['a'] is x['b'][2][1]['a']) -------------------------------------------------------------------------------- /tests/feature_tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | from torch2trt.dataset import ( 5 | TensorBatchDataset, 6 | ListDataset, 7 | FolderDataset 8 | ) 9 | from tempfile import mkdtemp 10 | 11 | 12 | def test_dataset_shapes(): 13 | 14 | dataset = ListDataset() 15 | dataset.insert((torch.randn(1, 3, 32, 32), torch.randn(1, 4))) 16 | dataset.insert((torch.randn(1, 3, 64, 64), torch.randn(1, 8))) 17 | dataset.insert((torch.randn(1, 3, 48, 48), torch.randn(1, 6))) 18 | 19 | shapes = dataset.shapes() 20 | 21 | assert(shapes[0][0] == (1, 3, 32, 32)) 22 | assert(shapes[0][1] == (1, 3, 64, 64)) 23 | assert(shapes[1][0] == (1, 4)) 24 | assert(shapes[1][1] == (1, 8)) 25 | 26 | assert(dataset.min_shapes()[0] == (1, 3, 32, 32)) 27 | assert(dataset.min_shapes()[1] == (1, 4)) 28 | assert(dataset.max_shapes()[0] == (1, 3, 64, 64)) 29 | assert(dataset.max_shapes()[1] == (1, 8)) 30 | assert(dataset.median_numel_shapes()[0] == (1, 3, 48, 48)) 31 | assert(dataset.median_numel_shapes()[1] == (1, 6)) 32 | 33 | 34 | def test_dataset_infer_dynamic_axes(): 35 | 36 | dataset = ListDataset() 37 | dataset.insert((torch.randn(1, 3, 32, 32), torch.randn(1, 4))) 38 | dataset.insert((torch.randn(1, 3, 64, 64), torch.randn(1, 8))) 39 | dataset.insert((torch.randn(1, 3, 48, 48), torch.randn(1, 6))) 40 | 41 | dynamic_axes = dataset.infer_dynamic_axes() 42 | 43 | assert(dynamic_axes[0] == [2, 3]) 44 | assert(dynamic_axes[1] == [1]) 45 | 46 | 47 | def test_tensor_batch_dataset_record(): 48 | 49 | dataset = TensorBatchDataset() 50 | 51 | class TestModule(nn.Module): 52 | def __init__(self): 53 | super().__init__() 54 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 55 | 56 | def forward(self, x, y): 57 | a = self.conv(x) 58 | b = self.conv(y) 59 | return torch.cat([a, b], dim=0) 60 | 61 | inputs = [ 62 | torch.randn(1, 3, 32, 32).cuda(), 63 | torch.randn(1, 3, 32, 32).cuda() 64 | ] 65 | 66 | module = TestModule().cuda().eval() 67 | 68 | with dataset.record(module): 69 | for i in range(5): 70 | module(*inputs) 71 | 72 | assert(len(dataset) == 5) 73 | assert(len(dataset[0]) == 2) 74 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 75 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 76 | 77 | 78 | def 
test_list_dataset_record(): 79 | 80 | dataset = ListDataset() 81 | 82 | class TestModule(nn.Module): 83 | def __init__(self): 84 | super().__init__() 85 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 86 | 87 | def forward(self, x, y): 88 | a = self.conv(x) 89 | b = self.conv(y) 90 | return torch.cat([a, b], dim=0) 91 | 92 | inputs = [ 93 | torch.randn(1, 3, 32, 32).cuda(), 94 | torch.randn(1, 3, 32, 32).cuda() 95 | ] 96 | 97 | module = TestModule().cuda().eval() 98 | 99 | with dataset.record(module): 100 | for i in range(5): 101 | module(*inputs) 102 | 103 | assert(len(dataset) == 5) 104 | assert(len(dataset[0]) == 2) 105 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 106 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 107 | 108 | 109 | def test_folder_dataset_record(): 110 | 111 | dataset = FolderDataset(mkdtemp()) 112 | 113 | class TestModule(nn.Module): 114 | def __init__(self): 115 | super().__init__() 116 | self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1).cuda().eval() 117 | 118 | def forward(self, x, y): 119 | a = self.conv(x) 120 | b = self.conv(y) 121 | return torch.cat([a, b], dim=0) 122 | 123 | device = torch.device('cuda:0') 124 | 125 | inputs = [ 126 | torch.randn(1, 3, 32, 32, device=device), 127 | torch.randn(1, 3, 32, 32, device=device) 128 | ] 129 | 130 | module = TestModule().to(device).eval() 131 | 132 | with dataset.record(module): 133 | for i in range(5): 134 | module(*inputs) 135 | 136 | assert(len(dataset) == 5) 137 | assert(len(dataset[0]) == 2) 138 | assert(dataset[0][0].shape == (1, 3, 32, 32)) 139 | assert(dataset[0][1].shape == (1, 3, 32, 32)) 140 | assert(dataset[0][0].device == device) -------------------------------------------------------------------------------- /benchmarks/JETSON_XAVIER.md: -------------------------------------------------------------------------------- 1 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 2 | |------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 3 | | torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | 4 | | torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | 5 | | torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | 6 | | torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | 7 | | torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | 8 | | torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | 9 | | torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | 10 | | torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | 11 | | torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 
| 76.6 | 47.5 | 13 | 12 | | torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | 13 | | torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | 14 | | torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | 15 | | torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 201 | 12.1 | 5.24 | 16 | | torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | 17 | | torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | 18 | | torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | 19 | | torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | 20 | | torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | 21 | | torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | 22 | | torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | 23 | | torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | 24 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | 25 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | 26 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | 27 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | 28 | | torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | 29 | | torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | 30 | | torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | 31 | | torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | -------------------------------------------------------------------------------- /docs/benchmarks/jetson_xavier.md: -------------------------------------------------------------------------------- 1 | # Jetson Xavier 2 | 3 | | Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 4 | 
|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| 5 | | torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | 6 | | torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | 7 | | torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | 8 | | torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | 9 | | torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | 10 | | torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | 11 | | torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | 12 | | torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | 13 | | torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 | 76.6 | 47.5 | 13 | 14 | | torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | 15 | | torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | 16 | | torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | 17 | | torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 201 | 12.1 | 5.24 | 18 | | torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | 19 | | torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | 20 | | torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | 21 | | torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | 22 | | torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | 23 | | torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | 24 | | torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | 25 | | torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | 26 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | 27 | | 
torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | 28 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | 29 | | torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | 30 | | torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | 31 | | torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | 32 | | torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | 33 | | torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | 34 | -------------------------------------------------------------------------------- /tests/model_tests/torchvision/test_classification_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch2trt 4 | 5 | 6 | def _cross_validate_module(model, shape=(224, 224)): 7 | model = model.cuda().eval() 8 | data = torch.randn(1, 3, *shape).cuda() 9 | model_trt = torch2trt.torch2trt(model, [data]) 10 | data = torch.randn(1, 3, *shape).cuda() 11 | out = model(data) 12 | out_trt = model_trt(data) 13 | assert torch.allclose(out, out_trt, rtol=1e-1, atol=1e-1) 14 | 15 | 16 | 17 | def test_alexnet(): 18 | model = torchvision.models.alexnet(pretrained=False) 19 | _cross_validate_module(model) 20 | 21 | 22 | def test_squeezenet1_0(): 23 | model = torchvision.models.squeezenet1_0(pretrained=False) 24 | _cross_validate_module(model) 25 | 26 | 27 | def test_squeezenet1_1(): 28 | model = torchvision.models.squeezenet1_1(pretrained=False) 29 | _cross_validate_module(model) 30 | 31 | 32 | def test_resnet18(): 33 | model = torchvision.models.resnet18(pretrained=False) 34 | _cross_validate_module(model) 35 | 36 | 37 | def test_resnet34(): 38 | model = torchvision.models.resnet34(pretrained=False) 39 | _cross_validate_module(model) 40 | 41 | 42 | def test_resnet50(): 43 | model = torchvision.models.resnet50(pretrained=False) 44 | _cross_validate_module(model) 45 | 46 | 47 | def test_resnet101(): 48 | model = torchvision.models.resnet101(pretrained=False) 49 | _cross_validate_module(model) 50 | 51 | 52 | def test_resnet152(): 53 | model = torchvision.models.resnet152(pretrained=False) 54 | _cross_validate_module(model) 55 | 56 | 57 | def test_densenet121(): 58 | model = torchvision.models.densenet121(pretrained=False) 59 | _cross_validate_module(model) 60 | 61 | 62 | def test_densenet169(): 63 | model = torchvision.models.densenet169(pretrained=False) 64 | _cross_validate_module(model) 65 | 66 | 67 | def test_densenet201(): 68 | model = torchvision.models.densenet201(pretrained=False) 69 | _cross_validate_module(model) 70 | 71 | 72 | def test_densenet161(): 73 | model = torchvision.models.densenet161(pretrained=False) 74 | _cross_validate_module(model) 75 | 76 | 77 | def test_vgg11(): 78 | model = torchvision.models.vgg11(pretrained=False) 79 | _cross_validate_module(model) 80 | 81 | 82 | def test_vgg13(): 83 | model = 
torchvision.models.vgg13(pretrained=False) 84 | _cross_validate_module(model) 85 | 86 | 87 | def test_vgg16(): 88 | model = torchvision.models.vgg16(pretrained=False) 89 | _cross_validate_module(model) 90 | 91 | 92 | def test_vgg19(): 93 | model = torchvision.models.vgg19(pretrained=False) 94 | _cross_validate_module(model) 95 | 96 | 97 | def test_vgg11_bn(): 98 | model = torchvision.models.vgg11_bn(pretrained=False) 99 | _cross_validate_module(model) 100 | 101 | 102 | def test_vgg13_bn(): 103 | model = torchvision.models.vgg13_bn(pretrained=False) 104 | _cross_validate_module(model) 105 | 106 | 107 | def test_vgg16_bn(): 108 | model = torchvision.models.vgg16_bn(pretrained=False) 109 | _cross_validate_module(model) 110 | 111 | 112 | def test_vgg19_bn(): 113 | model = torchvision.models.vgg19_bn(pretrained=False) 114 | _cross_validate_module(model) 115 | 116 | 117 | def mobilenet_v2(): 118 | model = torchvision.models.mobilenet_v2(pretrained=False) 119 | _cross_validate_module(model) 120 | 121 | 122 | def test_shufflenet_v2_x0_5(): 123 | model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) 124 | _cross_validate_module(model) 125 | 126 | 127 | def test_shufflenet_v2_x1_0(): 128 | model = torchvision.models.shufflenet_v2_x1_0(pretrained=False) 129 | _cross_validate_module(model) 130 | 131 | 132 | def test_shufflenet_v2_x1_5(): 133 | model = torchvision.models.shufflenet_v2_x1_5(pretrained=False) 134 | _cross_validate_module(model) 135 | 136 | 137 | def test_shufflenet_v2_x2_0(): 138 | model = torchvision.models.shufflenet_v2_x2_0(pretrained=False) 139 | _cross_validate_module(model) 140 | 141 | 142 | def test_mnasnet0_5(): 143 | model = torchvision.models.mnasnet0_5(pretrained=False) 144 | _cross_validate_module(model) 145 | 146 | 147 | def test_mnasnet0_75(): 148 | model = torchvision.models.mnasnet0_75(pretrained=False) 149 | _cross_validate_module(model) 150 | 151 | 152 | def test_mnasnet1_0(): 153 | model = torchvision.models.mnasnet1_0(pretrained=False) 154 | _cross_validate_module(model) 155 | 156 | 157 | def test_mnasnet1_3(): 158 | model = torchvision.models.mnasnet1_3(pretrained=False) 159 | _cross_validate_module(model) -------------------------------------------------------------------------------- /examples/image_classification/conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "First, we create the pre-trained ImageNet model. We'll use ``resnet18`` from the torchvision package. Make sure to set the device to ``cuda``, since the inputs and parameter devices are inferred from model. Also make sure to set ``eval()`` to fix batch norm statistics." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torchvision\n", 17 | "\n", 18 | "model = torchvision.models.resnet18(pretrained=True).cuda().half().eval()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Next, we create some sample input that will be used to infer the shape and data types of our TensorRT engine" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import torch\n", 35 | "\n", 36 | "data = torch.randn((1, 3, 224, 224)).cuda().half()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Finally, create the optimized TensorRT engine." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from torch2trt import torch2trt\n", 53 | "\n", 54 | "model_trt = torch2trt(model, [data], fp16_mode=True)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We can execute the network like this" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "output_trt = model_trt(data)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "And check against the original output" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 8, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "tensor([ 0.7231, 3.0195, 3.1016, 3.1152, 4.7539, 3.8301, 3.9180, 0.3086,\n", 90 | " -0.8726, -0.2261], device='cuda:0', dtype=torch.float16,\n", 91 | " grad_fn=)\n", 92 | "tensor([ 0.7202, 3.0234, 3.1074, 3.1133, 4.7539, 3.8340, 3.9141, 0.3081,\n", 93 | " -0.8716, -0.2227], device='cuda:0', dtype=torch.float16)\n", 94 | "max error: 0.011719\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "output = model(data)\n", 100 | "\n", 101 | "print(output.flatten()[0:10])\n", 102 | "print(output_trt.flatten()[0:10])\n", 103 | "print('max error: %f' % float(torch.max(torch.abs(output - output_trt))))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "We can save the model like this" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "torch.save(model_trt.state_dict(), 'resnet18_trt.pth')" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "And load the model like this." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from torch2trt import TRTModule\n", 136 | "\n", 137 | "model_trt = TRTModule()\n", 138 | "\n", 139 | "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "That's it for this notebook! Try out the live demo to see real-time classification on a video feed." 
147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.6.7" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 2 171 | } 172 | -------------------------------------------------------------------------------- /plugins/src/reflection_pad_2d_plugin_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reflection_pad_2d_plugin.h" 3 | #include 4 | #include "NvInfer.h" 5 | #include 6 | 7 | 8 | using namespace torch2trt_plugins; 9 | 10 | 11 | TEMPLATE_TEST_CASE("Test reflection pad kernel", "[ReflectionPad2d][template]" , float) { 12 | TestType x_cpu[9] = { 13 | 0, 1, 2, 14 | 3, 4, 5, 15 | 6, 7, 8 16 | }; 17 | TestType y_cpu[25]; 18 | TestType *x_gpu; 19 | TestType *y_gpu; 20 | TestType y_cpu_gt[25] = { 21 | 4, 3, 4, 5, 4, 22 | 1, 0, 1, 2, 1, 23 | 4, 3, 4, 5, 4, 24 | 7, 6, 7, 8, 7, 25 | 4, 3, 4, 5, 4 26 | }; 27 | 28 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 29 | cudaMalloc((void**)&x_gpu, 9 * sizeof(TestType)); 30 | cudaMalloc((void**)&y_gpu, 25 * sizeof(TestType)); 31 | cudaMemcpy(x_gpu, x_cpu, 9 * sizeof(TestType), cudaMemcpyHostToDevice); 32 | 33 | reflectionPad2dFunction(x_gpu, y_gpu, 34 | 1, 1, 5, 5, 35 | 1, 1, 1, 1); 36 | 37 | cudaMemcpy(y_cpu, y_gpu, 25 * sizeof(TestType), cudaMemcpyDeviceToHost); 38 | for (int i = 0; i < 25; i++) { 39 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 40 | } 41 | cudaFree(x_gpu); 42 | cudaFree(y_gpu); 43 | } 44 | 45 | TEMPLATE_TEST_CASE("Test reflection pad plugin enqueue", "[ReflectionPad2d][template]" , float) { 46 | TestType x_cpu[9] = { 47 | 0, 1, 2, 48 | 3, 4, 5, 49 | 6, 7, 8 50 | }; 51 | TestType y_cpu[25]; 52 | TestType *x_gpu; 53 | TestType *y_gpu; 54 | TestType y_cpu_gt[25] = { 55 | 4, 3, 4, 5, 4, 56 | 1, 0, 1, 2, 1, 57 | 4, 3, 4, 5, 4, 58 | 7, 6, 7, 8, 7, 59 | 4, 3, 4, 5, 4 60 | }; 61 | 62 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 63 | cudaMalloc((void**)&x_gpu, 9 * sizeof(TestType)); 64 | cudaMalloc((void**)&y_gpu, 25 * sizeof(TestType)); 65 | cudaMemcpy(x_gpu, x_cpu, 9 * sizeof(TestType), cudaMemcpyHostToDevice); 66 | 67 | auto plugin = ReflectionPad2dPlugin(1, 1, 1, 1); 68 | Dims3 inputDims(1, 3, 3); 69 | Dims3 outputDims(1, 5, 5); 70 | DataType inputTypes = DataType::kFLOAT; 71 | DataType outputTypes = DataType::kFLOAT; 72 | bool inputIsBroadcast = false; 73 | bool outputIsBroadcast = false; 74 | plugin.configurePlugin( 75 | &inputDims, 1, 76 | &outputDims, 1, 77 | &inputTypes, 78 | &outputTypes, 79 | &inputIsBroadcast, 80 | &outputIsBroadcast, 81 | PluginFormat::kLINEAR, 82 | 1 83 | ); 84 | 85 | void *inputs[] = {(void*)x_gpu}; 86 | void *outputs[] = {(void*)y_gpu}; 87 | plugin.enqueue(1, inputs, outputs, nullptr, 0); 88 | 89 | cudaMemcpy(y_cpu, y_gpu, 25 * sizeof(TestType), cudaMemcpyDeviceToHost); 90 | for (int i = 0; i < 25; i++) { 91 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 92 | } 93 | cudaFree(x_gpu); 94 | cudaFree(y_gpu); 95 | } 96 | 97 | TEMPLATE_TEST_CASE("Test reflection pad plugin enqueue 2 channels", "[ReflectionPad2d][template]" , float) { 98 | TestType x_cpu[9*2] = { 99 | 0, 1, 2, 100 | 3, 4, 5, 101 | 6, 7, 8, 102 | 0, 1, 2, 103 | 3, 4, 5, 104 | 6, 7, 8 105 
| }; 106 | TestType y_cpu[25*2]; 107 | TestType *x_gpu; 108 | TestType *y_gpu; 109 | TestType y_cpu_gt[25*2] = { 110 | 4, 3, 4, 5, 4, 111 | 1, 0, 1, 2, 1, 112 | 4, 3, 4, 5, 4, 113 | 7, 6, 7, 8, 7, 114 | 4, 3, 4, 5, 4, 115 | 4, 3, 4, 5, 4, 116 | 1, 0, 1, 2, 1, 117 | 4, 3, 4, 5, 4, 118 | 7, 6, 7, 8, 7, 119 | 4, 3, 4, 5, 4 120 | }; 121 | 122 | // y_cpu = (TestType*) malloc(16 * sizeof(TestType)); 123 | cudaMalloc((void**)&x_gpu, 2*9 * sizeof(TestType)); 124 | cudaMalloc((void**)&y_gpu, 2*25 * sizeof(TestType)); 125 | cudaMemcpy(x_gpu, x_cpu, 2*9 * sizeof(TestType), cudaMemcpyHostToDevice); 126 | 127 | auto plugin = ReflectionPad2dPlugin(1, 1, 1, 1); 128 | Dims3 inputDims(2, 3, 3); 129 | Dims3 outputDims(2, 5, 5); 130 | DataType inputTypes = DataType::kFLOAT; 131 | DataType outputTypes = DataType::kFLOAT; 132 | bool inputIsBroadcast = false; 133 | bool outputIsBroadcast = false; 134 | plugin.configurePlugin( 135 | &inputDims, 1, 136 | &outputDims, 1, 137 | &inputTypes, 138 | &outputTypes, 139 | &inputIsBroadcast, 140 | &outputIsBroadcast, 141 | PluginFormat::kLINEAR, 142 | 1 143 | ); 144 | 145 | void *inputs[] = {(void*)x_gpu}; 146 | void *outputs[] = {(void*)y_gpu}; 147 | plugin.enqueue(1, inputs, outputs, nullptr, 0); 148 | 149 | cudaMemcpy(y_cpu, y_gpu, 2*25 * sizeof(TestType), cudaMemcpyDeviceToHost); 150 | for (int i = 0; i < 2*25; i++) { 151 | REQUIRE(y_cpu[i] == y_cpu_gt[i]); 152 | } 153 | cudaFree(x_gpu); 154 | cudaFree(y_gpu); 155 | } 156 | -------------------------------------------------------------------------------- /tests/feature_tests/test_tensor_shape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from torch2trt import ( 5 | torch2trt, 6 | trt, 7 | SizeWrapper, 8 | tensorrt_converter 9 | ) 10 | 11 | 12 | def test_tensor_shape_view_trivial(): 13 | 14 | class TestModule(torch.nn.Module): 15 | def forward(self, x): 16 | size = x.size() 17 | return x.view(size) 18 | 19 | module = TestModule().cuda().eval() 20 | 21 | x = torch.randn(1, 3, 32, 32).cuda() 22 | 23 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 24 | 25 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 26 | 27 | x = torch.randn(1, 3, 32, 32).cuda() 28 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 29 | 30 | x = torch.randn(4, 3, 32, 32).cuda() 31 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 32 | 33 | 34 | def test_tensor_shape_view_mul(): 35 | 36 | class TestModule(torch.nn.Module): 37 | def forward(self, x): 38 | size = x.size() 39 | return x.view(size[0] * size[1], size[2] * size[3]) 40 | 41 | module = TestModule().cuda().eval() 42 | 43 | x = torch.randn(1, 3, 32, 32).cuda() 44 | 45 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 46 | 47 | x = torch.randn(1, 3, 32, 32).cuda() 48 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 49 | 50 | x = torch.randn(4, 3, 32, 32).cuda() 51 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 52 | 53 | 54 | def test_tensor_shape_view_mul(): 55 | 56 | class TestModule(torch.nn.Module): 57 | def forward(self, x): 58 | size = x.size() 59 | return x.view(size[0] * size[1], size[2] * size[3]) 60 | 61 | module = TestModule().cuda().eval() 62 | 63 | x = torch.randn(1, 3, 32, 32).cuda() 64 | 65 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 66 | 67 
| x = torch.randn(1, 3, 32, 32).cuda() 68 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 69 | 70 | x = torch.randn(4, 3, 32, 32).cuda() 71 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 72 | 73 | 74 | def test_tensor_shape_view_mul_cast(): 75 | 76 | class TestModule(torch.nn.Module): 77 | def forward(self, x): 78 | size = x.size() 79 | return x.view(size[0] * int(size[1]), int(size[2] * size[3])) 80 | 81 | module = TestModule().cuda().eval() 82 | 83 | x = torch.randn(1, 3, 32, 32).cuda() 84 | 85 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 86 | 87 | x = torch.randn(1, 3, 32, 32).cuda() 88 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 89 | 90 | x = torch.randn(4, 3, 32, 32).cuda() 91 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 92 | 93 | 94 | def test_tensor_shape_view_mul_const_lhs(): 95 | 96 | class TestModule(torch.nn.Module): 97 | def forward(self, x): 98 | size = x.size() 99 | return x.view(size[0] * 1, size[1], size[2] * size[3]) 100 | 101 | module = TestModule().cuda().eval() 102 | 103 | x = torch.randn(1, 3, 32, 32).cuda() 104 | 105 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 106 | 107 | x = torch.randn(1, 3, 32, 32).cuda() 108 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 109 | 110 | x = torch.randn(4, 3, 32, 32).cuda() 111 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 112 | 113 | 114 | def test_tensor_shape_view_mul_const_rhs(): 115 | 116 | class TestModule(torch.nn.Module): 117 | def forward(self, x): 118 | size = x.size() 119 | return x.view(1 * size[0], size[1], size[2] * size[3]) 120 | 121 | module = TestModule().cuda().eval() 122 | 123 | x = torch.randn(1, 3, 32, 32).cuda() 124 | 125 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 126 | 127 | x = torch.randn(1, 3, 32, 32).cuda() 128 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 129 | 130 | x = torch.randn(4, 3, 32, 32).cuda() 131 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 132 | 133 | 134 | def test_tensor_shape_view_static(): 135 | 136 | class TestModule(torch.nn.Module): 137 | def forward(self, x): 138 | size = x.size() 139 | return x.view(1, 3, 32, 32) 140 | 141 | module = TestModule().cuda().eval() 142 | 143 | x = torch.randn(1, 3, 32, 32).cuda() 144 | 145 | module_trt = torch2trt(module, [x], log_level=trt.Logger.VERBOSE, max_batch_size=4) 146 | 147 | x = torch.randn(1, 3, 32, 32).cuda() 148 | assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 149 | 150 | # x = torch.randn(4, 3, 32, 32).cuda() 151 | # assert(torch.allclose(module_trt(x), module(x), atol=1e-2, rtol=1e-2)) 152 | 153 | 154 | if __name__ == '__main__': 155 | 156 | test_tensor_shape_view_mul() -------------------------------------------------------------------------------- /tests/converter_tests/test_getitem.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | from torch2trt import torch2trt, trt 5 | 6 | 7 | class YOLOXFocusTestModule(nn.Module): 8 | 9 | 10 | def forward(self, x): 11 | patch_top_left = x[..., ::2, ::2] 12 | patch_top_right = x[..., ::2, 1::2] 13 | patch_bot_left = x[..., 1::2, ::2] 14 | patch_bot_right = x[..., 1::2, 1::2] 15 | x = torch.cat( 16 | ( 17 | patch_top_left, 18 | patch_bot_left, 19 | 
patch_top_right, 20 | patch_bot_right, 21 | ), 22 | dim=1, 23 | ) 24 | return x 25 | 26 | 27 | def test_getitem_dynamic_yolox_layer(): 28 | 29 | class YOLOXFocusTestModule(nn.Module): 30 | 31 | 32 | def forward(self, x): 33 | patch_top_left = x[..., ::2, ::2] 34 | patch_top_right = x[..., ::2, 1::2] 35 | patch_bot_left = x[..., 1::2, ::2] 36 | patch_bot_right = x[..., 1::2, 1::2] 37 | x = torch.cat( 38 | ( 39 | patch_top_left, 40 | patch_bot_left, 41 | patch_top_right, 42 | patch_bot_right, 43 | ), 44 | dim=1, 45 | ) 46 | return x 47 | 48 | module = YOLOXFocusTestModule().cuda().eval() 49 | 50 | data = torch.randn(1, 3, 112, 112).cuda() 51 | 52 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 53 | 54 | data = torch.randn(1, 3, 112, 112).cuda() 55 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 56 | 57 | data = torch.randn(4, 3, 112, 112).cuda() 58 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 59 | 60 | 61 | def test_getitem_dynamic_add_dim(): 62 | 63 | class TestModule(nn.Module): 64 | 65 | 66 | def forward(self, x): 67 | patch_top_left = x[..., None] 68 | patch_top_right = x[..., None] 69 | patch_bot_left = x[..., None] 70 | patch_bot_right = x[..., None] 71 | x = torch.cat( 72 | ( 73 | patch_top_left, 74 | patch_bot_left, 75 | patch_top_right, 76 | patch_bot_right, 77 | ), 78 | dim=1, 79 | ) 80 | return x 81 | 82 | module = TestModule().cuda().eval() 83 | 84 | data = torch.randn(1, 3, 112, 112).cuda() 85 | 86 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 87 | 88 | data = torch.randn(1, 3, 112, 112).cuda() 89 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 90 | 91 | data = torch.randn(4, 3, 112, 112).cuda() 92 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 93 | 94 | 95 | def test_getitem_dynamic_remove_dim(): 96 | 97 | class TestModule(nn.Module): 98 | 99 | 100 | def forward(self, x): 101 | patch_top_left = x[..., 0] 102 | patch_top_right = x[..., 0] 103 | patch_bot_left = x[..., 0] 104 | patch_bot_right = x[..., 0] 105 | x = torch.cat( 106 | ( 107 | patch_top_left, 108 | patch_bot_left, 109 | patch_top_right, 110 | patch_bot_right, 111 | ), 112 | dim=1, 113 | ) 114 | return x 115 | 116 | module = TestModule().cuda().eval() 117 | 118 | data = torch.randn(1, 3, 112, 112).cuda() 119 | 120 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 121 | 122 | data = torch.randn(1, 3, 112, 112).cuda() 123 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 124 | 125 | data = torch.randn(4, 3, 112, 112).cuda() 126 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 127 | 128 | 129 | def test_getitem_dynamic_remove_add_dim(): 130 | 131 | class TestModule(nn.Module): 132 | 133 | 134 | def forward(self, x): 135 | patch_top_left = x[..., 0, None] 136 | patch_top_right = x[..., 0, None] 137 | patch_bot_left = x[..., 0, None] 138 | patch_bot_right = x[..., 0, None] 139 | x = torch.cat( 140 | ( 141 | patch_top_left, 142 | patch_bot_left, 143 | patch_top_right, 144 | patch_bot_right, 145 | ), 146 | dim=1, 147 | ) 148 | return x 149 | 150 | module = TestModule().cuda().eval() 151 | 152 | data = torch.randn(1, 3, 112, 112).cuda() 153 | 154 | module_trt = torch2trt(module, [data], max_batch_size=4, log_level=trt.Logger.VERBOSE) 155 | 156 | data = torch.randn(1, 3, 112, 112).cuda() 157 | 
assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 158 | 159 | data = torch.randn(4, 3, 112, 112).cuda() 160 | assert(torch.allclose(module_trt(data), module(data), atol=1e-4, rtol=1e-4)) 161 | 162 | -------------------------------------------------------------------------------- /torch2trt/contrib/qat/layers/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | import inspect 4 | 5 | from absl import logging 6 | 7 | from torch import nn 8 | 9 | from pytorch_quantization.nn import TensorQuantizer as TQ 10 | from pytorch_quantization.tensor_quant import QuantDescriptor, QUANT_DESC_8BIT_PER_TENSOR 11 | 12 | ''' 13 | Currently Nvidia quantization library quantizes the input of the conv layer as opposed to output of ReLU. 14 | utilities classes and functions mentioned below are going to help us map int8 layers correctly to TensorRT layers. 15 | ''' 16 | 17 | class QuantWeightMixin(): 18 | """Mixin class for adding basic quantization logic to quantized modules""" 19 | 20 | default_quant_desc_weight = QUANT_DESC_8BIT_PER_TENSOR 21 | 22 | @classmethod 23 | def set_default_quant_desc_input(cls, value): 24 | """ 25 | Args: 26 | value: An instance of :class:`QuantDescriptor ` 27 | """ 28 | if not isinstance(value, QuantDescriptor): 29 | raise ValueError("{} is not an instance of QuantDescriptor!") 30 | cls.default_quant_desc_weight = copy.deepcopy(value) 31 | 32 | def init_quantizer(self, quant_desc_weight): 33 | """Helper function for __init__ of simple quantized module 34 | 35 | Create weight quantizer based on quant_desc passed by kwargs, or default of the class. 36 | 37 | Args: 38 | quant_desc_weight: An instance of :class:`QuantDescriptor ` 39 | """ 40 | if not inspect.stack()[1].function == "__init__": 41 | raise TypeError("{} should be only called by __init__ of quantized module.".format(__name__)) 42 | self._fake_quant = True 43 | if not quant_desc_weight.fake_quant: 44 | raise ValueError("Only fake quantization is supported!") 45 | 46 | logging.info("Input is %squantized to %d bits in %s with axis %s!", "" 47 | if not quant_desc_weight.fake_quant else "fake ", 48 | quant_desc_weight.num_bits, self.__class__.__name__, quant_desc_weight.axis) 49 | 50 | self._weight_quantizer = TQ(quant_desc_weight) 51 | 52 | # pylint:disable=missing-docstring 53 | @property 54 | def weight_quantizer(self): 55 | return self._weight_quantizer 56 | # pylint:enable=missing-docstring 57 | 58 | 59 | def pop_quant_desc_in_kwargs(quant_cls, input_only=False,weight_only=False, **kwargs): 60 | """Pop quant descriptors in kwargs 61 | 62 | If there is no descriptor in kwargs, the default one in quant_cls will be used 63 | 64 | Arguments: 65 | quant_cls: A class that has default quantization descriptors 66 | input_only: A boolean. If True, pop quant_desc_input only, not quant_desc_weight. Default false. 67 | 68 | Keyword Arguments: 69 | quant_desc_input: An instance of :class:`QuantDescriptor `. 70 | Quantization descriptor of input. 71 | quant_desc_weight: An instance of :class:`QuantDescriptor `. 72 | Quantization descriptor of weight. 
73 | 74 | Note: Original function doesnt pop quant_desc_weight 75 | """ 76 | if input_only: 77 | quant_desc_input = kwargs.pop('quant_desc_input', quant_cls.default_quant_desc_input) 78 | elif weight_only: 79 | quant_desc_weight = kwargs.pop('quant_desc_weight', quant_cls.default_quant_desc_weight) 80 | else: 81 | quant_desc_input = kwargs.pop('quant_desc_input', quant_cls.default_quant_desc_input) 82 | quant_desc_weight = kwargs.pop('quant_desc_weight', quant_cls.default_quant_desc_weight) 83 | 84 | 85 | # Check if anything is left in **kwargs 86 | if kwargs: 87 | raise TypeError("Unused keys: {}".format(kwargs.keys())) 88 | 89 | if input_only: 90 | return quant_desc_input 91 | 92 | if weight_only: 93 | return quant_desc_weight 94 | 95 | return quant_desc_input, quant_desc_weight 96 | 97 | 98 | 99 | ''' 100 | Inference Layers: At inference time, we dont need to carry entire qat library. We only need dynamic range so that layers 101 | can be mapped to TRT layers at INT8. 102 | ''' 103 | 104 | class TensorQuantizer(torch.nn.Module): 105 | def __init__(self): 106 | super().__init__() 107 | self.register_buffer('learned_amax',torch.tensor(1.0)) 108 | 109 | class QuantMixin(): 110 | def init_quantizer(self): 111 | self._input_quantizer = TensorQuantizer() 112 | self._weight_quantizer = TensorQuantizer() 113 | 114 | @property 115 | def input_quantizer(self): 116 | return self._input_quantizer 117 | 118 | @property 119 | def weight_quantizer(self): 120 | return self._weight_quantizer 121 | 122 | class QuantMixinInput(): 123 | def init_quantizer(self): 124 | self._input_quantizer = TensorQuantizer() 125 | 126 | @property 127 | def input_quantizer(self): 128 | return self._input_quantizer 129 | 130 | class QuantMixinWeight(): 131 | def init_quantizer(self): 132 | self._weight_quantizer = TensorQuantizer() 133 | 134 | @property 135 | def weight_quantizer(self): 136 | return self._weight_quantizer 137 | 138 | 139 | -------------------------------------------------------------------------------- /examples/image_classification/live_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook will run a live demo on Jetson Nano using [JetCam](https://github.com/NVIDIA-AI-IOT/jetcam) to acquire images from the camera. First,\n", 8 | "let's start the camera. See the JetCam examples for details." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from jetcam.csi_camera import CSICamera\n", 18 | "# from jetcam.usb_camera import USBCamera\n", 19 | "\n", 20 | "camera = CSICamera(width=224, height=224)\n", 21 | "# camera = USBCamera(width=224, height=224)\n", 22 | "\n", 23 | "camera.running = True" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Now, let's connect the camera's value to a widget to display." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from jetcam.utils import bgr8_to_jpeg\n", 40 | "import traitlets\n", 41 | "import ipywidgets\n", 42 | "\n", 43 | "image_w = ipywidgets.Image()\n", 44 | "\n", 45 | "traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", 46 | "\n", 47 | "display(image_w)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Next, we'll load the TensorRT model. (We assume you followed the conversion notebook and saved to the path ``resnet18_trt.pth``)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import torch\n", 64 | "from torch2trt import TRTModule\n", 65 | "\n", 66 | "model_trt = TRTModule()\n", 67 | "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "The following function will be used to pre-process images from the camera" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import cv2\n", 84 | "import numpy as np\n", 85 | "import torchvision\n", 86 | "\n", 87 | "device = torch.device('cuda')\n", 88 | "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n", 89 | "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n", 90 | "\n", 91 | "normalize = torchvision.transforms.Normalize(mean, stdev)\n", 92 | "\n", 93 | "def preprocess(camera_value):\n", 94 | " global device, normalize\n", 95 | " x = camera_value\n", 96 | " x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n", 97 | " x = x.transpose((2, 0, 1))\n", 98 | " x = torch.from_numpy(x).float()\n", 99 | " x = normalize(x)\n", 100 | " x = x.to(device)\n", 101 | " x = x[None, ...]\n", 102 | " return x" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "This text area will be used to display the class predictions." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "text = ipywidgets.Textarea()\n", 119 | "display(text)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "We load the imagenet labels to associate the neural network output with a class name." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import json\n", 136 | "\n", 137 | "with open('imagenet_labels.json', 'r') as f:\n", 138 | " labels = json.load(f)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Finally, we create our execution function, which we attach as a callback to the camera's ``value`` attribute.\n", 146 | "\n", 147 | "Whenever the camera's value is updated (which it will be for each frame, since we set ``camera.running = True``). This function will be called\n", 148 | "describing how the value changed. The new camera value will be stored in ``change['new']``." 
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def execute(change):\n", 158 | " image = change['new']\n", 159 | " output = model_trt(preprocess(image).half()).detach().cpu().numpy().flatten()\n", 160 | " idx = output.argmax()\n", 161 | " text.value = labels[idx]\n", 162 | "\n", 163 | "camera.observe(execute, names='value')" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.6.7" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /examples/image_segmentation/conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torchvision\n", 11 | "import torch2trt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "model = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "model = model.cuda().eval().half()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "class ModelWrapper(torch.nn.Module):\n", 39 | " def __init__(self, model):\n", 40 | " super(ModelWrapper, self).__init__()\n", 41 | " self.model = model\n", 42 | " def forward(self, x):\n", 43 | " return self.model(x)['out']" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "model_w = ModelWrapper(model).half()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = torch.ones((1, 3, 224, 224)).cuda().half()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model_trt = torch2trt.torch2trt(model_w, [data], fp16_mode=True)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# Live demo" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# from jetcam.csi_camera import CSICamera\n", 87 | "from jetcam.usb_camera import USBCamera\n", 88 | "\n", 89 | "# camera = CSICamera(width=224, height=224)\n", 90 | "camera = USBCamera(width=224, height=224)\n", 91 | "\n", 92 | "camera.running = True" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from jetcam.utils import bgr8_to_jpeg\n", 102 | "import traitlets\n", 103 | "import ipywidgets\n", 104 | "\n", 105 | "image_w = ipywidgets.Image()\n", 106 | "\n", 107 | 
"traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", 108 | "\n", 109 | "display(image_w)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "import cv2\n", 119 | "import numpy as np\n", 120 | "import torchvision\n", 121 | "\n", 122 | "device = torch.device('cuda')\n", 123 | "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n", 124 | "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n", 125 | "\n", 126 | "normalize = torchvision.transforms.Normalize(mean, stdev)\n", 127 | "\n", 128 | "def preprocess(camera_value):\n", 129 | " global device, normalize\n", 130 | " x = camera_value\n", 131 | " x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n", 132 | " x = x.transpose((2, 0, 1))\n", 133 | " x = torch.from_numpy(x).float()\n", 134 | " x = normalize(x)\n", 135 | " x = x.to(device)\n", 136 | " x = x[None, ...]\n", 137 | " return x" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "seg_image = ipywidgets.Image()\n", 147 | "\n", 148 | "display(seg_image)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def execute(change):\n", 158 | " image = change['new']\n", 159 | " output = model_trt(preprocess(camera.value).half())[0].detach().cpu().float().numpy()\n", 160 | " mask = 1.0 * (output.argmax(0) == 15)\n", 161 | " seg_image.value = bgr8_to_jpeg(mask[:, :, None] * image)\n", 162 | " \n", 163 | " \n", 164 | "mask = execute({'new': camera.value})\n", 165 | "# camera.observe(execute, names='value')" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "camera.observe(execute, names='value')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "camera.unobserve(execute, names='value')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "import time\n", 193 | "\n", 194 | "torch.cuda.current_stream().synchronize()\n", 195 | "t0 = time.time()\n", 196 | "for i in range(100):\n", 197 | " output = model_w(preprocess(camera.value).half())\n", 198 | "torch.cuda.current_stream().synchronize()\n", 199 | "t1 = time.time()\n", 200 | "\n", 201 | "print(100.0 / (t1 - t0))" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.6.7" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /docs/usage/reduced_precision.md: -------------------------------------------------------------------------------- 1 | # Reduced Precision 2 | 3 | For certain platforms, reduced precision can result in substantial improvements in throughput, 4 | often with little impact on model accuracy. 
5 | 
6 | ## Support Matrix
7 | 
8 | Below is a table of layer precision support for various NVIDIA platforms.
9 | 
10 | | Platform | FP16 | INT8 |
11 | |----------|------|------|
12 | | Jetson Nano | ![X](../images/check.svg) | |
13 | | Jetson TX2 | ![X](../images/check.svg) | ![X](../images/check.svg) |
14 | | Jetson Xavier NX | ![X](../images/check.svg) | ![X](../images/check.svg) |
15 | | Jetson AGX Xavier | ![X](../images/check.svg) | ![X](../images/check.svg) |
16 | 
17 | !!! note
18 | 
19 |     If the platform you're using is missing from this table or you spot anything incorrect,
20 |     please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt).
21 | 
22 | ## FP16 Precision
23 | 
24 | To enable support for fp16 precision with TensorRT, torch2trt exposes the ``fp16_mode`` parameter.
25 | Converting a model with ``fp16_mode=True`` allows the TensorRT optimizer to select layers with fp16
26 | precision.
27 | 
28 | 
29 | ```python
30 | model_trt = torch2trt(model, [data], fp16_mode=True)
31 | ```
32 | 
33 | !!! note
34 | 
35 |     When ``fp16_mode=True``, this does not necessarily mean that TensorRT will select FP16 layers.
36 |     The optimizer attempts to automatically select tactics which result in the best performance.
37 | 
38 | ## INT8 Precision
39 | 
40 | torch2trt also supports int8 precision with TensorRT via the ``int8_mode`` parameter. Unlike fp16 and fp32 precision, switching
41 | to int8 precision often requires calibration to avoid a significant drop in accuracy.
42 | 
43 | ### Input Data Calibration
44 | 
45 | By default,
46 | torch2trt will calibrate using the input data provided. For example, if you wanted
47 | to calibrate on a set of 64 random normal images you could do the following
48 | 
49 | ```python
50 | data = torch.randn(64, 3, 224, 224).cuda()
51 | 
52 | model_trt = torch2trt(model, [data], int8_mode=True)
53 | ```
54 | 
55 | ### Dataset Calibration
56 | 
57 | In many instances, you may want to calibrate on more data than fits in memory. For this reason,
58 | torch2trt exposes the ``int8_calib_dataset`` parameter. This parameter takes an input
59 | dataset that is used for calibration. If this parameter is specified, the input data is
60 | ignored during calibration. You create an input dataset by defining
61 | a class which implements the ``__len__`` and ``__getitem__`` methods.
62 | 
63 | * The ``__len__`` method should return the number of calibration samples
64 | * The ``__getitem__`` method must return a single calibration sample. This is a list of input tensors to the model. Each tensor should match the shape
65 | you provide to the ``inputs`` parameter when calling ``torch2trt``.
66 | 
67 | For example, say you trained an image classification network using the PyTorch [``ImageFolder``](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder) dataset.
68 | You could wrap this dataset for calibration by defining a new dataset which returns only the images, without labels, in list format.
69 | 
70 | ```python
71 | from torchvision.datasets import ImageFolder
72 | from torchvision.transforms import ToTensor, Compose, Normalize, Resize
73 | 
74 | 
75 | class ImageFolderCalibDataset():
76 | 
77 |     def __init__(self, root):
78 |         self.dataset = ImageFolder(
79 |             root=root,
80 |             transform=Compose([
81 |                 Resize((224, 224)),
82 |                 ToTensor(),
83 |                 Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
84 |             ])
85 |         )
86 | 
87 |     def __len__(self):
88 |         return len(self.dataset)
89 | 
90 |     def __getitem__(self, idx):
91 |         image, _ = self.dataset[idx]
92 |         image = image[None, ...]  # add batch dimension
93 |         return [image]
94 | ```
95 | 
96 | You would then provide this calibration dataset to torch2trt as follows
97 | 
98 | ```python
99 | dataset = ImageFolderCalibDataset('images')
100 | 
101 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_dataset=dataset)
102 | ```
103 | 
104 | ### Calibration Algorithm
105 | 
106 | To override the default calibration algorithm that torch2trt uses, you can set the ``int8_calib_algorithm``
107 | to the [``tensorrt.CalibrationAlgoType``](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Int8/Calibrator.html#iint8calibrator)
108 | that you wish to use. For example, to use the minmax calibration algorithm you would do
109 | 
110 | ```python
111 | import tensorrt as trt
112 | 
113 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION)
114 | ```
115 | 
116 | ### Calibration Batch Size
117 | 
118 | During calibration, torch2trt pulls data in batches for the TensorRT calibrator. In some instances
119 | [developers have found](https://github.com/NVIDIA-AI-IOT/torch2trt/pull/398) that the calibration batch size can impact the calibrated model accuracy. To set the calibration batch size, you can set the ``int8_calib_batch_size``
120 | parameter. For example, to use a calibration batch size of 32 you could do
121 | 
122 | ```python
123 | model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_batch_size=32)
124 | ```
125 | 
126 | ## Binding Data Types
127 | 
128 | The data types of the input and output bindings in TensorRT are determined by the original
129 | PyTorch module's input and output data types.
130 | This does not directly impact whether the TensorRT optimizer will internally use fp16 or int8 precision.
131 | 
132 | For example, to create a model with fp32 precision bindings, you would do the following
133 | 
134 | ```python
135 | model = model.float()
136 | data = data.float()
137 | 
138 | model_trt = torch2trt(model, [data], fp16_mode=True)
139 | ```
140 | 
141 | In this instance, the optimizer may choose to use fp16 precision layers internally, but the
142 | input and output data types are fp32. To use fp16 precision input and output bindings you would do
143 | 
144 | ```python
145 | model = model.half()
146 | data = data.half()
147 | 
148 | model_trt = torch2trt(model, [data], fp16_mode=True)
149 | ```
150 | 
151 | Now, the input and output bindings of the model are half precision, and internally the optimizer may
152 | choose to select fp16 layers as well.
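
## Combining Options

The options above can be combined in a single ``torch2trt`` call. The sketch below is illustrative only: ``model`` is assumed to be the module you are converting, ``ImageFolderCalibDataset`` is the wrapper defined earlier, and ``'calib_images'`` is a hypothetical folder of calibration images.

```python
import tensorrt as trt
import torch
from torch2trt import torch2trt

# `model` is assumed to be the PyTorch module being converted.
model = model.cuda().eval()
data = torch.randn(1, 3, 224, 224).cuda()

calib_dataset = ImageFolderCalibDataset('calib_images')   # hypothetical image folder

model_trt = torch2trt(
    model, [data],
    fp16_mode=True,                      # allow fp16 tactics
    int8_mode=True,                      # allow int8 tactics (requires calibration)
    int8_calib_dataset=calib_dataset,    # calibrate from a dataset instead of `data`
    int8_calib_batch_size=32,            # batch size used by the calibrator
    int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION,
)
```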
153 | -------------------------------------------------------------------------------- /examples/contrib/quantization_aware_training/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import torchvision 5 | import argparse 6 | import os,sys 7 | import torch.optim as optim 8 | from datasets.cifar10 import Cifar10Loaders 9 | from models.models import vanilla_cnn 10 | from models.resnet import resnet18 , resnet34 11 | from utils.utilities import calculate_accuracy , add_missing_keys, transfer_learning_resnet18,transfer_learning_resnet34, mapping_names 12 | from parser import parse_args 13 | import time 14 | from torch2trt import torch2trt 15 | import tensorrt as trt 16 | 17 | def main(): 18 | args = parse_args() 19 | 20 | ## Create an output dir 21 | output_dir_path = args.od + args.en 22 | if not os.path.exists(output_dir_path): 23 | os.makedirs(output_dir_path) 24 | dir_name=output_dir_path 25 | else: 26 | counter=1 27 | dir_name = output_dir_path 28 | new_dir_name = dir_name 29 | while os.path.exists(new_dir_name): 30 | new_dir_name = dir_name + "_" + str(counter) 31 | counter +=1 32 | os.makedirs(new_dir_name) 33 | dir_name=new_dir_name 34 | 35 | print("===>> Output folder = {}".format(dir_name)) 36 | 37 | args.cuda = not args.no_cuda and torch.cuda.is_available() 38 | torch.manual_seed(args.seed) 39 | 40 | if args.cuda: 41 | torch.backends.cudnn.benchmark = True 42 | torch.cuda.manual_seed(args.seed) 43 | 44 | loaders = Cifar10Loaders() 45 | train_loader = loaders.train_loader() 46 | test_loader = loaders.test_loader() 47 | 48 | if args.m =="resnet18": 49 | if args.netqat: 50 | model=resnet18(qat_mode=True) 51 | else: 52 | model=resnet18() 53 | elif args.m =="resnet34": 54 | if args.netqat: 55 | model=resnet34(qat_mode=True) 56 | else: 57 | model=resnet34() 58 | elif args.m == 'resnet34-tl': 59 | model = transfer_learning_resnet34() 60 | elif args.m == "resnet18-tl": ## resnet18 transfer learning 61 | model=transfer_learning_resnet18() 62 | else: 63 | raise NotImplementedError("model {} is not defined".format(args.m)) 64 | 65 | if args.cuda: 66 | model = model.cuda() 67 | 68 | best_test_accuracy=0 69 | if args.v: 70 | print("======>>> keys present in state dict at model creation") 71 | for k,_ in model.state_dict().items(): 72 | print(k) 73 | 74 | if args.load_ckpt: 75 | model.eval() 76 | checkpoint = torch.load(args.load_ckpt) 77 | if args.partial_ckpt: 78 | model_state = checkpoint['model_state_dict'] 79 | if args.v: 80 | print("====>>>>> keys present in the ckpt state dict") 81 | for k,_ in model_state.items(): 82 | print(k) 83 | if args.tl: 84 | model_state = mapping_names(model_state) 85 | new_state_dict = add_missing_keys(model.state_dict(),model_state) 86 | model.load_state_dict(new_state_dict,strict=True) 87 | else: 88 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 89 | 90 | criterion = nn.CrossEntropyLoss() 91 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=0.9) 92 | if args.load_ckpt: 93 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 94 | epoch = checkpoint['epoch'] 95 | loss = checkpoint['loss'] 96 | print("===>>> Checkpoint loaded successfully from {} at epoch {} ".format(args.load_ckpt,epoch)) 97 | 98 | print("===>> Training started") 99 | for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): 100 | running_loss=0.0 101 | start=time.time() 102 | model.train() 
103 | for i, data in enumerate(train_loader,0): 104 | inputs, labels = data 105 | 106 | if args.cuda: 107 | inputs = inputs.cuda() 108 | labels = labels.cuda() 109 | 110 | optimizer.zero_grad() 111 | 112 | outputs = model(inputs) 113 | loss = criterion(outputs,labels) 114 | loss.backward() 115 | optimizer.step() 116 | 117 | running_loss +=loss.item() 118 | 119 | if epoch > 0 and epoch % args.lrdt == 0: 120 | print("===>> decaying learning rate at epoch {}".format(epoch)) 121 | for param_group in optimizer.param_groups: 122 | param_group['lr'] = param_group['lr'] * 0.94 123 | 124 | running_loss /= len(train_loader) 125 | end = time.time() 126 | test_accuracy = calculate_accuracy(model,test_loader) 127 | 128 | print("Epoch: {0} | Loss: {1} | Test accuracy: {2}| Time Taken (sec): {3} ".format(epoch+1, np.around(running_loss,6), test_accuracy, np.around((end-start),4))) 129 | 130 | ##Save the best checkpoint 131 | if test_accuracy > best_test_accuracy: 132 | best_ckpt_filename = dir_name + "/ckpt_" + str(epoch) 133 | best_test_accuracy = test_accuracy 134 | torch.save({ 135 | 'epoch': epoch, 136 | 'model_state_dict': model.state_dict(), 137 | 'optimizer_state_dict': optimizer.state_dict(), 138 | 'loss': running_loss, 139 | }, best_ckpt_filename) 140 | print("Training finished") 141 | 142 | ## Running metrics 143 | if args.test_trt: 144 | if args.m == 'resnet34-tl' or args.m == 'resnet34': 145 | model = transfer_learning_resnet34(pretrained=False) 146 | elif args.m == 'resnet18-tl' or args.m == 'resnet18': 147 | model= transfer_learning_resnet18(pretrained=False) 148 | else: 149 | raise NotImplementedError("model {} is not defined".format(args.m)) 150 | 151 | model=model.cuda().eval() 152 | checkpoint = torch.load(best_ckpt_filename) 153 | model.load_state_dict(checkpoint['model_state_dict'],strict=True) 154 | 155 | pytorch_test_accuracy = calculate_accuracy(model,test_loader) 156 | rand_in = torch.randn([128,3,32,32],dtype=torch.float32).cuda() 157 | 158 | if args.FP16: 159 | trt_model_fp16 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,max_batch_size=128) 160 | trtfp16_test_accuracy = calculate_accuracy(trt_model_fp16,test_loader) 161 | 162 | if args.INT8PTC: 163 | ##preparing calib dataset 164 | calib_dataset = list() 165 | for i, sam in enumerate(test_loader): 166 | calib_dataset.extend(sam[0]) 167 | if i ==5: 168 | break 169 | 170 | trt_model_calib_int8 = torch2trt(model,[rand_in],log_level=trt.Logger.INFO,fp16_mode=True,int8_calib_dataset=calib_dataset,int8_mode=True,max_batch_size=128) 171 | int8_test_accuracy = calculate_accuracy(trt_model_calib_int8,test_loader) 172 | 173 | print("Test Accuracy") 174 | print("Pytorch model :",pytorch_test_accuracy) 175 | print("TRT FP16 model :",trtfp16_test_accuracy) 176 | print("TRT INT8 PTC model :",int8_test_accuracy) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /torch2trt/trt_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import tensorrt as trt 3 | from .flattener import Flattener 4 | from .misc_utils import ( 5 | torch_dtype_from_trt, 6 | torch_device_from_trt 7 | ) 8 | from .version_utils import ( 9 | trt_version 10 | ) 11 | 12 | 13 | class TRTModule(torch.nn.Module): 14 | def __init__(self, engine=None, input_names=None, output_names=None, input_flattener=None, output_flattener=None): 15 | super(TRTModule, self).__init__() 16 | 
self._register_state_dict_hook(TRTModule._on_state_dict) 17 | 18 | if isinstance(engine, str): 19 | # assume filepath 20 | with open(engine, 'rb') as f: 21 | engine = f.read() 22 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 23 | engine = runtime.deserialize_cuda_engine(engine) 24 | elif isinstance(engine, trt.IHostMemory): 25 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 26 | engine = runtime.deserialize_cuda_engine(engine) 27 | 28 | self.engine = engine 29 | if self.engine is not None: 30 | self.context = self.engine.create_execution_context() 31 | self._update_name_binindgs_maps() 32 | self.input_names = input_names 33 | self.output_names = output_names 34 | self.input_flattener = input_flattener 35 | self.output_flattener = output_flattener 36 | 37 | def _update_name_binindgs_maps(self): 38 | if trt_version() >= "10.0": 39 | self._update_name_binding_maps_trt_10() 40 | else: 41 | self._update_name_binding_maps_pre_trt_10() 42 | 43 | def _update_name_binding_maps_trt_10(self): 44 | self._name_to_binding = {} 45 | self._binding_to_name = {} 46 | for i in range(self.engine.num_io_tensors): 47 | name_i = self.engine.get_tensor_name(i) 48 | self._name_to_binding[name_i] = i 49 | self._binding_to_name[i] = name_i 50 | 51 | def _update_name_binding_maps_pre_trt_10(self): 52 | self._name_to_binding = {} 53 | self._binding_to_name = {} 54 | for i in range(self.engine.num_bindings): 55 | name_i = self.engine.get_binding_name(i) 56 | self._name_to_binding[name_i] = i 57 | self._binding_to_name[i] = name_i 58 | 59 | def _on_state_dict(self, state_dict, prefix, local_metadata): 60 | state_dict[prefix + "engine"] = bytearray(self.engine.serialize()) 61 | state_dict[prefix + "input_names"] = self.input_names 62 | state_dict[prefix + "output_names"] = self.output_names 63 | state_dict[prefix + "input_flattener"] = self.input_flattener.dict() 64 | state_dict[prefix + "output_flattener"] = self.output_flattener.dict() 65 | 66 | def _load_from_state_dict( 67 | self, 68 | state_dict, 69 | prefix, 70 | local_metadata, 71 | strict, 72 | missing_keys, 73 | unexpected_keys, 74 | error_msgs, 75 | ): 76 | engine_bytes = state_dict[prefix + "engine"] 77 | 78 | with trt.Logger() as logger, trt.Runtime(logger) as runtime: 79 | self.engine = runtime.deserialize_cuda_engine(engine_bytes) 80 | self.context = self.engine.create_execution_context() 81 | 82 | self.input_names = state_dict[prefix + "input_names"] 83 | self.output_names = state_dict[prefix + "output_names"] 84 | 85 | if 'input_flattener' in state_dict: 86 | self.input_flattener = Flattener.from_dict(state_dict['input_flattener']) 87 | else: 88 | self.input_flattener = None 89 | 90 | if 'output_flattener' in state_dict: 91 | self.output_flattener = Flattener.from_dict(state_dict['output_flattener']) 92 | else: 93 | self.output_flattener = None 94 | 95 | self._update_name_binindgs_maps() 96 | 97 | def _forward_pre_10(self, *inputs): 98 | bindings = [None] * (len(self.input_names) + len(self.output_names)) 99 | 100 | if self.input_flattener is not None: 101 | inputs = self.input_flattener.flatten(inputs) 102 | 103 | for i, input_name in enumerate(self.input_names): 104 | idx = self.engine.get_binding_index(input_name) 105 | shape = tuple(inputs[i].shape) 106 | bindings[idx] = inputs[i].contiguous().data_ptr() 107 | self.context.set_binding_shape(idx, shape) 108 | 109 | # create output tensors 110 | outputs = [None] * len(self.output_names) 111 | for i, output_name in enumerate(self.output_names): 112 | idx = 
self.engine.get_binding_index(output_name) 113 | dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx)) 114 | shape = tuple(self.context.get_binding_shape(idx)) 115 | device = torch_device_from_trt(self.engine.get_location(idx)) 116 | output = torch.empty(size=shape, dtype=dtype, device=device) 117 | outputs[i] = output 118 | bindings[idx] = output.data_ptr() 119 | 120 | self.context.execute_async_v2( 121 | bindings, torch.cuda.current_stream().cuda_stream 122 | ) 123 | 124 | if self.output_flattener is not None: 125 | outputs = self.output_flattener.unflatten(outputs) 126 | else: 127 | outputs = tuple(outputs) 128 | if len(outputs) == 1: 129 | outputs = outputs[0] 130 | 131 | return outputs 132 | 133 | def _forward_post_10(self, *inputs): 134 | if self.input_flattener is not None: 135 | inputs = self.input_flattener.flatten(inputs) 136 | 137 | # set shapes 138 | for i, input_name in enumerate(self.input_names): 139 | shape = tuple(inputs[i].shape) 140 | data_ptr = inputs[i].contiguous().data_ptr() 141 | self.context.set_tensor_address(input_name, data_ptr) 142 | self.context.set_input_shape(input_name, shape) 143 | 144 | # execute 145 | outputs = [None] * len(self.output_names) 146 | for i, output_name in enumerate(self.output_names): 147 | dtype = torch_dtype_from_trt(self.engine.get_tensor_dtype(output_name)) 148 | shape = tuple(self.context.get_tensor_shape(output_name)) 149 | device = torch_device_from_trt(self.engine.get_tensor_location(output_name)) 150 | output = torch.empty(size=shape, dtype=dtype, device=device) 151 | outputs[i] = output 152 | self.context.set_tensor_address(output_name, output.data_ptr()) 153 | 154 | self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream) 155 | 156 | if self.output_flattener is not None: 157 | outputs = self.output_flattener.unflatten(outputs) 158 | else: 159 | outputs = tuple(outputs) 160 | if len(outputs) == 1: 161 | outputs = outputs[0] 162 | 163 | return outputs 164 | 165 | def forward(self, *inputs): 166 | if trt_version() < "10.0": 167 | return self._forward_pre_10(*inputs) 168 | else: 169 | return self._forward_post_10(*inputs) 170 | 171 | def enable_profiling(self): 172 | if not self.context.profiler: 173 | self.context.profiler = trt.Profiler() 174 | -------------------------------------------------------------------------------- /torch2trt/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import glob 4 | from uuid import uuid1 5 | from torch2trt.flattener import Flattener 6 | 7 | 8 | __all__ = [ 9 | 'DatasetRecorder', 10 | 'Dataset', 11 | 'ListDataset', 12 | 'TensorBatchDataset' 13 | ] 14 | 15 | 16 | class DatasetRecorder(object): 17 | 18 | def __init__(self, dataset, module): 19 | self.dataset = dataset 20 | self.module = module 21 | self.handle = None 22 | 23 | def __enter__(self, *args, **kwargs): 24 | 25 | if self.handle is not None: 26 | raise RuntimeError('DatasetRecorder is already active.') 27 | 28 | self.handle = self.module.register_forward_pre_hook(self._callback) 29 | 30 | return self 31 | 32 | def __exit__(self, *args, **kwargs): 33 | if self.handle is not None: 34 | self.handle.remove() 35 | self.handle = None 36 | 37 | def _callback(self, module, input): 38 | self.dataset.insert(input) 39 | 40 | 41 | class Dataset(object): 42 | 43 | def __len__(self): 44 | raise NotImplementedError 45 | 46 | def __getitem__(self, index): 47 | raise NotImplementedError 48 | 49 | def insert(self, item): 50 | raise NotImplementedError 51 | 
52 | def record(self, module): 53 | return DatasetRecorder(self, module) 54 | 55 | def num_inputs(self): 56 | return len(self.getitem_flat(0)) 57 | 58 | @property 59 | def flattener(self): 60 | if not hasattr(self, '_flattener') or self._flattener is None: 61 | assert(len(self) > 0, 'Cannot create default flattener without input data.') 62 | value = self[0] 63 | self._flattener = Flattener.from_value(value) 64 | return self._flattener 65 | 66 | def getitem_flat(self, index): 67 | return self.flattener.flatten(self[index]) 68 | 69 | def shapes_for_index(self, index, flat=False): 70 | shapes = [None for i in range(self.num_inputs())] 71 | tensors = self.getitem_flat(index) 72 | for j in range(len(tensors)): 73 | shapes[j] = torch.Size(tuple(tensors[j].shape)) 74 | 75 | if flat: 76 | return shapes 77 | else: 78 | return self.flattener.unflatten(shapes) 79 | 80 | def shapes(self, flat=False): 81 | shapes = [[] for i in range(self.num_inputs())] 82 | for i in range(len(self)): 83 | tensors = self.getitem_flat(i) 84 | for j in range(len(tensors)): 85 | shapes[j].append(torch.Size(tuple(tensors[j].shape))) 86 | 87 | if flat: 88 | return shapes 89 | else: 90 | return self.flattener.unflatten(shapes) 91 | 92 | def _shape_stats(self, stat_fn, flat=False): 93 | shapes = [] 94 | for s in self.shapes(flat=True): 95 | shape_tensor = [] 96 | for si in s: 97 | shape_tensor.append(tuple(si)) 98 | shape_tensor = torch.LongTensor(shape_tensor) 99 | shapes.append(shape_tensor) 100 | 101 | stat_shapes = [] 102 | for shape in shapes: 103 | stat_shape = torch.Size(stat_fn(shape)) 104 | stat_shapes.append(stat_shape) 105 | if flat: 106 | return stat_shapes 107 | else: 108 | return self.flattener.unflatten(stat_shapes) 109 | 110 | def min_shapes(self, flat=False): 111 | return self._shape_stats(lambda x: torch.min(x, dim=0)[0], flat) 112 | 113 | def max_shapes(self, flat=False): 114 | return self._shape_stats(lambda x: torch.max(x, dim=0)[0], flat) 115 | 116 | def item_numel(self, index): 117 | tensors = self.getitem_flat(index) 118 | return sum([t.numel() for t in tensors]) 119 | 120 | def median_numel_shapes(self, flat=False): 121 | numels = torch.LongTensor([self.item_numel(i) for i in range(len(self))]) 122 | median_index = int(torch.argsort(numels)[len(numels) // 2]) 123 | return self.shapes_for_index(median_index, flat=flat) 124 | 125 | def infer_dynamic_axes(self, flat=False): 126 | min_shapes = self.min_shapes(flat=True) 127 | max_shapes = self.max_shapes(flat=True) 128 | dynamic_axes = [[] for i in range(self.num_inputs())] 129 | for i, (mins, maxs) in enumerate(zip(min_shapes, max_shapes)): 130 | for j, (mins_i, maxs_i) in enumerate(zip(mins, maxs)): 131 | if mins_i != maxs_i: 132 | dynamic_axes[i].append(j) 133 | if flat: 134 | return dynamic_axes 135 | else: 136 | return self.flattener.unflatten(dynamic_axes) 137 | 138 | 139 | class ListDataset(Dataset): 140 | 141 | def __init__(self, items=None): 142 | if items is None: 143 | items = [] 144 | self.items = [t for t in items] 145 | 146 | def __len__(self): 147 | return len(self.items) 148 | 149 | def __getitem__(self, index): 150 | return self.items[index] 151 | 152 | def insert(self, item): 153 | self.items.append(item) 154 | 155 | 156 | class TensorBatchDataset(Dataset): 157 | 158 | def __init__(self, tensors=None): 159 | if tensors is not None: 160 | self._flattener = Flattener.from_value(tensors) 161 | self.tensors = self._flattener.flatten(tensors) 162 | else: 163 | self._flattener = None 164 | self.tensors = None 165 | 166 | def __len__(self): 167 
| if self.tensors is None: 168 | return 0 169 | else: 170 | return len(self.tensors[0]) 171 | 172 | def __getitem__(self, idx): 173 | if self.tensors is None: 174 | raise IndexError('Dataset is empty.') 175 | return self.flattener.unflatten([t[idx:idx+1] for t in self.tensors]) 176 | 177 | def insert(self, tensors): 178 | if self._flattener is None: 179 | self._flattener = Flattener.from_value(tensors) 180 | 181 | tensors = self.flattener.flatten(tensors) 182 | 183 | if self.tensors is None: 184 | self.tensors = tensors 185 | else: 186 | if len(self.tensors) != len(tensors): 187 | raise ValueError('Number of inserted tensors does not match the number of tensors in the current dataset.') 188 | 189 | self.tensors = tuple([ 190 | torch.cat((self.tensors[index], tensors[index]), dim=0) 191 | for index in range(len(tensors)) 192 | ]) 193 | 194 | 195 | class FolderDataset(Dataset): 196 | 197 | def __init__(self, folder): 198 | super().__init__() 199 | if not os.path.exists(folder): 200 | os.makedirs(folder) 201 | self.folder = folder 202 | 203 | def file_paths(self): 204 | return sorted(glob.glob(os.path.join(self.folder, '*.pth'))) 205 | 206 | def __len__(self): 207 | return len(self.file_paths()) 208 | 209 | def __getitem__(self, index): 210 | return torch.load(self.file_paths()[index]) 211 | 212 | def insert(self, tensors): 213 | i = 0 214 | file_paths = [os.path.basename(path) for path in self.file_paths()] 215 | while ('input_%d.pth' % i) in file_paths: 216 | i += 1 217 | torch.save(tensors, os.path.join(self.folder, 'input_%d.pth' % i)) --------------------------------------------------------------------------------
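
To make the dataset utilities above concrete, here is a minimal sketch of recording inputs into a `ListDataset` and querying its shape statistics. The module and input sizes are hypothetical placeholders, not taken from the repository.

```python
import torch
from torch2trt.dataset import ListDataset

# Hypothetical module, used only for illustration.
model = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1).eval()

dataset = ListDataset()

# While the recorder is active, a forward pre-hook captures each call's inputs.
with dataset.record(model):
    for size in (112, 224):
        model(torch.randn(1, 3, size, size))

print(len(dataset))                   # 2 recorded samples
print(dataset.min_shapes())           # smallest shape seen for each input
print(dataset.max_shapes())           # largest shape seen for each input
print(dataset.infer_dynamic_axes())   # axes that varied across the records
```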